From 77634e622e7db2a5093ee8c142f912b6b7e9c111 Mon Sep 17 00:00:00 2001
From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Date: Tue, 19 May 2026 11:30:27 +0000
Subject: [PATCH 1/6] [None][refactor] Update model path definitions in
 test_perf.py and clean up waives.txt

Removed outdated model entries from MODEL_PATH_DICT, HF_MODEL_PATH, and LORA_MODEL_PATH in test_perf.py. Updated waives.txt accordingly: dropped waivers for tests that are no longer applicable and added nvbugs-tracked waivers for several perf and perf-sanity tests, improving clarity and maintainability.

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
---
 tests/integration/defs/perf/test_perf.py | 78 +-----------------------
 tests/integration/test_lists/waives.txt  |  9 ++-
 2 files changed, 5 insertions(+), 82 deletions(-)

diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index 33e3be93cb1d..32612df2fe52 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -45,9 +45,6 @@
 
 # Model PATH of local dir synced from internal LLM models repo
 MODEL_PATH_DICT = {
-    "llama_v2_7b": "llama-models-v2/llama-v2-7b-hf",  # not safetensors repo
-    "llama_v2_13b": "llama-models-v2/llama-v2-13b-hf",  # not safetensors repo
-    "llama_v2_70b": "llama-models-v2/llama-v2-70b-hf",  # not safetensors repo
     "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B",
     "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
     "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
@@ -88,20 +85,9 @@
     "llama4-models/Llama-4-Maverick-17B-128E-Instruct",
     "llama_v4_maverick_17b_128e_instruct_fp8":
     "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
-    "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
-    "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
-    "mixtral_8x7b_v0.1_instruct_fp8": "Mixtral-8x7B-Instruct-v0.1-fp8",
-    "mixtral_8x7b_v0.1_instruct_fp4":
-    "modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4",
-    "mistral_nemo_12b_base": "Mistral-Nemo-Base-2407",
     "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B",
     "deepseek_r1_distill_llama_70b":
     "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/",
-    "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1",
-    "mistral_7b_v0.1": "mistral-7b-v0.1",
-    "ministral_8b": "Ministral-8B-Instruct-2410",
-    "ministral_8b_fp8": "Ministral-8B-Instruct-2410-FP8",
-    "gemma_3_1b_it": "gemma/gemma-3-1b-it",
     "gemma_3_27b_it": "gemma/gemma-3-27b-it",
     "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8",
     "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4",
@@ -136,43 +122,15 @@
     "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
     "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
     "starcoder2_3b": "starcoder2-3b",
-    "starcoder2_7b": "starcoder2-7b",
-    "starcoder2_15b": "starcoder2-15b",
-    "t5": "t5-small",  # not supported for trtllm-bench build config
-    "flan_t5_base":
-    "flan-t5-small",  # not supported for trtllm-bench build config
-    "flan_t5_large":
-    "flan-t5-xl",  # not supported for trtllm-bench build config
-    "whisper_large_v3":
-    "whisper-models/large-v3",  # not supported for trtllm-bench tokenizer
-    "bart_large_cnn": "bart-large-cnn",  # not safetensors repo
-    "mbart_large_50_many_to_one_mmt": "mbart-large-50-many-to-one-mmt",
-    "mamba_130m": "mamba/mamba-130m-hf",
-    "mamba_370m": "mamba/mamba-370m-hf",
-    "mamba_2.8b": "mamba/mamba-2.8b-hf",
-    "gpt_20b": "gpt-neox-20b",
-    "gpt_350m_moe": "gpt2-medium",
     "phi_4_mini_instruct": "Phi-4-mini-instruct",
     "phi_4_reasoning_plus": "Phi-4-reasoning-plus",
     "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8",
     "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4",
     "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct",
-    "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct",
-    "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct",
     "phi_4_multimodal_instruct_fp4":
     "multimodals/Phi-4-multimodal-instruct-FP4",
-    "phi_4_multimodal_instruct_fp4_image":
-    "multimodals/Phi-4-multimodal-instruct-FP4",
-    "phi_4_multimodal_instruct_fp4_audio":
-    "multimodals/Phi-4-multimodal-instruct-FP4",
-    "phi_4_multimodal_instruct_fp8_image":
-    "multimodals/Phi-4-multimodal-instruct-FP8",
-    "phi_4_multimodal_instruct_fp8_audio":
-    "multimodals/Phi-4-multimodal-instruct-FP8",
     "phi_4_multimodal_instruct_fp8":
     "multimodals/Phi-4-multimodal-instruct-FP8",
-    "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct",
-    "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8",
     "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503",
     "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
     "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b",
@@ -202,7 +160,6 @@
     # DeepSeek V3.2 (671B MoE)
     "deepseek_v3.2_fp8": "DeepSeek-V3.2-hf",
     "deepseek_v3.2_fp4": "DeepSeek-V3.2-NVFP4",
-    "deepseek_v3.2_exp_fp4_v2": "DeepSeek-V3.2-Exp-FP4-v2",
     # GLM-5 FP8 (MoE)
     "glm_5_fp8": "GLM-5-FP8",
     # Kimi K2.5 NVFP4 (~1T MoE multimodal)
@@ -210,15 +167,9 @@
 }
 # Model PATH of HuggingFace
 HF_MODEL_PATH = {
-    "llama_v2_7b_hf": "meta-llama/Llama-2-7b-hf",
-    "llama_v2_70b_hf": "meta-llama/Llama-2-70b-hf",
-    "falcon_180b_hf": "tiiuae/falcon-180B",
-    "gptj_6b_hf": "EleutherAI/gpt-j-6b",
-    "llama_v3_8b_hf": "meta-llama/Meta-Llama-3-8B",
     "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B",
     "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8",
     "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-    "llama_v3_70b_hf": "meta-llama/Meta-Llama-3-70B",
     "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B",
     "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B",
     "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
@@ -230,37 +181,10 @@
     "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
     "llama_v3.1_nemotron_ultra_253b_fp8_hf":
     "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
-    "mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1",
-    "mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1",
-    "ministral_8b_hf": "mistralai/Ministral-8B-Instruct-2410",
-    "flan_t5_base_hf": "google/flan-t5-small",
     "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct",
-    "gemma_3_1b_it_hf": "google/gemma-3-1b-it",
 }
 LORA_MODEL_PATH = {
-    "llama_v2_13b":
-    "llama-models-v2/chinese-llama-2-lora-13b",
-    "mixtral_8x7b_v0.1":
-    "chinese-mixtral-lora",
-    "llama_v3.1_8b_instruct_fp8":
-    "lora/llama-3-chinese-8b-instruct-v2-lora/",
-    "ministral_8b":
-    "lora/ministral/Ministral-8B-Instruct-2410-Loras-Dummy",  # Dummy LoRA for Ministral
-    "gemma_3_1b_it":
-    "lora/gemma/gemma-3-1b-it-dummy-lora",  # Dummy LoRA for Gemma-3-1B-Instruct
-    "phi_4_multimodal_instruct_image":
-    "multimodals/Phi-4-multimodal-instruct/vision-lora",
-    "phi_4_multimodal_instruct_audio":
-    "multimodals/Phi-4-multimodal-instruct/speech-lora",
-    "phi_4_multimodal_instruct_fp4_image":
-    "multimodals/Phi-4-multimodal-instruct-FP4/vision-lora",
-    "phi_4_multimodal_instruct_fp4_audio":
-    "multimodals/Phi-4-multimodal-instruct-FP4/speech-lora",
-    "phi_4_multimodal_instruct_fp8_image":
-    "multimodals/Phi-4-multimodal-instruct-FP8/vision-lora",
-    "phi_4_multimodal_instruct_fp8_audio":
-    "multimodals/Phi-4-multimodal-instruct-FP8/speech-lora",
+    "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/",
 }
 
 TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 541a1e5a0a57..1fa2ca62809e 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -251,6 +251,7 @@ full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[tr
 full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150)
 full:B200/perf/test_perf.py::test_perf[bart_large_cnn] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B200/perf/test_perf.py::test_perf[bert_large] SKIP (bert_attention_plugin does not support SM >= 100)
+full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugs/5150255)
 full:B200/perf/test_perf.py::test_perf[flan_t5_base] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B200/perf/test_perf.py::test_perf[flan_t5_large] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B200/perf/test_perf.py::test_perf[flan_t5_xl] SKIP (bert_attention_plugin does not support SM >= 100)
@@ -259,11 +260,6 @@ full:B200/perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt] SKIP (ber
 full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074)
 full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074)
 full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074)
-full:B200/perf/test_perf.py::test_perf[roberta_base] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_11b] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_3b] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_base] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[t5_large] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B300/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866)
 full:DGX_H100/kv_cache/test_prefix_aware_scheduling.py::TestServePrefixAwareScheduling::test_multi_round_qa_shared_prefix[swa-chunked] SKIP (https://nvbugs/6136737)
 full:GB200-OCI/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150)
@@ -312,6 +308,8 @@ perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,
 perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP
 perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization)
 perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization)
+perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:20000,2000-reqs:500-con:250] SKIP (https://nvbugs/5304388)
+perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:5000,500-reqs:500-con:250] SKIP (https://nvbugs/5304388)
 perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP
 perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP
 perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP
@@ -320,6 +318,7 @@ perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SK
 perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP
 perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
 perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP
+perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6130334)
 perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215810)
 perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] SKIP (https://nvbugs/6167060)
 perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6190071)

From fea317ce44417135dc2fb7ee5977d6772933b626 Mon Sep 17 00:00:00 2001
From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Date: Mon, 25 May 2026 06:05:15 +0000
Subject: [PATCH 2/6] Refactor model paths into a shared module so QA and CI
 test the same models

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
---
 tests/integration/defs/perf/_model_paths.py   | 146 +++++++++++++++++
 tests/integration/defs/perf/test_perf.py      | 147 +-----------------
 .../integration/defs/perf/test_perf_sanity.py |  25 +--
 3 files changed, 152 insertions(+), 166 deletions(-)
 create mode 100644 tests/integration/defs/perf/_model_paths.py

diff --git a/tests/integration/defs/perf/_model_paths.py b/tests/integration/defs/perf/_model_paths.py
new file mode 100644
index 000000000000..30d28fe15d9b
--- /dev/null
+++ b/tests/integration/defs/perf/_model_paths.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared model path constants for perf and perf-sanity tests."""
+
+# Model PATH of local dir synced from internal LLM models repo
+MODEL_PATH_DICT = {
+    "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B",
+    "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
+    "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
+    "llama_v3.1_8b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4",
+    "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
+    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
+    "llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8",
+    "llama_v3.3_70b_instruct_fp8": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
+    "llama_v3.3_70b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
+    "llama_v3.1_405b_instruct_fp8": "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
+    "llama_v3.1_405b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
+    "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
+    "llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B",
+    "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
+    "llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8",
+    "llama_v3.3_nemotron_super_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
+    "llama_v3.3_nemotron_super_49b_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8",
+    "llama_v3.3_nemotron_super_49b_v1.5_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8",
+    "llama_v3.1_nemotron_ultra_253b": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1",
+    "llama_v3.1_nemotron_ultra_253b_fp8": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
+    "llama_v4_scout_17b_16e_instruct": "llama4-models/Llama-4-Scout-17B-16E-Instruct",
+    "llama_v4_scout_17b_16e_instruct_fp8": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
+    "llama_v4_scout_17b_16e_instruct_fp4": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
+    "llama_v4_maverick_17b_128e_instruct": "llama4-models/Llama-4-Maverick-17B-128E-Instruct",
+    "llama_v4_maverick_17b_128e_instruct_fp8": "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+    "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B",
+    "deepseek_r1_distill_llama_70b": "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/",
+    "gemma_3_27b_it": "gemma/gemma-3-27b-it",
+    "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8",
+    "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4",
+    "gemma_3_12b_it": "gemma/gemma-3-12b-it",
+    "gemma_3_12b_it_fp8": "gemma/gemma-3-12b-it-fp8",
+    "gemma_3_12b_it_fp4": "gemma/gemma-3-12b-it-fp4",
+    "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
+    "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
+    "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
+    "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
+    "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
+    "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
+    "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
+    "qwen2_7b_instruct": "Qwen2-7B-Instruct",
+    "qwen_14b_chat": "Qwen-14B-Chat",
+    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",
+    "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
+    "qwen3_8b": "Qwen3/Qwen3-8B",
+    "qwen3_8b_fp8": "Qwen3/nvidia-Qwen3-8B-FP8",
+    "qwen3_8b_fp4": "Qwen3/nvidia-Qwen3-8B-NVFP4",
+    "qwen3_14b": "Qwen3/Qwen3-14B",
+    "qwen3_14b_fp8": "Qwen3/nvidia-Qwen3-14B-FP8",
+    "qwen3_14b_fp4": "Qwen3/nvidia-Qwen3-14B-NVFP4",
+    "qwen3_30b_a3b": "Qwen3/Qwen3-30B-A3B",
+    "qwen3_30b_a3b_fp4": "Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf",
+    "qwen3_32b": "Qwen3/Qwen3-32B",
+    "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
+    "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
+    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+    "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+    "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct",
+    "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
+    "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
+    "starcoder2_3b": "starcoder2-3b",
+    "phi_4_mini_instruct": "Phi-4-mini-instruct",
+    "phi_4_reasoning_plus": "Phi-4-reasoning-plus",
+    "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8",
+    "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4",
+    "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct",
+    "phi_4_multimodal_instruct_fp4": "multimodals/Phi-4-multimodal-instruct-FP4",
+    "phi_4_multimodal_instruct_fp8": "multimodals/Phi-4-multimodal-instruct-FP8",
+    "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503",
+    "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
+    "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b",
+    "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b",
+    "gpt_oss_120b_eagle3_throughput": "gpt_oss/gpt-oss-120b",
+    "nemotron_nano_3_30b_fp8": "Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev",
+    "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
+    "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
+    "nemotron_3_super_120b_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+    "nemotron_3_super_120b_nvfp4_mtp": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+    # Nemotron-3-Nano-Omni-30B (text + image multimodal)
+    "nemotron_3_nano_omni_nvfp4": "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
+    "nemotron_3_nano_omni_nvfp4_image": "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
+    "kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4",
+    # MiniMax M2.5 (FP8 block-scale, ~230B MoE)
+    "minimax_m2.5_fp8": "MiniMax-M2.5",
+    # Qwen3.5 dense + MoE
+    "qwen3.5_9b": "Qwen3.5-9B",
+    "qwen3.5_27b": "Qwen3.5-27B",
+    "qwen3.5_35b_a3b_fp8": "Qwen3.5-35B-A3B-FP8",
+    "qwen3.5_122b_a10b": "Qwen3.5-122B-A10B",
+    "qwen3.5_397b_a17b_fp8": "Qwen3.5-397B-A17B-FP8",
+    "qwen3.5_397b_a17b_fp4": "Qwen3.5-397B-A17B-NVFP4",
+    # DeepSeek V3.2 (671B MoE)
+    "deepseek_v3.2_fp8": "DeepSeek-V3.2-hf",
+    "deepseek_v3.2_fp4": "DeepSeek-V3.2-NVFP4",
+    # GLM-5 FP8 (MoE)
+    "glm_5_fp8": "GLM-5-FP8",
+    # Kimi K2.5 NVFP4 (~1T MoE multimodal)
+    "kimi_k2.5_fp4": "Kimi-K2.5-NVFP4",
+    # Keys below are sanity-side aliases; some point to the same weights as
+    # entries above but are kept under sanity's historical naming.
+    "deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2",
+    "k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4",
+    "k25_thinking_fp4": "Kimi-K2.5-NVFP4",
+    "super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+    "super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
+    "super_bf16": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
+    "qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8",
+    "glm_5_nvfp4": "GLM-5-NVFP4",
+}
+
+# Model PATH of HuggingFace
+HF_MODEL_PATH = {
+    "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B",
+    "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8",
+    "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+    "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B",
+    "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B",
+    "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
+    "llama_v3.1_nemotron_nano_8b_fp8_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8",
+    "llama_v3.3_nemotron_super_49b_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+    "llama_v3.3_nemotron_super_49b_fp8_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
+    "llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
+    "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct",
+}
+
+LORA_MODEL_PATH = {
+    "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/",
+}
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index 32612df2fe52..df1b01128efb 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,6 +31,7 @@
 
 from ..conftest import (get_device_count, get_llm_root, llm_models_root,
                         trt_environment)
+from ._model_paths import HF_MODEL_PATH, LORA_MODEL_PATH, MODEL_PATH_DICT
 from .pytorch_model_config import get_model_yaml_config
 from .sampler_options_config import get_sampler_options_config
 from .utils import (AbstractPerfScriptTestClass, PerfBenchScriptTestCmds,
@@ -43,150 +44,6 @@
 ALLOWED_CONFIGS_CACHE = None  # Cache to avoid modifying sys.path many times.
 MAP_BY_SOCKET = None
 
-# Model PATH of local dir synced from internal LLM models repo
-MODEL_PATH_DICT = {
-    "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B",
-    "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
-    "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
-    "llama_v3.1_8b_instruct_fp4":
-    "modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4",
-    "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
-    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
-    "llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8",
-    "llama_v3.3_70b_instruct_fp8":
-    "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
-    "llama_v3.3_70b_instruct_fp4":
-    "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
-    "llama_v3.1_405b_instruct_fp8":
-    "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
-    "llama_v3.1_405b_instruct_fp4":
-    "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
-    "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
-    "llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B",
-    "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
-    "llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8",
-    "llama_v3.3_nemotron_super_49b":
-    "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
-    "llama_v3.3_nemotron_super_49b_fp8":
-    "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8",
-    "llama_v3.3_nemotron_super_49b_v1.5_fp8":
-    "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8",
-    "llama_v3.1_nemotron_ultra_253b":
-    "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1",
-    "llama_v3.1_nemotron_ultra_253b_fp8":
-    "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
-    "llama_v4_scout_17b_16e_instruct":
-    "llama4-models/Llama-4-Scout-17B-16E-Instruct",
-    "llama_v4_scout_17b_16e_instruct_fp8":
-    "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
-    "llama_v4_scout_17b_16e_instruct_fp4":
-    "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
-    "llama_v4_maverick_17b_128e_instruct":
-    "llama4-models/Llama-4-Maverick-17B-128E-Instruct",
-    "llama_v4_maverick_17b_128e_instruct_fp8":
-    "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
-    "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B",
-    "deepseek_r1_distill_llama_70b":
-    "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/",
-    "gemma_3_27b_it": "gemma/gemma-3-27b-it",
-    "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8",
-    "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4",
-    "gemma_3_12b_it": "gemma/gemma-3-12b-it",
-    "gemma_3_12b_it_fp8": "gemma/gemma-3-12b-it-fp8",
-    "gemma_3_12b_it_fp4": "gemma/gemma-3-12b-it-fp4",
-    "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
-    "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
-    "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
-    "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
-    "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
-    "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
-    "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
-    "qwen2_7b_instruct": "Qwen2-7B-Instruct",
-    "qwen_14b_chat": "Qwen-14B-Chat",
-    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",
-    "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
-    "qwen3_8b": "Qwen3/Qwen3-8B",
-    "qwen3_8b_fp8": "Qwen3/nvidia-Qwen3-8B-FP8",
-    "qwen3_8b_fp4": "Qwen3/nvidia-Qwen3-8B-NVFP4",
-    "qwen3_14b": "Qwen3/Qwen3-14B",
-    "qwen3_14b_fp8": "Qwen3/nvidia-Qwen3-14B-FP8",
-    "qwen3_14b_fp4": "Qwen3/nvidia-Qwen3-14B-NVFP4",
-    "qwen3_30b_a3b": "Qwen3/Qwen3-30B-A3B",
-    "qwen3_30b_a3b_fp4": "Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf",
-    "qwen3_32b": "Qwen3/Qwen3-32B",
-    "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
-    "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
-    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
-    "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
-    "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct",
-    "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
-    "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
-    "starcoder2_3b": "starcoder2-3b",
-    "phi_4_mini_instruct": "Phi-4-mini-instruct",
-    "phi_4_reasoning_plus": "Phi-4-reasoning-plus",
-    "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8",
-    "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4",
-    "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct",
-    "phi_4_multimodal_instruct_fp4":
-    "multimodals/Phi-4-multimodal-instruct-FP4",
-    "phi_4_multimodal_instruct_fp8":
-    "multimodals/Phi-4-multimodal-instruct-FP8",
-    "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503",
-    "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
-    "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b",
-    "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b",
-    "gpt_oss_120b_eagle3_throughput": "gpt_oss/gpt-oss-120b",
-    "nemotron_nano_3_30b_fp8": "Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev",
-    "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
-    "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
-    "nemotron_3_super_120b_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
-    "nemotron_3_super_120b_nvfp4_mtp":
-    "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
-    # Nemotron-3-Nano-Omni-30B (text + image multimodal)
-    "nemotron_3_nano_omni_nvfp4":
-    "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
-    "nemotron_3_nano_omni_nvfp4_image":
-    "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
-    "kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4",
-    # MiniMax M2.5 (FP8 block-scale, ~230B MoE)
-    "minimax_m2.5_fp8": "MiniMax-M2.5",
-    # Qwen3.5 dense + MoE
-    "qwen3.5_9b": "Qwen3.5-9B",
-    "qwen3.5_27b": "Qwen3.5-27B",
-    "qwen3.5_35b_a3b_fp8": "Qwen3.5-35B-A3B-FP8",
-    "qwen3.5_122b_a10b": "Qwen3.5-122B-A10B",
-    "qwen3.5_397b_a17b_fp8": "Qwen3.5-397B-A17B-FP8",
-    "qwen3.5_397b_a17b_fp4": "Qwen3.5-397B-A17B-NVFP4",
-    # DeepSeek V3.2 (671B MoE)
-    "deepseek_v3.2_fp8": "DeepSeek-V3.2-hf",
-    "deepseek_v3.2_fp4": "DeepSeek-V3.2-NVFP4",
-    # GLM-5 FP8 (MoE)
-    "glm_5_fp8": "GLM-5-FP8",
-    # Kimi K2.5 NVFP4 (~1T MoE multimodal)
-    "kimi_k2.5_fp4": "Kimi-K2.5-NVFP4",
-}
-# Model PATH of HuggingFace
-HF_MODEL_PATH = {
-    "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B",
-    "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8",
-    "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-    "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B",
-    "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B",
-    "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
-    "llama_v3.1_nemotron_nano_8b_fp8_hf":
-    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8",
-    "llama_v3.3_nemotron_super_49b_hf":
-    "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
-    "llama_v3.3_nemotron_super_49b_fp8_hf":
-    "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
-    "llama_v3.1_nemotron_ultra_253b_fp8_hf":
-    "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
-    "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct",
-}
-LORA_MODEL_PATH = {
-    "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/",
-}
-
 TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")
 
 NEMOTRON_SUPER_MODELS = {
diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py
index 4d9882259f46..40deafd9b15f 100644
--- a/tests/integration/defs/perf/test_perf_sanity.py
+++ b/tests/integration/defs/perf/test_perf_sanity.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -34,30 +34,13 @@
 from tensorrt_llm._utils import get_free_port
 
 from ..conftest import get_llm_root, llm_models_root
+from ._model_paths import MODEL_PATH_DICT as _MODEL_PATH_DICT_BASE
 from .perf_regression_utils import process_and_upload_test_results
 
-# Model PATH of local dir synced from internal LLM models repo
+# "llama_v3.3_70b_instruct_fp4" differs from test_perf on the sanity side; keep historical value.
 MODEL_PATH_DICT = {
-    "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
-    "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
-    "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
-    "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
-    "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
-    "deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2",
-    "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
-    "k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4",
-    "k25_thinking_fp4": "Kimi-K2.5-NVFP4",
-    "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",  # Qwen3-235B-A22B-FP4
-    "super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",  # Super (Nemotron-H SSM+MoE) NvFP4
-    "super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
-    "super_bf16": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
-    "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",  # Qwen3-235B-A22B-FP8
-    "qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8",
+    **_MODEL_PATH_DICT_BASE,
     "llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4",
-    "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
-    "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
-    "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
-    "glm_5_nvfp4": "GLM-5-NVFP4",
 }
 
 SUPPORTED_GPU_MAPPING = {

From 204b9cd3bcfc682b8c0dbf4234c9801f2cfabf0c Mon Sep 17 00:00:00 2001
From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Date: Mon, 25 May 2026 07:00:50 +0000
Subject: [PATCH 3/6] test: remove dead bert_attention_plugin SM>=100 waivers

These 7 waivers referenced perf tests (bart_large_cnn, bert_large,
flan_t5_base/large/xl/xxl, mbart_large_50_many_to_one_mmt) that no
longer appear in any test-db yaml on main. Drop them to keep the
cleanup consistent with the 5 sibling waivers (roberta_base, t5_*)
that were already removed in this PR.

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 1fa2ca62809e..1440b04bf34c 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -249,14 +249,7 @@ full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954)
 full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6185480)
 full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6185480)
 full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150)
-full:B200/perf/test_perf.py::test_perf[bart_large_cnn] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[bert_large] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugs/5150255)
-full:B200/perf/test_perf.py::test_perf[flan_t5_base] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[flan_t5_large] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[flan_t5_xl] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[flan_t5_xxl] SKIP (bert_attention_plugin does not support SM >= 100)
-full:B200/perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt] SKIP (bert_attention_plugin does not support SM >= 100)
 full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074)
 full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074)
 full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074)

From 2109fc88f7ec05310b5db25077a115129c19bad9 Mon Sep 17 00:00:00 2001
From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Date: Mon, 25 May 2026 07:49:57 +0000
Subject: [PATCH 4/6] test: remove 4 perf waivers per author confirmation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the 4 perf waivers that the PR originally added — author confirmed
that waiving for the underlying nvbugs (5150255 / 5304388 / 6130334) is
no longer necessary.

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 1440b04bf34c..a53606807422 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -249,7 +249,6 @@ full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954)
 full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6185480)
 full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6185480)
 full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150)
-full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugs/5150255)
 full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074)
 full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074)
 full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074)
@@ -301,8 +300,6 @@ perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,
 perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP
 perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization)
 perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:20000,2000-reqs:500-con:250] SKIP (https://nvbugs/5304388)
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:5000,500-reqs:500-con:250] SKIP (https://nvbugs/5304388)
 perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP
 perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP
 perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP
@@ -311,7 +308,6 @@ perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SK
 perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP
 perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
 perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP
-perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6130334)
 perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215810)
 perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] SKIP (https://nvbugs/6167060)
 perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6190071)

From 743679892a01134892339b9a4a0a2564939b7821 Mon Sep 17 00:00:00 2001
From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Date: Tue, 26 May 2026 12:11:52 +0000
Subject: [PATCH 5/6] test: add new models to TRUST_REMOTE_CODE_MODELS in
 test_perf.py

Add the models "nemotron_nano_12b_v2", "phi_4_multimodal_instruct", "phi_4_multimodal_instruct_fp4", and "phi_4_multimodal_instruct_fp8" to the TRUST_REMOTE_CODE_MODELS set to enhance testing coverage.

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
---
 tests/integration/defs/perf/test_perf.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index df1b01128efb..9550f1f502a0 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -64,6 +64,10 @@
     "glm_5_fp8",
     "nemotron_3_nano_omni_nvfp4",
     "nemotron_3_nano_omni_nvfp4_image",
+    "nemotron_nano_12b_v2",
+    "phi_4_multimodal_instruct",
+    "phi_4_multimodal_instruct_fp4",
+    "phi_4_multimodal_instruct_fp8",
 }
 
 # Models that use random_image dataset in serve mode benchmarks.

From e944483b7bee0dbd59bdf275f662732c4dec7b64 Mon Sep 17 00:00:00 2001
From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Date: Fri, 29 May 2026 13:02:35 +0000
Subject: [PATCH 6/6] Enhance performance configurations for
 Nemotron-3-Super-120B-NVFP4 models in pytorch_model_config.py and update
 test_perf.py to include new spec-decoding models. Added configurations for
 streaming and throughput variants, ensuring better performance tuning.
 Adjusted test conditions in llm_perf_core.yml to reflect new model tests and
 conditions for GPU capabilities.

Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
---
 .../defs/perf/pytorch_model_config.py         |  65 +++++-
 tests/integration/defs/perf/test_perf.py      |  16 +-
 .../test_lists/qa/llm_perf_core.yml           | 204 ++++++------------
 3 files changed, 139 insertions(+), 146 deletions(-)

diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index 7fc7d9a67c53..bb6514ddac34 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -458,9 +458,12 @@ def get_model_yaml_config(model_label: str,
                 },
             }
         },
-        # Nemotron-3-Super-120B-NVFP4: (no MTP)
+        # Nemotron-3-Super-120B-NVFP4 (streaming/low-latency variant for spark perf)
+        # Streaming serve cases use small cuda_graph batch and no attention DP for latency.
         {
-            'patterns': ['nemotron_3_super_120b_nvfp4-'],
+            'patterns': [
+                'nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-',
+            ],
             'config': {
                 'max_seq_len': 1048576,
                 'enable_chunked_prefill': True,
@@ -481,9 +484,11 @@ def get_model_yaml_config(model_label: str,
                 },
             }
         },
-        # Nemotron-3-Super-120B-NVFP4: MTP speculative decoding
+        # Nemotron-3-Super-120B-NVFP4_MTP (streaming/low-latency variant for spark perf)
         {
-            'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
+            'patterns': [
+                'nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-',
+            ],
             'config': {
                 'max_seq_len': 1048576,
                 'enable_chunked_prefill': True,
@@ -509,6 +514,58 @@ def get_model_yaml_config(model_label: str,
                 },
             }
         },
+        # Nemotron-3-Super-120B-NVFP4 (throughput variant, aligned with curated yaml)
+        # Non-streaming cases use attention DP and larger cuda_graph batch for throughput.
+        {
+            'patterns': ['nemotron_3_super_120b_nvfp4-'],
+            'config': {
+                'max_seq_len': 1048576,
+                'enable_chunked_prefill': True,
+                'enable_attention_dp': True,
+                'stream_interval': 1,
+                'moe_config': {
+                    'backend': 'CUTLASS',
+                },
+                'cuda_graph_config': {
+                    'enable_padding': True,
+                    'max_batch_size': 256,
+                },
+                'kv_cache_config': {
+                    'enable_block_reuse': False,
+                    'mamba_ssm_cache_dtype': 'float16',
+                    'mamba_ssm_stochastic_rounding': True,
+                    'mamba_ssm_philox_rounds': 5,
+                },
+            }
+        },
+        # Nemotron-3-Super-120B-NVFP4_MTP (throughput variant with MTP spec decoding)
+        {
+            'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
+            'config': {
+                'max_seq_len': 1048576,
+                'enable_chunked_prefill': True,
+                'enable_attention_dp': True,
+                'stream_interval': 1,
+                'moe_config': {
+                    'backend': 'CUTLASS',
+                },
+                'cuda_graph_config': {
+                    'enable_padding': True,
+                    'max_batch_size': 256,
+                },
+                'kv_cache_config': {
+                    'enable_block_reuse': False,
+                    'mamba_ssm_cache_dtype': 'float16',
+                    'mamba_ssm_stochastic_rounding': True,
+                    'mamba_ssm_philox_rounds': 5,
+                },
+                'speculative_config': {
+                    'decoding_type': 'MTP',
+                    'num_nextn_predict_layers': 3,
+                    'allow_advanced_sampling': True,
+                },
+            }
+        },
     ]
 
     # Apply pattern-based configurations on top of base config
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index 9550f1f502a0..884cc2d61a60 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -88,6 +88,17 @@
     "nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail",
 }
 
+# All spec-decoding models (MTP, Eagle3, etc.). Used to skip --ignore-eos in
+# benchmark client commands: forcing generation past EOS produces unstable
+# acceptance rates for spec-dec.
+SPEC_DEC_MODELS = {
+    "qwen3_4b_eagle3",
+    "qwen3_235b_a22b_fp4_eagle3",
+    "gpt_oss_120b_eagle3",
+    "gpt_oss_120b_eagle3_throughput",
+    *SPEC_DEC_REAL_DATASET_MODELS,
+}
+
 # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)
 AUTODEPLOY_MODEL_CONFIGS = {
     "nemotron_nano_3_30b_fp8": "examples/auto_deploy/nano_v3.yaml",
@@ -1393,12 +1404,15 @@ def get_trtllm_serve_client_command(self,
             model_dir,
             "--num-prompts",
             str(self._config.num_reqs),
-            "--ignore-eos",
             "--tokenize-on-client",
             "--no-test-input",
             "--percentile-metrics",
             "ttft,tpot,itl,e2el",
         ]
+        # --ignore-eos must be off for spec-decoding models: forcing generation
+        # past EOS produces unstable acceptance rates.
+        if self._config.model_name not in SPEC_DEC_MODELS:
+            client_cmd.append("--ignore-eos")
         if self._config.model_name in OPENAI_CHAT_BACKEND_MODELS:
             client_cmd += ["--backend", "openai-chat"]
         if real_dataset_path:
diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml
index 05d63f11d4e3..92f2d900bc9e 100644
--- a/tests/integration/test_lists/qa/llm_perf_core.yml
+++ b/tests/integration/test_lists/qa/llm_perf_core.yml
@@ -3,19 +3,16 @@ llm_perf_core:
 # ===============================================================================
 # Test Conditions Index
 # ===============================================================================
-# 1: All GPUs common tests(L20, L40S, H100, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases)
-# 2: L20, L40S, H100, H20, H200
-# 3: L40S, H100, H20, H200
-# 4: H100, H20, H200 test cases
-# 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
-# 6: GB200, B200, B300, GB300, RTX6000-Server test cases
-# 7: B200, GB200, B300, GB300 test cases
-# 8: B200, B300 test cases
-# 9: H100, H20, H200, B200, B300 test cases
-# 10: H20, H200, B200, B300 test cases
-# 11: RTX-6000D, RTX-6000 Server test cases
-# 12: RTX6000-Server
-# 13: A100 (cc 8.0, BF16 dense only, A100-80G)
+# 1: All GPUs common tests(L40S, H100, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases)
+# 2: L40S, H100, H20, H200
+# 3: H100, H20, H200 test cases
+# 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
+# 5: GB200, B200, B300, GB300, RTX6000-Server, RTX6000-D test cases
+# 6: B200, GB200, B300, GB300 test cases
+# 7: B200, B300 test cases
+# 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
+# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
+# 10: RTX-6000D, RTX-6000 Server test cases
 # ===============================================================================
 
 
@@ -30,15 +27,22 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128]
+  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000]
+  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500]
+  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000]
+  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000]
+  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency
+  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput
 
 
-# 2: L20, L40S, H100, H20, H200
+# 2: L40S, H100, H20, H200
 - condition:
     ranges:
       system_gpu_count:
         gte: 2
       compute_capability:
-        lt: 10.0
+        lte: 9.0
   tests:
   #llama_v3.1_8b
   #pytorch backend
@@ -55,18 +59,23 @@ llm_perf_core:
   #nemotron_nano_12b_v2
   - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-con:1] #min_latency
   - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput
-  - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128]
-
-
-# 3: L40S, H100, H20, H200
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 4
-      compute_capability:
-        gt: 8.0
-        lte: 9.0
-  tests:
+  - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128]
+  #qwen3.5_27b (dense BF16 52G, 2-GPU)
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
+  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
+  #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU)
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,32-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
@@ -88,11 +97,10 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:4-gpus:4] #min_latency
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:4-gpus:4] #max_throughput
 
 
-# 4: H100, H20, H200 test cases
+# 3: H100, H20, H200 test cases
 - condition:
     ranges:
       system_gpu_count:
@@ -112,17 +120,9 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:64]
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:128,128]
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:256]
-  #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU)
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput
 
 
-# 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
+# 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
 - condition:
     ranges:
       system_gpu_count:
@@ -136,14 +136,6 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency
   - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput
   - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-streaming-bfloat16-input_output_len:500,2000-con:250] #max_throughput streaming
-  #qwen3.5_9b (dense BF16 19G, 1-GPU)
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput
   #qwen3.5_27b (dense BF16 52G, 1-GPU)
   - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000]
@@ -168,9 +160,17 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:2-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
+  #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU)
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency
+  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput
 
 
-# 6: GB200, B200, B300, GB300, RTX6000-Server test cases
+# 5: GB200, B200, B300, GB300, RTX6000-Server, RTX6000-D test cases
 - condition:
     ranges:
       system_gpu_count:
@@ -195,17 +195,9 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:1000-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120)
   - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4]
-  #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU, covers GB200/GB300)
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency
-  - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput
 
 
-# 7: B200, GB200, B300, GB300 test cases
+# 6: B200, GB200, B300, GB300 test cases
 - condition:
     ranges:
       system_gpu_count:
@@ -229,7 +221,7 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,1000-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:4-gpus:4] #min_latency
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:256-tp:4-gpus:4] #max_throughput
   #deepseek_v3.2_fp4 (FP4 389G, 4-GPU)
@@ -256,9 +248,13 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:4-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:4-tp:4-gpus:4] #max_throughput
+  #nemotron_3_super_120b_nvfp4 (Hybrid MoE+SSM+Attn FP4 76G, 4-GPU ep=4 tp=4, throughput config)
+  - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:5-con:1-ep:4-tp:4-gpus:4] #min_latency
+  - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:160-con:32-ep:4-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:640-con:128-ep:4-tp:4-gpus:4] #max_throughput
 
 
-# 8: B200, B300 test cases
+# 7: B200, B300 test cases
 - condition:
     ranges:
       system_gpu_count:
@@ -275,7 +271,7 @@ llm_perf_core:
   # gpt_oss_120b_fp4
   - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120)
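+  - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120)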
   - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180)
   - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8]
@@ -309,19 +305,19 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[deepseek_v3.2_fp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,1000-con:3072-ep:8-tp:8-gpus:8] TIMEOUT(120) #max_throughput
 
 
-# 9: H100, H20, H200, B200, B300 test cases
+# 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
 - condition:
     ranges:
       system_gpu_count:
         gte: 8
       compute_capability:
         gte: 9.0
-        lt: 12.0
+        lte: 12.0
   tests:
   #llama_v3.3_70b_instruct_fp8
   #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120)
-  #minimax_m2.5 (FP8 216G, 8-GPU)
+  #minimax_m2.5 (FP8 216G, 8-GPU)
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:8-gpus:8]
@@ -354,15 +350,14 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
 
-
-# 10: H20, H200, B200, B300 test cases
+# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases
 - condition:
     ranges:
       system_gpu_count:
         gte: 8
       compute_capability:
         gte: 9.0
-        lt: 12.0
+        lte: 12.0
       gpu_memory:
         gt: 90000
   tests:
@@ -387,12 +382,11 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
   - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
 
-
-# 11: RTX-6000D, RTX-6000 Server test cases
+# 10: RTX-6000D, RTX-6000 Server test cases
 - condition:
     ranges:
       system_gpu_count:
-        gte: 2
+        gte: 4
       compute_capability:
         gte: 12.0
         lte: 12.0
@@ -411,82 +405,10 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8-tp:2-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:500,2000-tp:2-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:1000,1000-tp:2-gpus:2]
-  #llama_v3.3_nemotron_super_49b_fp8 (nemotron-nas FP8 49G, 2-GPU)
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
-
-
-# 12: RTX6000-Server test cases
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 8
-      compute_capability:
-        gte: 12.0
-        lte: 12.0
-  tests:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:8-tp:8-gpus:8]
   # deepseek_r1_0528
   - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] TIMEOUT(120)
-  - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-maxnt:5220-input_output_len:4000,2000-reqs:512-ep:8-tp:8-gpus:8]
-  #qwen3.5_397b_a17b_fp8 (MoE FP8 380G, 8-GPU ep=8)
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,1000-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
-  #qwen3.5_397b_a17b_fp4 (MoE FP4 234G, 8-GPU ep=8)
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:128,128-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:500,2000-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:2000,500-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,1000-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:8-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency
-  - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput
-
-
-# 13: A100 test cases (cc 8.0, BF16 dense only, A100-80G)
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 2
-      compute_capability:
-        gte: 8.0
-        lt: 9.0
-      gpu_memory:
-        gt: 40000
-  tests:
-  #qwen3.5_9b (dense BF16 19G, 1-GPU)
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000]
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency
-  - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput
-  #qwen3.5_27b (dense BF16 52G, 2-GPU for A100-80G safety)
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
-  - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
-  #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU)
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput
+  - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-maxnt:5220-input_output_len:4000,2000-reqs:512-ep:8-tp:8-gpus:8] #max_throughput