From 77634e622e7db2a5093ee8c142f912b6b7e9c111 Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Tue, 19 May 2026 11:30:27 +0000 Subject: [PATCH 1/6] [None][refactor] Update model path definitions in test_perf.py and clean up waives.txt Removed outdated model paths and unnecessary entries from MODEL_PATH_DICT in test_perf.py. Updated waives.txt to reflect the removal of tests that are no longer applicable, improving clarity and maintainability. Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> --- tests/integration/defs/perf/test_perf.py | 78 +----------------------- tests/integration/test_lists/waives.txt | 9 ++- 2 files changed, 5 insertions(+), 82 deletions(-) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 33e3be93cb1d..32612df2fe52 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -45,9 +45,6 @@ # Model PATH of local dir synced from internal LLM models repo MODEL_PATH_DICT = { - "llama_v2_7b": "llama-models-v2/llama-v2-7b-hf", # not safetensors repo - "llama_v2_13b": "llama-models-v2/llama-v2-13b-hf", # not safetensors repo - "llama_v2_70b": "llama-models-v2/llama-v2-70b-hf", # not safetensors repo "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B", "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8", @@ -88,20 +85,9 @@ "llama4-models/Llama-4-Maverick-17B-128E-Instruct", "llama_v4_maverick_17b_128e_instruct_fp8": "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", - "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1", - "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1", - "mixtral_8x7b_v0.1_instruct_fp8": "Mixtral-8x7B-Instruct-v0.1-fp8", - "mixtral_8x7b_v0.1_instruct_fp4": - "modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4", - "mistral_nemo_12b_base": "Mistral-Nemo-Base-2407", 
"deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B", "deepseek_r1_distill_llama_70b": "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/", - "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1", - "mistral_7b_v0.1": "mistral-7b-v0.1", - "ministral_8b": "Ministral-8B-Instruct-2410", - "ministral_8b_fp8": "Ministral-8B-Instruct-2410-FP8", - "gemma_3_1b_it": "gemma/gemma-3-1b-it", "gemma_3_27b_it": "gemma/gemma-3-27b-it", "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8", "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4", @@ -136,43 +122,15 @@ "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8", "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4", "starcoder2_3b": "starcoder2-3b", - "starcoder2_7b": "starcoder2-7b", - "starcoder2_15b": "starcoder2-15b", - "t5": "t5-small", # not supported for trtllm-bench build config - "flan_t5_base": - "flan-t5-small", # not supported for trtllm-bench build config - "flan_t5_large": - "flan-t5-xl", # not supported for trtllm-bench build config - "whisper_large_v3": - "whisper-models/large-v3", # not supported for trtllm-bench tokenizer - "bart_large_cnn": "bart-large-cnn", # not safetensors repo - "mbart_large_50_many_to_one_mmt": "mbart-large-50-many-to-one-mmt", - "mamba_130m": "mamba/mamba-130m-hf", - "mamba_370m": "mamba/mamba-370m-hf", - "mamba_2.8b": "mamba/mamba-2.8b-hf", - "gpt_20b": "gpt-neox-20b", - "gpt_350m_moe": "gpt2-medium", "phi_4_mini_instruct": "Phi-4-mini-instruct", "phi_4_reasoning_plus": "Phi-4-reasoning-plus", "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8", "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4", "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", - "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct", - "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_fp4": "multimodals/Phi-4-multimodal-instruct-FP4", - 
"phi_4_multimodal_instruct_fp4_image": - "multimodals/Phi-4-multimodal-instruct-FP4", - "phi_4_multimodal_instruct_fp4_audio": - "multimodals/Phi-4-multimodal-instruct-FP4", - "phi_4_multimodal_instruct_fp8_image": - "multimodals/Phi-4-multimodal-instruct-FP8", - "phi_4_multimodal_instruct_fp8_audio": - "multimodals/Phi-4-multimodal-instruct-FP8", "phi_4_multimodal_instruct_fp8": "multimodals/Phi-4-multimodal-instruct-FP8", - "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct", - "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8", "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b", @@ -202,7 +160,6 @@ # DeepSeek V3.2 (671B MoE) "deepseek_v3.2_fp8": "DeepSeek-V3.2-hf", "deepseek_v3.2_fp4": "DeepSeek-V3.2-NVFP4", - "deepseek_v3.2_exp_fp4_v2": "DeepSeek-V3.2-Exp-FP4-v2", # GLM-5 FP8 (MoE) "glm_5_fp8": "GLM-5-FP8", # Kimi K2.5 NVFP4 (~1T MoE multimodal) @@ -210,15 +167,9 @@ } # Model PATH of HuggingFace HF_MODEL_PATH = { - "llama_v2_7b_hf": "meta-llama/Llama-2-7b-hf", - "llama_v2_70b_hf": "meta-llama/Llama-2-70b-hf", - "falcon_180b_hf": "tiiuae/falcon-180B", - "gptj_6b_hf": "EleutherAI/gpt-j-6b", - "llama_v3_8b_hf": "meta-llama/Meta-Llama-3-8B", "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B", "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8", "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "llama_v3_70b_hf": "meta-llama/Meta-Llama-3-70B", "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B", "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B", "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1", @@ -230,37 +181,10 @@ "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8", "llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", - "mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1", - "mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "mistral_7b_v0.1_hf": 
"mistralai/Mistral-7B-v0.1", - "ministral_8b_hf": "mistralai/Ministral-8B-Instruct-2410", - "flan_t5_base_hf": "google/flan-t5-small", "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct", - "gemma_3_1b_it_hf": "google/gemma-3-1b-it", } LORA_MODEL_PATH = { - "llama_v2_13b": - "llama-models-v2/chinese-llama-2-lora-13b", - "mixtral_8x7b_v0.1": - "chinese-mixtral-lora", - "llama_v3.1_8b_instruct_fp8": - "lora/llama-3-chinese-8b-instruct-v2-lora/", - "ministral_8b": - "lora/ministral/Ministral-8B-Instruct-2410-Loras-Dummy", # Dummy LoRA for Ministral - "gemma_3_1b_it": - "lora/gemma/gemma-3-1b-it-dummy-lora", # Dummy LoRA for Gemma-3-1B-Instruct - "phi_4_multimodal_instruct_image": - "multimodals/Phi-4-multimodal-instruct/vision-lora", - "phi_4_multimodal_instruct_audio": - "multimodals/Phi-4-multimodal-instruct/speech-lora", - "phi_4_multimodal_instruct_fp4_image": - "multimodals/Phi-4-multimodal-instruct-FP4/vision-lora", - "phi_4_multimodal_instruct_fp4_audio": - "multimodals/Phi-4-multimodal-instruct-FP4/speech-lora", - "phi_4_multimodal_instruct_fp8_image": - "multimodals/Phi-4-multimodal-instruct-FP8/vision-lora", - "phi_4_multimodal_instruct_fp8_audio": - "multimodals/Phi-4-multimodal-instruct-FP8/speech-lora", + "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/", } TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "") diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 541a1e5a0a57..1fa2ca62809e 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -251,6 +251,7 @@ full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[tr full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150) full:B200/perf/test_perf.py::test_perf[bart_large_cnn] SKIP (bert_attention_plugin does not support SM >= 100) full:B200/perf/test_perf.py::test_perf[bert_large] SKIP 
(bert_attention_plugin does not support SM >= 100) +full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugs/5150255) full:B200/perf/test_perf.py::test_perf[flan_t5_base] SKIP (bert_attention_plugin does not support SM >= 100) full:B200/perf/test_perf.py::test_perf[flan_t5_large] SKIP (bert_attention_plugin does not support SM >= 100) full:B200/perf/test_perf.py::test_perf[flan_t5_xl] SKIP (bert_attention_plugin does not support SM >= 100) @@ -259,11 +260,6 @@ full:B200/perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt] SKIP (ber full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074) full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074) full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074) -full:B200/perf/test_perf.py::test_perf[roberta_base] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_11b] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_3b] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_base] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[t5_large] SKIP (bert_attention_plugin does not support SM >= 100) full:B300/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866) full:DGX_H100/kv_cache/test_prefix_aware_scheduling.py::TestServePrefixAwareScheduling::test_multi_round_qa_shared_prefix[swa-chunked] SKIP (https://nvbugs/6136737) full:GB200-OCI/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150) @@ -312,6 +308,8 @@ perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128, 
perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization) perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization) +perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:20000,2000-reqs:500-con:250] SKIP (https://nvbugs/5304388) +perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:5000,500-reqs:500-con:250] SKIP (https://nvbugs/5304388) perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP @@ -320,6 +318,7 @@ perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SK perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP +perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6130334) perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215810) perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] SKIP (https://nvbugs/6167060) 
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6190071) From fea317ce44417135dc2fb7ee5977d6772933b626 Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Mon, 25 May 2026 06:05:15 +0000 Subject: [PATCH 2/6] Refactor model_path to ensure QA and CI have the same testing models Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> --- tests/integration/defs/perf/_model_paths.py | 146 +++++++++++++++++ tests/integration/defs/perf/test_perf.py | 147 +----------------- .../integration/defs/perf/test_perf_sanity.py | 25 +-- 3 files changed, 152 insertions(+), 166 deletions(-) create mode 100644 tests/integration/defs/perf/_model_paths.py diff --git a/tests/integration/defs/perf/_model_paths.py b/tests/integration/defs/perf/_model_paths.py new file mode 100644 index 000000000000..30d28fe15d9b --- /dev/null +++ b/tests/integration/defs/perf/_model_paths.py @@ -0,0 +1,146 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Shared model path constants for perf and perf-sanity tests.""" + +# Model PATH of local dir synced from internal LLM models repo +MODEL_PATH_DICT = { + "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B", + "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", + "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8", + "llama_v3.1_8b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4", + "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B", + "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", + "llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8", + "llama_v3.3_70b_instruct_fp8": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8", + "llama_v3.3_70b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4", + "llama_v3.1_405b_instruct_fp8": "llama-3.1-model/Llama-3.1-405B-Instruct-FP8", + "llama_v3.1_405b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4", + "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct", + "llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B", + "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1", + "llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8", + "llama_v3.3_nemotron_super_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1", + "llama_v3.3_nemotron_super_49b_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8", + "llama_v3.3_nemotron_super_49b_v1.5_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8", + "llama_v3.1_nemotron_ultra_253b": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1", + "llama_v3.1_nemotron_ultra_253b_fp8": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", + "llama_v4_scout_17b_16e_instruct": "llama4-models/Llama-4-Scout-17B-16E-Instruct", + "llama_v4_scout_17b_16e_instruct_fp8": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8", + "llama_v4_scout_17b_16e_instruct_fp4": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4", + 
"llama_v4_maverick_17b_128e_instruct": "llama4-models/Llama-4-Maverick-17B-128E-Instruct", + "llama_v4_maverick_17b_128e_instruct_fp8": "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", + "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B", + "deepseek_r1_distill_llama_70b": "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/", + "gemma_3_27b_it": "gemma/gemma-3-27b-it", + "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8", + "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4", + "gemma_3_12b_it": "gemma/gemma-3-12b-it", + "gemma_3_12b_it_fp8": "gemma/gemma-3-12b-it-fp8", + "gemma_3_12b_it_fp4": "gemma/gemma-3-12b-it-fp4", + "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1", + "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", + "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", + "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", + "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/", + "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", + "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only", + "qwen2_7b_instruct": "Qwen2-7B-Instruct", + "qwen_14b_chat": "Qwen-14B-Chat", + "qwen3_0.6b": "Qwen3/Qwen3-0.6B", + "qwen3_4b_eagle3": "Qwen3/Qwen3-4B", + "qwen3_8b": "Qwen3/Qwen3-8B", + "qwen3_8b_fp8": "Qwen3/nvidia-Qwen3-8B-FP8", + "qwen3_8b_fp4": "Qwen3/nvidia-Qwen3-8B-NVFP4", + "qwen3_14b": "Qwen3/Qwen3-14B", + "qwen3_14b_fp8": "Qwen3/nvidia-Qwen3-14B-FP8", + "qwen3_14b_fp4": "Qwen3/nvidia-Qwen3-14B-NVFP4", + "qwen3_30b_a3b": "Qwen3/Qwen3-30B-A3B", + "qwen3_30b_a3b_fp4": "Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf", + "qwen3_32b": "Qwen3/Qwen3-32B", + "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4", + "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", + "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", + "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", + "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct", + "qwen2_5_vl_7b_instruct_fp8": 
"multimodals/Qwen2.5-VL-7B-Instruct-FP8", + "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4", + "starcoder2_3b": "starcoder2-3b", + "phi_4_mini_instruct": "Phi-4-mini-instruct", + "phi_4_reasoning_plus": "Phi-4-reasoning-plus", + "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8", + "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4", + "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", + "phi_4_multimodal_instruct_fp4": "multimodals/Phi-4-multimodal-instruct-FP4", + "phi_4_multimodal_instruct_fp8": "multimodals/Phi-4-multimodal-instruct-FP8", + "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", + "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", + "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b", + "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b", + "gpt_oss_120b_eagle3_throughput": "gpt_oss/gpt-oss-120b", + "nemotron_nano_3_30b_fp8": "Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev", + "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2", + "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4", + "nemotron_3_super_120b_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "nemotron_3_super_120b_nvfp4_mtp": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + # Nemotron-3-Nano-Omni-30B (text + image multimodal) + "nemotron_3_nano_omni_nvfp4": "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "nemotron_3_nano_omni_nvfp4_image": "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4", + # MiniMax M2.5 (FP8 block-scale, ~230B MoE) + "minimax_m2.5_fp8": "MiniMax-M2.5", + # Qwen3.5 dense + MoE + "qwen3.5_9b": "Qwen3.5-9B", + "qwen3.5_27b": "Qwen3.5-27B", + "qwen3.5_35b_a3b_fp8": "Qwen3.5-35B-A3B-FP8", + "qwen3.5_122b_a10b": "Qwen3.5-122B-A10B", + "qwen3.5_397b_a17b_fp8": "Qwen3.5-397B-A17B-FP8", + "qwen3.5_397b_a17b_fp4": "Qwen3.5-397B-A17B-NVFP4", + # DeepSeek V3.2 (671B MoE) + "deepseek_v3.2_fp8": "DeepSeek-V3.2-hf", + "deepseek_v3.2_fp4": 
"DeepSeek-V3.2-NVFP4", + # GLM-5 FP8 (MoE) + "glm_5_fp8": "GLM-5-FP8", + # Kimi K2.5 NVFP4 (~1T MoE multimodal) + "kimi_k2.5_fp4": "Kimi-K2.5-NVFP4", + # Keys below are sanity-side aliases; some point to the same weights as + # entries above but are kept under sanity's historical naming. + "deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2", + "k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4", + "k25_thinking_fp4": "Kimi-K2.5-NVFP4", + "super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8", + "super_bf16": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", + "qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8", + "glm_5_nvfp4": "GLM-5-NVFP4", +} + +# Model PATH of HuggingFace +HF_MODEL_PATH = { + "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B", + "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8", + "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B", + "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B", + "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1", + "llama_v3.1_nemotron_nano_8b_fp8_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8", + "llama_v3.3_nemotron_super_49b_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "llama_v3.3_nemotron_super_49b_fp8_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8", + "llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", + "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct", +} + +LORA_MODEL_PATH = { + "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/", +} diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 32612df2fe52..df1b01128efb 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,6 +31,7 @@ from ..conftest import (get_device_count, get_llm_root, llm_models_root, trt_environment) +from ._model_paths import HF_MODEL_PATH, LORA_MODEL_PATH, MODEL_PATH_DICT from .pytorch_model_config import get_model_yaml_config from .sampler_options_config import get_sampler_options_config from .utils import (AbstractPerfScriptTestClass, PerfBenchScriptTestCmds, @@ -43,150 +44,6 @@ ALLOWED_CONFIGS_CACHE = None # Cache to avoid modifying sys.path many times. MAP_BY_SOCKET = None -# Model PATH of local dir synced from internal LLM models repo -MODEL_PATH_DICT = { - "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B", - "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", - "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8", - "llama_v3.1_8b_instruct_fp4": - "modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4", - "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B", - "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", - "llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8", - "llama_v3.3_70b_instruct_fp8": - "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8", - "llama_v3.3_70b_instruct_fp4": - "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4", - "llama_v3.1_405b_instruct_fp8": - "llama-3.1-model/Llama-3.1-405B-Instruct-FP8", - "llama_v3.1_405b_instruct_fp4": - "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4", - "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct", - "llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B", - "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1", - "llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8", - "llama_v3.3_nemotron_super_49b": - "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1", - 
"llama_v3.3_nemotron_super_49b_fp8": - "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8", - "llama_v3.3_nemotron_super_49b_v1.5_fp8": - "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8", - "llama_v3.1_nemotron_ultra_253b": - "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1", - "llama_v3.1_nemotron_ultra_253b_fp8": - "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", - "llama_v4_scout_17b_16e_instruct": - "llama4-models/Llama-4-Scout-17B-16E-Instruct", - "llama_v4_scout_17b_16e_instruct_fp8": - "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8", - "llama_v4_scout_17b_16e_instruct_fp4": - "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4", - "llama_v4_maverick_17b_128e_instruct": - "llama4-models/Llama-4-Maverick-17B-128E-Instruct", - "llama_v4_maverick_17b_128e_instruct_fp8": - "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", - "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B", - "deepseek_r1_distill_llama_70b": - "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/", - "gemma_3_27b_it": "gemma/gemma-3-27b-it", - "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8", - "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4", - "gemma_3_12b_it": "gemma/gemma-3-12b-it", - "gemma_3_12b_it_fp8": "gemma/gemma-3-12b-it-fp8", - "gemma_3_12b_it_fp4": "gemma/gemma-3-12b-it-fp4", - "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1", - "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", - "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", - "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", - "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/", - "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", - "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only", - "qwen2_7b_instruct": "Qwen2-7B-Instruct", - "qwen_14b_chat": "Qwen-14B-Chat", - "qwen3_0.6b": "Qwen3/Qwen3-0.6B", - "qwen3_4b_eagle3": "Qwen3/Qwen3-4B", - "qwen3_8b": "Qwen3/Qwen3-8B", - "qwen3_8b_fp8": "Qwen3/nvidia-Qwen3-8B-FP8", - "qwen3_8b_fp4": 
"Qwen3/nvidia-Qwen3-8B-NVFP4", - "qwen3_14b": "Qwen3/Qwen3-14B", - "qwen3_14b_fp8": "Qwen3/nvidia-Qwen3-14B-FP8", - "qwen3_14b_fp4": "Qwen3/nvidia-Qwen3-14B-NVFP4", - "qwen3_30b_a3b": "Qwen3/Qwen3-30B-A3B", - "qwen3_30b_a3b_fp4": "Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf", - "qwen3_32b": "Qwen3/Qwen3-32B", - "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4", - "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", - "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", - "qwen3_235b_a22b_fp4_eagle3": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", - "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct", - "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8", - "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4", - "starcoder2_3b": "starcoder2-3b", - "phi_4_mini_instruct": "Phi-4-mini-instruct", - "phi_4_reasoning_plus": "Phi-4-reasoning-plus", - "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8", - "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4", - "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", - "phi_4_multimodal_instruct_fp4": - "multimodals/Phi-4-multimodal-instruct-FP4", - "phi_4_multimodal_instruct_fp8": - "multimodals/Phi-4-multimodal-instruct-FP8", - "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", - "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", - "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b", - "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b", - "gpt_oss_120b_eagle3_throughput": "gpt_oss/gpt-oss-120b", - "nemotron_nano_3_30b_fp8": "Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev", - "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2", - "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4", - "nemotron_3_super_120b_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", - "nemotron_3_super_120b_nvfp4_mtp": - "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", - # Nemotron-3-Nano-Omni-30B (text + image multimodal) - "nemotron_3_nano_omni_nvfp4": 
- "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", - "nemotron_3_nano_omni_nvfp4_image": - "NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", - "kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4", - # MiniMax M2.5 (FP8 block-scale, ~230B MoE) - "minimax_m2.5_fp8": "MiniMax-M2.5", - # Qwen3.5 dense + MoE - "qwen3.5_9b": "Qwen3.5-9B", - "qwen3.5_27b": "Qwen3.5-27B", - "qwen3.5_35b_a3b_fp8": "Qwen3.5-35B-A3B-FP8", - "qwen3.5_122b_a10b": "Qwen3.5-122B-A10B", - "qwen3.5_397b_a17b_fp8": "Qwen3.5-397B-A17B-FP8", - "qwen3.5_397b_a17b_fp4": "Qwen3.5-397B-A17B-NVFP4", - # DeepSeek V3.2 (671B MoE) - "deepseek_v3.2_fp8": "DeepSeek-V3.2-hf", - "deepseek_v3.2_fp4": "DeepSeek-V3.2-NVFP4", - # GLM-5 FP8 (MoE) - "glm_5_fp8": "GLM-5-FP8", - # Kimi K2.5 NVFP4 (~1T MoE multimodal) - "kimi_k2.5_fp4": "Kimi-K2.5-NVFP4", -} -# Model PATH of HuggingFace -HF_MODEL_PATH = { - "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B", - "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8", - "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B", - "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B", - "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1", - "llama_v3.1_nemotron_nano_8b_fp8_hf": - "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8", - "llama_v3.3_nemotron_super_49b_hf": - "nvidia/Llama-3_3-Nemotron-Super-49B-v1", - "llama_v3.3_nemotron_super_49b_fp8_hf": - "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8", - "llama_v3.1_nemotron_ultra_253b_fp8_hf": - "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", - "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct", -} -LORA_MODEL_PATH = { - "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/", -} - TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "") NEMOTRON_SUPER_MODELS = { diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 4d9882259f46..40deafd9b15f 100644 --- 
a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -34,30 +34,13 @@ from tensorrt_llm._utils import get_free_port from ..conftest import get_llm_root, llm_models_root +from ._model_paths import MODEL_PATH_DICT as _MODEL_PATH_DICT_BASE from .perf_regression_utils import process_and_upload_test_results -# Model PATH of local dir synced from internal LLM models repo +# Sanity-side path differs from test_perf for this key; preserve historical value. MODEL_PATH_DICT = { - "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1", - "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", - "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", - "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", - "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/", - "deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2", - "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", - "k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4", - "k25_thinking_fp4": "Kimi-K2.5-NVFP4", - "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4 - "super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4 - "super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8", - "super_bf16": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", - "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8 - "qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8", + **_MODEL_PATH_DICT_BASE, "llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4", - "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", - "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", - 
"llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8", - "glm_5_nvfp4": "GLM-5-NVFP4", } SUPPORTED_GPU_MAPPING = { From 204b9cd3bcfc682b8c0dbf4234c9801f2cfabf0c Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Mon, 25 May 2026 07:00:50 +0000 Subject: [PATCH 3/6] test: remove dead bert_attention_plugin SM>=100 waivers These 7 waivers referenced perf tests (bart_large_cnn, bert_large, flan_t5_base/large/xl/xxl, mbart_large_50_many_to_one_mmt) that no longer appear in any test-db yaml on main. Drop them to keep the cleanup consistent with the 5 sibling waivers (roberta_base, t5_*) that were already removed in this PR. Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1fa2ca62809e..1440b04bf34c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -249,14 +249,7 @@ full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954) full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6185480) full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6185480) full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150) -full:B200/perf/test_perf.py::test_perf[bart_large_cnn] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[bert_large] SKIP (bert_attention_plugin does not support SM >= 100) full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugs/5150255) -full:B200/perf/test_perf.py::test_perf[flan_t5_base] SKIP (bert_attention_plugin does 
not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[flan_t5_large] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[flan_t5_xl] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[flan_t5_xxl] SKIP (bert_attention_plugin does not support SM >= 100) -full:B200/perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt] SKIP (bert_attention_plugin does not support SM >= 100) full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074) full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074) full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074) From 2109fc88f7ec05310b5db25077a115129c19bad9 Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Mon, 25 May 2026 07:49:57 +0000 Subject: [PATCH 4/6] test: remove 4 perf waivers per author confirmation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the 4 perf waivers that the PR originally added — author confirmed the underlying nvbugs (5150255 / 5304388 / 6130334) are no longer necessary to waive. 
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1440b04bf34c..a53606807422 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -249,7 +249,6 @@ full:A10/unittest/kv_cache_manager_v2_tests/ SKIP (https://nvbugs/5841954) full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-False] SKIP (https://nvbugs/6185480) full:A100/accuracy/test_llm_api_autodeploy.py::TestGLM4Flash::test_auto_dtype[trtllm-True] SKIP (https://nvbugs/6185480) full:B200/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-trtllm] SKIP (https://nvbugs/6185150) -full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugs/5150255) full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074) full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074) full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074) @@ -301,8 +300,6 @@ perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128, perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization) perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization) -perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:20000,2000-reqs:500-con:250] SKIP (https://nvbugs/5304388) 
-perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:5000,500-reqs:500-con:250] SKIP (https://nvbugs/5304388) perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP @@ -311,7 +308,6 @@ perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SK perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP -perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6130334) perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6215810) perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] SKIP (https://nvbugs/6167060) perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6190071) From 743679892a01134892339b9a4a0a2564939b7821 Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Tue, 26 May 2026 12:11:52 +0000 Subject: [PATCH 5/6] test: add new models to TRUST_REMOTE_CODE_MODELS in test_perf.py Included additional models "nemotron_nano_12b_v2", "phi_4_multimodal_instruct", "phi_4_multimodal_instruct_fp4", and "phi_4_multimodal_instruct_fp8" to the TRUST_REMOTE_CODE_MODELS dictionary to enhance testing coverage. 
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> --- tests/integration/defs/perf/test_perf.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index df1b01128efb..9550f1f502a0 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -64,6 +64,10 @@ "glm_5_fp8", "nemotron_3_nano_omni_nvfp4", "nemotron_3_nano_omni_nvfp4_image", + "nemotron_nano_12b_v2", + "phi_4_multimodal_instruct", + "phi_4_multimodal_instruct_fp4", + "phi_4_multimodal_instruct_fp8", } # Models that use random_image dataset in serve mode benchmarks. From e944483b7bee0dbd59bdf275f662732c4dec7b64 Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Fri, 29 May 2026 13:02:35 +0000 Subject: [PATCH 6/6] Enhance performance configurations for Nemotron-3-Super-120B-NVFP4 models in pytorch_model_config.py and update test_perf.py to include new spec-decoding models. Added configurations for streaming and throughput variants, ensuring better performance tuning. Adjusted test conditions in llm_perf_core.yml to reflect new model tests and conditions for GPU capabilities. 
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> --- .../defs/perf/pytorch_model_config.py | 65 +++++- tests/integration/defs/perf/test_perf.py | 16 +- .../test_lists/qa/llm_perf_core.yml | 204 ++++++------------ 3 files changed, 139 insertions(+), 146 deletions(-) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 7fc7d9a67c53..bb6514ddac34 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -458,9 +458,12 @@ def get_model_yaml_config(model_label: str, }, } }, - # Nemotron-3-Super-120B-NVFP4: (no MTP) + # Nemotron-3-Super-120B-NVFP4 (streaming/low-latency variant for spark perf) + # Streaming serve cases use small cuda_graph batch and no attention DP for latency. { - 'patterns': ['nemotron_3_super_120b_nvfp4-'], + 'patterns': [ + 'nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-', + ], 'config': { 'max_seq_len': 1048576, 'enable_chunked_prefill': True, @@ -481,9 +484,11 @@ def get_model_yaml_config(model_label: str, }, } }, - # Nemotron-3-Super-120B-NVFP4: MTP speculative decoding + # Nemotron-3-Super-120B-NVFP4_MTP (streaming/low-latency variant for spark perf) { - 'patterns': ['nemotron_3_super_120b_nvfp4_mtp'], + 'patterns': [ + 'nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-', + ], 'config': { 'max_seq_len': 1048576, 'enable_chunked_prefill': True, @@ -509,6 +514,58 @@ def get_model_yaml_config(model_label: str, }, } }, + # Nemotron-3-Super-120B-NVFP4 (throughput variant, aligned with curated yaml) + # Non-streaming cases use attention DP and larger cuda_graph batch for throughput. 
+ { + 'patterns': ['nemotron_3_super_120b_nvfp4-'], + 'config': { + 'max_seq_len': 1048576, + 'enable_chunked_prefill': True, + 'enable_attention_dp': True, + 'stream_interval': 1, + 'moe_config': { + 'backend': 'CUTLASS', + }, + 'cuda_graph_config': { + 'enable_padding': True, + 'max_batch_size': 256, + }, + 'kv_cache_config': { + 'enable_block_reuse': False, + 'mamba_ssm_cache_dtype': 'float16', + 'mamba_ssm_stochastic_rounding': True, + 'mamba_ssm_philox_rounds': 5, + }, + } + }, + # Nemotron-3-Super-120B-NVFP4_MTP (throughput variant with MTP spec decoding) + { + 'patterns': ['nemotron_3_super_120b_nvfp4_mtp'], + 'config': { + 'max_seq_len': 1048576, + 'enable_chunked_prefill': True, + 'enable_attention_dp': True, + 'stream_interval': 1, + 'moe_config': { + 'backend': 'CUTLASS', + }, + 'cuda_graph_config': { + 'enable_padding': True, + 'max_batch_size': 256, + }, + 'kv_cache_config': { + 'enable_block_reuse': False, + 'mamba_ssm_cache_dtype': 'float16', + 'mamba_ssm_stochastic_rounding': True, + 'mamba_ssm_philox_rounds': 5, + }, + 'speculative_config': { + 'decoding_type': 'MTP', + 'num_nextn_predict_layers': 3, + 'allow_advanced_sampling': True, + }, + } + }, ] # Apply pattern-based configurations on top of base config diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 9550f1f502a0..884cc2d61a60 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -88,6 +88,17 @@ "nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail", } +# All spec-decoding models (MTP, Eagle3, etc.). Used to skip --ignore-eos in +# benchmark client commands: forcing generation past EOS produces unstable +# acceptance rates for spec-dec. 
+SPEC_DEC_MODELS = { + "qwen3_4b_eagle3", + "qwen3_235b_a22b_fp4_eagle3", + "gpt_oss_120b_eagle3", + "gpt_oss_120b_eagle3_throughput", + *SPEC_DEC_REAL_DATASET_MODELS, +} + # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root) AUTODEPLOY_MODEL_CONFIGS = { "nemotron_nano_3_30b_fp8": "examples/auto_deploy/nano_v3.yaml", @@ -1393,12 +1404,15 @@ def get_trtllm_serve_client_command(self, model_dir, "--num-prompts", str(self._config.num_reqs), - "--ignore-eos", "--tokenize-on-client", "--no-test-input", "--percentile-metrics", "ttft,tpot,itl,e2el", ] + # --ignore-eos must be off for spec-decoding models: forcing generation + # past EOS produces unstable acceptance rates. + if self._config.model_name not in SPEC_DEC_MODELS: + client_cmd.append("--ignore-eos") if self._config.model_name in OPENAI_CHAT_BACKEND_MODELS: client_cmd += ["--backend", "openai-chat"] if real_dataset_path: diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml index 05d63f11d4e3..92f2d900bc9e 100644 --- a/tests/integration/test_lists/qa/llm_perf_core.yml +++ b/tests/integration/test_lists/qa/llm_perf_core.yml @@ -3,19 +3,16 @@ llm_perf_core: # =============================================================================== # Test Conditions Index # =============================================================================== -# 1: All GPUs common tests(L20, L40S, H100, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases) -# 2: L20, L40S, H100, H20, H200 -# 3: L40S, H100, H20, H200 -# 4: H100, H20, H200 test cases -# 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases -# 6: GB200, B200, B300, GB300, RTX6000-Server test cases -# 7: B200, GB200, B300, GB300 test cases -# 8: B200, B300 test cases -# 9: H100, H20, H200, B200, B300 test cases -# 10: H20, H200, B200, B300 test cases -# 11: RTX-6000D, RTX-6000 Server test cases -# 12: RTX6000-Server -# 13: A100 
(cc 8.0, BF16 dense only, A100-80G) +# 1: All GPUs common tests(L40S, H100, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases) +# 2: L40S, H100, H20, H200 +# 3: H100, H20, H200 test cases +# 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases +# 5: GB200, B200, B300, GB300, RTX6000-Server, RTX6000-D test cases +# 6: B200, GB200, B300, GB300 test cases +# 7: B200, B300 test cases +# 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases +# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases +# 10: RTX-6000D, RTX-6000 Server test cases # =============================================================================== @@ -30,15 +27,22 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput -# 2: L20, L40S, H100, H20, H200 +# 2: L40S, H100, H20, H200 - condition: ranges: system_gpu_count: gte: 2 compute_capability: - lt: 10.0 + lte: 9.0 tests: #llama_v3.1_8b #pytorch backend @@ -55,18 +59,23 @@ llm_perf_core: 
#nemotron_nano_12b_v2 - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-con:1] #min_latency - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput - - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128] - - -# 3: L40S, H100, H20, H200 -- condition: - ranges: - system_gpu_count: - gte: 4 - compute_capability: - gt: 8.0 - lte: 9.0 - tests: + - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128] + #qwen3.5_27b (dense BF16 52G, 2-GPU) + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput + #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU) + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] + -
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,32-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] @@ -88,11 +97,10 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:4-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:4-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:4-gpus:4] #min_latency - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:4-gpus:4] #max_throughput -# 4: H100, H20, H200 test cases +# 3: H100, H20, H200 test cases - condition: ranges: system_gpu_count: @@ -112,17 +120,9 @@ llm_perf_core: - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:64] - 
perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:128,128] - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:256] - #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU) - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput -# 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases +# 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases - condition: ranges: system_gpu_count: @@ -136,14 +136,6 @@ llm_perf_core: - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-streaming-bfloat16-input_output_len:500,2000-con:250] #max_throughput streaming - #qwen3.5_9b (dense BF16 19G, 1-GPU) - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128] - - 
perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput #qwen3.5_27b (dense BF16 52G, 1-GPU) - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128] - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000] @@ -168,9 +160,17 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput + #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU) + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4] + - 
perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput -# 6: GB200, B200, B300, GB300, RTX6000-Server test cases +# 5: GB200, B200, B300, GB300, RTX6000-Server, RTX6000-D test cases - condition: ranges: system_gpu_count: @@ -195,17 +195,9 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:1000-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4] - #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU, covers GB200/GB300) - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput -# 7: B200, GB200, B300, GB300 test cases +# 6: B200, GB200, B300, GB300 test 
cases - condition: ranges: system_gpu_count: @@ -229,7 +221,7 @@ llm_perf_core: - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:4-gpus:4] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:4-gpus:4] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,1000-tp:4-gpus:4] - - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:4-gpus:4] +# 7: B200, B300 test cases - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:4-gpus:4] #min_latency - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:256-tp:4-gpus:4] #max_throughput #deepseek_v3.2_fp4 (FP4 389G, 4-GPU) @@ -256,9 +248,13 @@ llm_perf_core: - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:4-tp:4-gpus:4] - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:4-tp:4-gpus:4] #max_throughput + #nemotron_3_super_120b_nvfp4 (Hybrid MoE+SSM+Attn FP4 76G, 4-GPU ep=4 tp=4, throughput config) + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:5-con:1-ep:4-tp:4-gpus:4] #min_latency + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:160-con:32-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:640-con:128-ep:4-tp:4-gpus:4] #max_throughput -# 8: B200, B300 
test cases +# 7: B200, B300 test cases - condition: ranges: system_gpu_count: @@ -275,7 +271,7 @@ llm_perf_core: # gpt_oss_120b_fp4 - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120) +# 8: H100, H20, H200, B200, B300 test cases - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180) - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8] @@ -309,19 +305,19 @@ llm_perf_core: - perf/test_perf.py::test_perf[deepseek_v3.2_fp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,1000-con:3072-ep:8-tp:8-gpus:8] TIMEOUT(120) #max_throughput -# 9: H100, H20, H200, B200, B300 test cases +# 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases - condition: ranges: system_gpu_count: gte: 8 compute_capability: gte: 9.0 - lt: 12.0 + lte: 12.0 tests: #llama_v3.3_70b_instruct_fp8 #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120) - #minimax_m2.5 (FP8 216G, 8-GPU) +# 9: H20, H200, B200, B300 test cases - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8] - 
perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:8-gpus:8] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:8-gpus:8] @@ -354,15 +350,14 @@ llm_perf_core: - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - -# 10: H20, H200, B200, B300 test cases +# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases - condition: ranges: system_gpu_count: gte: 8 compute_capability: gte: 9.0 - lt: 12.0 + lte: 12.0 gpu_memory: gt: 90000 tests: @@ -387,12 +382,11 @@ llm_perf_core: - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - -# 11: RTX-6000D, RTX-6000 Server test cases +# 10: RTX-6000D, RTX-6000 Server test cases - condition: ranges: system_gpu_count: - gte: 2 + gte: 4 compute_capability: gte: 12.0 lte: 12.0 @@ -411,82 +405,10 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:500,2000-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:1000,1000-tp:2-gpus:2] - #llama_v3.3_nemotron_super_49b_fp8 (nemotron-nas FP8 49G, 2-GPU) - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:2-gpus:2] - - 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput - - -# 12: RTX6000-Server test cases -- condition: - ranges: - system_gpu_count: - gte: 8 - compute_capability: - gte: 12.0 - lte: 12.0 - tests: - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:8-tp:8-gpus:8] # deepseek_r1_0528 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-maxnt:5220-input_output_len:4000,2000-reqs:512-ep:8-tp:8-gpus:8] - #qwen3.5_397b_a17b_fp8 (MoE FP8 380G, 8-GPU ep=8) - - 
perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,1000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - #qwen3.5_397b_a17b_fp4 (MoE FP4 234G, 8-GPU ep=8) - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:500,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:2000,500-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,1000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - - -# 13: A100 test cases (cc 8.0, BF16 dense only, A100-80G) -- condition: - ranges: - system_gpu_count: - gte: 2 - 
compute_capability: - gte: 8.0 - lt: 9.0 - gpu_memory: - gt: 40000 - tests: - #qwen3.5_9b (dense BF16 19G, 1-GPU) - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput - #qwen3.5_27b (dense BF16 52G, 2-GPU for A100-80G safety) - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput - #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU) - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] - - 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-maxnt:5220-input_output_len:4000,2000-reqs:512-ep:8-tp:8-gpus:8] #max_throughput