diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 7fc7d9a67c5..bb6514ddac3 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -458,9 +458,12 @@ def get_model_yaml_config(model_label: str, }, } }, - # Nemotron-3-Super-120B-NVFP4: (no MTP) + # Nemotron-3-Super-120B-NVFP4 (streaming/low-latency variant for spark perf) + # Streaming serve cases use small cuda_graph batch and no attention DP for latency. { - 'patterns': ['nemotron_3_super_120b_nvfp4-'], + 'patterns': [ + 'nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-', + ], 'config': { 'max_seq_len': 1048576, 'enable_chunked_prefill': True, @@ -481,9 +484,11 @@ def get_model_yaml_config(model_label: str, }, } }, - # Nemotron-3-Super-120B-NVFP4: MTP speculative decoding + # Nemotron-3-Super-120B-NVFP4_MTP (streaming/low-latency variant for spark perf) { - 'patterns': ['nemotron_3_super_120b_nvfp4_mtp'], + 'patterns': [ + 'nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-', + ], 'config': { 'max_seq_len': 1048576, 'enable_chunked_prefill': True, @@ -509,6 +514,58 @@ def get_model_yaml_config(model_label: str, }, } }, + # Nemotron-3-Super-120B-NVFP4 (throughput variant, aligned with curated yaml) + # Non-streaming cases use attention DP and larger cuda_graph batch for throughput. 
+ { + 'patterns': ['nemotron_3_super_120b_nvfp4-'], + 'config': { + 'max_seq_len': 1048576, + 'enable_chunked_prefill': True, + 'enable_attention_dp': True, + 'stream_interval': 1, + 'moe_config': { + 'backend': 'CUTLASS', + }, + 'cuda_graph_config': { + 'enable_padding': True, + 'max_batch_size': 256, + }, + 'kv_cache_config': { + 'enable_block_reuse': False, + 'mamba_ssm_cache_dtype': 'float16', + 'mamba_ssm_stochastic_rounding': True, + 'mamba_ssm_philox_rounds': 5, + }, + } + }, + # Nemotron-3-Super-120B-NVFP4_MTP (throughput variant with MTP spec decoding) + { + 'patterns': ['nemotron_3_super_120b_nvfp4_mtp'], + 'config': { + 'max_seq_len': 1048576, + 'enable_chunked_prefill': True, + 'enable_attention_dp': True, + 'stream_interval': 1, + 'moe_config': { + 'backend': 'CUTLASS', + }, + 'cuda_graph_config': { + 'enable_padding': True, + 'max_batch_size': 256, + }, + 'kv_cache_config': { + 'enable_block_reuse': False, + 'mamba_ssm_cache_dtype': 'float16', + 'mamba_ssm_stochastic_rounding': True, + 'mamba_ssm_philox_rounds': 5, + }, + 'speculative_config': { + 'decoding_type': 'MTP', + 'num_nextn_predict_layers': 3, + 'allow_advanced_sampling': True, + }, + } + }, ] # Apply pattern-based configurations on top of base config diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 9550f1f502a..884cc2d61a6 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -88,6 +88,17 @@ "nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail", } +# All spec-decoding models (MTP, Eagle3, etc.). Used to skip --ignore-eos in +# benchmark client commands: forcing generation past EOS produces unstable +# acceptance rates for spec-dec. 
+SPEC_DEC_MODELS = { + "qwen3_4b_eagle3", + "qwen3_235b_a22b_fp4_eagle3", + "gpt_oss_120b_eagle3", + "gpt_oss_120b_eagle3_throughput", + *SPEC_DEC_REAL_DATASET_MODELS, +} + # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root) AUTODEPLOY_MODEL_CONFIGS = { "nemotron_nano_3_30b_fp8": "examples/auto_deploy/nano_v3.yaml", @@ -1393,12 +1404,15 @@ def get_trtllm_serve_client_command(self, model_dir, "--num-prompts", str(self._config.num_reqs), - "--ignore-eos", "--tokenize-on-client", "--no-test-input", "--percentile-metrics", "ttft,tpot,itl,e2el", ] + # --ignore-eos must be off for spec-decoding models: forcing generation + # past EOS produces unstable acceptance rates. + if self._config.model_name not in SPEC_DEC_MODELS: + client_cmd.append("--ignore-eos") if self._config.model_name in OPENAI_CHAT_BACKEND_MODELS: client_cmd += ["--backend", "openai-chat"] if real_dataset_path: diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml index 05d63f11d4e..92f2d900bc9 100644 --- a/tests/integration/test_lists/qa/llm_perf_core.yml +++ b/tests/integration/test_lists/qa/llm_perf_core.yml @@ -3,19 +3,16 @@ llm_perf_core: # =============================================================================== # Test Conditions Index # =============================================================================== -# 1: All GPUs common tests(L20, L40S, H100, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases) -# 2: L20, L40S, H100, H20, H200 -# 3: L40S, H100, H20, H200 -# 4: H100, H20, H200 test cases -# 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases -# 6: GB200, B200, B300, GB300, RTX6000-Server test cases -# 7: B200, GB200, B300, GB300 test cases -# 8: B200, B300 test cases -# 9: H100, H20, H200, B200, B300 test cases -# 10: H20, H200, B200, B300 test cases -# 11: RTX-6000D, RTX-6000 Server test cases -# 12: RTX6000-Server -# 13: A100 
(cc 8.0, BF16 dense only, A100-80G) +# 1: All GPUs common tests(L40S, H100, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases) +# 2: L40S, H100, H20, H200 +# 3: H100, H20, H200 test cases +# 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases +# 5: GB200, B200, B300, GB300, RTX6000-Server, RTX6000-D test cases +# 6: B200, GB200, B300, GB300 test cases +# 7: B200, B300 test cases +# 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases +# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases +# 10: RTX-6000D, RTX-6000 Server test cases # =============================================================================== @@ -30,15 +27,22 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000] + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency + - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput -# 2: L20, L40S, H100, H20, H200 +# 2: L40S, H100, H20, H200 - condition: ranges: system_gpu_count: gte: 2 compute_capability: - lt: 10.0 + lte: 9.0 tests: #llama_v3.1_8b #pytorch backend @@ -55,18 +59,23 @@ llm_perf_core: 
#nemotron_nano_12b_v2 - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-con:1] #min_latency - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput - - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128] - - -# 3: L40S, H100, H20, H200 -- condition: - ranges: - system_gpu_count: - gte: 4 - compute_capability: - gt: 8.0 - lte: 9.0 - tests: + - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128] + #qwen3.5_27b (dense BF16 52G, 2-GPU) + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency + - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput + #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU) + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] + - 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,32-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] @@ -88,11 +97,10 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:4-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:4-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:4-gpus:4] #min_latency - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:4-gpus:4] #max_throughput -# 4: H100, H20, H200 test cases +# 3: H100, H20, H200 test cases - condition: ranges: system_gpu_count: @@ -112,17 +120,9 @@ llm_perf_core: - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:64] - 
perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:128,128] - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:256] - #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU) - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput -# 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases +# 4: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases - condition: ranges: system_gpu_count: @@ -136,14 +136,6 @@ llm_perf_core: - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-streaming-bfloat16-input_output_len:500,2000-con:250] #max_throughput streaming - #qwen3.5_9b (dense BF16 19G, 1-GPU) - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128] - - 
perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput #qwen3.5_27b (dense BF16 52G, 1-GPU) - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128] - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000] @@ -168,9 +160,17 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput + #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU) + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4] + - 
perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency + - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput -# 6: GB200, B200, B300, GB300, RTX6000-Server test cases +# 5: GB200, B200, B300, GB300, RTX6000-Server, RTX6000-D test cases - condition: ranges: system_gpu_count: @@ -195,17 +195,9 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:1000-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4] - #qwen3.5_122b_a10b (MoE BF16 234G, 4-GPU, covers GB200/GB300) - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:128,128-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:500,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:2000,500-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,1000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-input_output_len:1000,2000-ep:4-tp:4-gpus:4] - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_122b_a10b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-con:256-ep:4-tp:4-gpus:4] #max_throughput -# 7: B200, GB200, B300, GB300 test cases +# 6: B200, GB200, B300, GB300 test 
cases - condition: ranges: system_gpu_count: @@ -229,7 +221,7 @@ llm_perf_core: - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:4-gpus:4] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:4-gpus:4] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,1000-tp:4-gpus:4] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:4-gpus:4] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:4-gpus:4] #min_latency - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:256-tp:4-gpus:4] #max_throughput #deepseek_v3.2_fp4 (FP4 389G, 4-GPU) @@ -256,9 +248,13 @@ llm_perf_core: - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:4-tp:4-gpus:4] - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:4-tp:4-gpus:4] #min_latency - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:4-tp:4-gpus:4] #max_throughput + #nemotron_3_super_120b_nvfp4 (Hybrid MoE+SSM+Attn FP4 76G, 4-GPU ep=4 tp=4, throughput config) + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:5-con:1-ep:4-tp:4-gpus:4] #min_latency + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:160-con:32-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.8-input_output_len:1024,1024-reqs:640-con:128-ep:4-tp:4-gpus:4] #max_throughput -# 8: B200, B300 
test cases +# 7: B200, B300 test cases - condition: ranges: system_gpu_count: @@ -275,7 +271,7 @@ llm_perf_core: # gpt_oss_120b_fp4 - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120) - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180) - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8] @@ -309,19 +305,19 @@ llm_perf_core: - perf/test_perf.py::test_perf[deepseek_v3.2_fp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,1000-con:3072-ep:8-tp:8-gpus:8] TIMEOUT(120) #max_throughput -# 9: H100, H20, H200, B200, B300 test cases +# 8: H100, H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases - condition: ranges: system_gpu_count: gte: 8 compute_capability: gte: 9.0 - lt: 12.0 + lte: 12.0 tests: #llama_v3.3_70b_instruct_fp8 #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120) #minimax_m2.5 (FP8 216G, 8-GPU) - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8] - 
perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:8-gpus:8] - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:8-gpus:8] @@ -354,15 +350,14 @@ llm_perf_core: - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - -# 10: H20, H200, B200, B300 test cases +# 9: H20, H200, B200, B300, RTX6000D, RTX6000-Server test cases - condition: ranges: system_gpu_count: gte: 8 compute_capability: gte: 9.0 - lt: 12.0 + lte: 12.0 gpu_memory: gt: 90000 tests: @@ -387,12 +382,11 @@ llm_perf_core: - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - -# 11: RTX-6000D, RTX-6000 Server test cases +# 10: RTX-6000D, RTX-6000 Server test cases - condition: ranges: system_gpu_count: - gte: 2 + gte: 4 compute_capability: gte: 12.0 lte: 12.0 @@ -411,82 +405,10 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:500,2000-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:1000,1000-tp:2-gpus:2] - #llama_v3.3_nemotron_super_49b_fp8 (nemotron-nas FP8 49G, 2-GPU) - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:2-gpus:2] - - 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput - - -# 12: RTX6000-Server test cases -- condition: - ranges: - system_gpu_count: - gte: 8 - compute_capability: - gte: 12.0 - lte: 12.0 - tests: - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:8-tp:8-gpus:8] # deepseek_r1_0528 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-maxnt:5220-input_output_len:4000,2000-reqs:512-ep:8-tp:8-gpus:8] - #qwen3.5_397b_a17b_fp8 (MoE FP8 380G, 8-GPU ep=8) - - 
perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,1000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - #qwen3.5_397b_a17b_fp4 (MoE FP4 234G, 8-GPU ep=8) - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:500,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:2000,500-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,1000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-input_output_len:1000,2000-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-tp:8-gpus:8] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_397b_a17b_fp4-bench-pytorch-float4-maxbs:512-input_output_len:1000,1000-con:512-ep:8-tp:8-gpus:8] #max_throughput - - -# 13: A100 test cases (cc 8.0, BF16 dense only, A100-80G) -- condition: - ranges: - system_gpu_count: - gte: 2 - 
compute_capability: - gte: 8.0 - lt: 9.0 - gpu_memory: - gt: 40000 - tests: - #qwen3.5_9b (dense BF16 19G, 1-GPU) - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:500,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:2000,500] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,2000] - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_9b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput - #qwen3.5_27b (dense BF16 52G, 2-GPU for A100-80G safety) - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - - perf/test_perf.py::test_perf[qwen3.5_27b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput - #llama_v3.3_nemotron_super_49b (nemotron-nas BF16 94G, 2-GPU) - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] - - 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:2000,500-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,2000-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:2-gpus:2] #min_latency - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250-tp:2-gpus:2] #max_throughput + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-maxnt:5220-input_output_len:4000,2000-reqs:512-ep:8-tp:8-gpus:8] #max_throughput