Skip to content
Open
65 changes: 61 additions & 4 deletions tests/integration/defs/perf/pytorch_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,9 +458,12 @@ def get_model_yaml_config(model_label: str,
},
}
},
# Nemotron-3-Super-120B-NVFP4: (no MTP)
# Nemotron-3-Super-120B-NVFP4 (streaming/low-latency variant for spark perf)
# Streaming serve cases use small cuda_graph batch and no attention DP for latency.
{
'patterns': ['nemotron_3_super_120b_nvfp4-'],
'patterns': [
'nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-',
],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
Expand All @@ -481,9 +484,11 @@ def get_model_yaml_config(model_label: str,
},
}
},
# Nemotron-3-Super-120B-NVFP4: MTP speculative decoding
# Nemotron-3-Super-120B-NVFP4_MTP (streaming/low-latency variant for spark perf)
{
'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
'patterns': [
'nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-',
],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
Expand All @@ -509,6 +514,58 @@ def get_model_yaml_config(model_label: str,
},
}
},
# Nemotron-3-Super-120B-NVFP4 (throughput variant, aligned with curated yaml)
# Non-streaming cases use attention DP and larger cuda_graph batch for throughput.
{
'patterns': ['nemotron_3_super_120b_nvfp4-'],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
'enable_attention_dp': True,
'stream_interval': 1,
'moe_config': {
'backend': 'CUTLASS',
},
'cuda_graph_config': {
'enable_padding': True,
'max_batch_size': 256,
},
'kv_cache_config': {
'enable_block_reuse': False,
'mamba_ssm_cache_dtype': 'float16',
'mamba_ssm_stochastic_rounding': True,
'mamba_ssm_philox_rounds': 5,
},
}
},
# Nemotron-3-Super-120B-NVFP4_MTP (throughput variant with MTP spec decoding)
{
'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
'enable_attention_dp': True,
'stream_interval': 1,
'moe_config': {
'backend': 'CUTLASS',
},
'cuda_graph_config': {
'enable_padding': True,
'max_batch_size': 256,
},
'kv_cache_config': {
'enable_block_reuse': False,
'mamba_ssm_cache_dtype': 'float16',
'mamba_ssm_stochastic_rounding': True,
'mamba_ssm_philox_rounds': 5,
},
'speculative_config': {
'decoding_type': 'MTP',
'num_nextn_predict_layers': 3,
'allow_advanced_sampling': True,
},
}
},
Comment on lines +517 to +568
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Throughput patterns currently override streaming variants unintentionally.

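'nemotron_3_super_120b_nvfp4-' and 'nemotron_3_super_120b_nvfp4_mtp' are prefixes of the streaming patterns ('nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-' and 'nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-'), so they also match the streaming serve labels. Because these throughput entries come later in the list, they overwrite the streaming low-latency config (e.g., enable_attention_dp and max_batch_size).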

Suggested fix (make throughput patterns non-overlapping)
-            'patterns': ['nemotron_3_super_120b_nvfp4-'],
+            'patterns': [
+                'nemotron_3_super_120b_nvfp4-bench-pytorch-',
+                'nemotron_3_super_120b_nvfp4-serve-pytorch-float',
+            ],
@@
-            'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
+            'patterns': [
+                'nemotron_3_super_120b_nvfp4_mtp-bench-pytorch-',
+                'nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-float',
+            ],
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/integration/defs/perf/pytorch_model_config.py` around lines 517 - 568,
The pattern strings of the throughput entries ('nemotron_3_super_120b_nvfp4-' and
'nemotron_3_super_120b_nvfp4_mtp') are too broad: they also match the
streaming/low-latency labels, so the throughput 'config' (e.g., enable_attention_dp,
cuda_graph_config.max_batch_size) overrides the streaming variants. Fix this by
making the patterns non-overlapping (for example, give them a distinct suffix
such as 'nemotron_3_super_120b_nvfp4_throughput' and
'nemotron_3_super_120b_nvfp4_mtp_throughput', or use more specific anchors) so
that the throughput entries in the 'patterns' lists no longer match streaming
serve labels and cannot overwrite the streaming configs.

]

# Apply pattern-based configurations on top of base config
Expand Down
16 changes: 15 additions & 1 deletion tests/integration/defs/perf/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,17 @@
"nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail",
}

# All spec-decoding models (MTP, Eagle3, etc.). Used to skip --ignore-eos in
# benchmark client commands: forcing generation past EOS produces unstable
# acceptance rates for spec-dec.
# Membership is tested against the configured model name when the trtllm-serve
# client command is built; models listed here do not get --ignore-eos appended.
SPEC_DEC_MODELS = {
"qwen3_4b_eagle3",
"qwen3_235b_a22b_fp4_eagle3",
"gpt_oss_120b_eagle3",
"gpt_oss_120b_eagle3_throughput",
# Unpacking adds every element produced by iterating
# SPEC_DEC_REAL_DATASET_MODELS (its keys, if it is a dict).
# NOTE(review): that name is defined outside this view -- confirm its type.
*SPEC_DEC_REAL_DATASET_MODELS,
}

# Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)
AUTODEPLOY_MODEL_CONFIGS = {
"nemotron_nano_3_30b_fp8": "examples/auto_deploy/nano_v3.yaml",
Expand Down Expand Up @@ -1393,12 +1404,15 @@ def get_trtllm_serve_client_command(self,
model_dir,
"--num-prompts",
str(self._config.num_reqs),
"--ignore-eos",
"--tokenize-on-client",
"--no-test-input",
"--percentile-metrics",
"ttft,tpot,itl,e2el",
]
# --ignore-eos must be off for spec-decoding models: forcing generation
# past EOS produces unstable acceptance rates.
if self._config.model_name not in SPEC_DEC_MODELS:
client_cmd.append("--ignore-eos")
if self._config.model_name in OPENAI_CHAT_BACKEND_MODELS:
client_cmd += ["--backend", "openai-chat"]
if real_dataset_path:
Expand Down
Loading
Loading