NVIDIA · yufeiwu-nv · May 19, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
@@ -458,9 +458,12 @@ def get_model_yaml_config(model_label: str,
                 },
             }
         },
-        # Nemotron-3-Super-120B-NVFP4: (no MTP)
+        # Nemotron-3-Super-120B-NVFP4 (streaming/low-latency variant for spark perf)
+        # Streaming serve cases use a small cuda_graph batch and no attention DP to keep latency low.
         {
-            'patterns': ['nemotron_3_super_120b_nvfp4-'],
+            'patterns': [
+                'nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-',
+            ],
             'config': {
                 'max_seq_len': 1048576,
                 'enable_chunked_prefill': True,
@@ -481,9 +484,11 @@ def get_model_yaml_config(model_label: str,
                 },
             }
         },
-        # Nemotron-3-Super-120B-NVFP4: MTP speculative decoding
+        # Nemotron-3-Super-120B-NVFP4_MTP (streaming/low-latency variant for spark perf)
         {
-            'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
+            'patterns': [
+                'nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-',
+            ],
             'config': {
                 'max_seq_len': 1048576,
                 'enable_chunked_prefill': True,
@@ -509,6 +514,58 @@ def get_model_yaml_config(model_label: str,
                 },
             }
         },
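+        # Nemotron-3-Super-120B-NVFP4 (throughput variant, aligned with curated yaml)
+        # Non-streaming cases use attention DP and a larger cuda_graph batch for throughput.
+        # NOTE(review): this pattern is a prefix of the streaming pattern above, so streaming
+        # labels may match both entries -- confirm which config takes precedence.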
+        {
+            'patterns': ['nemotron_3_super_120b_nvfp4-'],
+            'config': {
+                'max_seq_len': 1048576,
+                'enable_chunked_prefill': True,
+                'enable_attention_dp': True,
+                'stream_interval': 1,
+                'moe_config': {
+                    'backend': 'CUTLASS',
+                },
+                'cuda_graph_config': {
+                    'enable_padding': True,
+                    'max_batch_size': 256,
+                },
+                'kv_cache_config': {
+                    'enable_block_reuse': False,
+                    'mamba_ssm_cache_dtype': 'float16',
+                    'mamba_ssm_stochastic_rounding': True,
+                    'mamba_ssm_philox_rounds': 5,
+                },
+            }
+        },
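+        # Nemotron-3-Super-120B-NVFP4_MTP (throughput variant with MTP spec decoding)
+        # NOTE(review): this pattern is a prefix of the MTP streaming pattern above --
+        # confirm which config takes precedence for streaming labels.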
+        {
+            'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
+            'config': {
+                'max_seq_len': 1048576,
+                'enable_chunked_prefill': True,
+                'enable_attention_dp': True,
+                'stream_interval': 1,
+                'moe_config': {
+                    'backend': 'CUTLASS',
+                },
+                'cuda_graph_config': {
+                    'enable_padding': True,
+                    'max_batch_size': 256,
+                },
+                'kv_cache_config': {
+                    'enable_block_reuse': False,
+                    'mamba_ssm_cache_dtype': 'float16',
+                    'mamba_ssm_stochastic_rounding': True,
+                    'mamba_ssm_philox_rounds': 5,
+                },
+                'speculative_config': {
+                    'decoding_type': 'MTP',
+                    'num_nextn_predict_layers': 3,
+                    'allow_advanced_sampling': True,
+                },
+            }
+        },
     ]
 
     # Apply pattern-based configurations on top of base config

@@ -88,6 +88,17 @@
     "nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail",
 }
 
+# All spec-decoding models (MTP, Eagle3, etc.). The serve benchmark client
+# omits --ignore-eos for any model named here: forcing generation past EOS
+# produces unstable acceptance rates for spec-dec.
+SPEC_DEC_MODELS = {
+    "qwen3_4b_eagle3",
+    "qwen3_235b_a22b_fp4_eagle3",
+    "gpt_oss_120b_eagle3",
+    "gpt_oss_120b_eagle3_throughput",
+    # NOTE(review): if SPEC_DEC_REAL_DATASET_MODELS is a dict, `*` adds its keys -- confirm.
+    *SPEC_DEC_REAL_DATASET_MODELS,
+}
+
 # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)
 AUTODEPLOY_MODEL_CONFIGS = {
     "nemotron_nano_3_30b_fp8": "examples/auto_deploy/nano_v3.yaml",
@@ -1393,12 +1404,15 @@ def get_trtllm_serve_client_command(self,
             model_dir,
             "--num-prompts",
             str(self._config.num_reqs),
-            "--ignore-eos",
             "--tokenize-on-client",
             "--no-test-input",
             "--percentile-metrics",
             "ttft,tpot,itl,e2el",
         ]
+        # --ignore-eos must be off for spec-decoding models: forcing generation
+        # past EOS produces unstable acceptance rates.
+        if self._config.model_name not in SPEC_DEC_MODELS:
+            client_cmd.append("--ignore-eos")
         if self._config.model_name in OPENAI_CHAT_BACKEND_MODELS:
             client_cmd += ["--backend", "openai-chat"]
         if real_dataset_path: