diff --git a/leaderboard/generate.py b/leaderboard/generate.py index ead41d62..49d5838b 100644 --- a/leaderboard/generate.py +++ b/leaderboard/generate.py @@ -19,7 +19,7 @@ _pricing_cache: dict = {} _pricing_path = Path("schema/cloud_pricing.json") if _pricing_path.exists(): - with open(_pricing_path) as _f: + with open(_pricing_path, encoding='utf-8') as _f: _pricing_cache = json.load(_f) RESULTS_DIR = Path("results") @@ -45,26 +45,14 @@ def _get_suite_precision_required(suite_id: str) -> str: """Read precision_required from suite.json. Returns 'BF16' if not found.""" path = Path("suites") / suite_id / "suite.json" try: - with open(path) as f: + with open(path, encoding='utf-8') as f: return json.load(f).get("precision_required", "BF16") except Exception: return "BF16" def _collect_suite_specs() -> dict: - """Collect UI-relevant per-suite spec from suites/suite_*/suite.json. - - Baked into the generated leaderboard.js as ``window.SUITE_SPECS`` so - the static leaderboard UI auto-syncs whenever a maintainer edits a - suite contract — model id, dataset, prompt distribution, scenarios - default/extra split, online SLA, etc. Editorial UI content (titles, - taglines, descriptions) stays in assets/js/data.js since it isn't a - property of the suite contract. - - Returns a ``{ suite_id: spec }`` mapping with only the fields the UI - consumes. Missing fields are omitted (the JS-side merge keeps the - hardcoded fallback when a key is absent). 
- """ + """Collect UI-relevant per-suite spec from suites/suite_*/suite.json.""" out: dict = {} suites_dir = Path("suites") if not suites_dir.exists(): @@ -76,7 +64,7 @@ def _collect_suite_specs() -> dict: if not sf.exists(): continue try: - with open(sf) as f: + with open(sf, encoding='utf-8') as f: data = json.load(f) except Exception: continue @@ -84,7 +72,6 @@ def _collect_suite_specs() -> dict: rd = data.get("request_distribution") or {} scn = data.get("scenarios") or {} spec: dict = {} - # Fields the UI displays in suite cards / specs / compare headers. for k in ( "model_id", "model_revision", @@ -110,6 +97,95 @@ def _collect_suite_specs() -> dict: return out +# ── Scenario metric extraction ──────────────────────────────────────────────── + +def _extract_scenario_metric(result: dict, scenario_name: str) -> dict: + """Extract the best-throughput info for a single scenario from a result. + + Returns a dict with keys: + throughput, metric_label, concurrency, peak_memory_gb, is_valid + """ + metrics = result.get("metrics") or {} + out = { + "throughput": None, + "metric_label": "", + "concurrency": None, + "peak_memory_gb": None, + "is_valid": False, + } + + if scenario_name == "offline": + offline = metrics.get("offline") + if offline: + rows = offline.get("results_by_concurrency") or offline.get("results_by_batch_size") or [] + valid = [r for r in rows if not r.get("oom") and r.get("throughput_tokens_per_sec")] + if valid: + best = max(valid, key=lambda r: r["throughput_tokens_per_sec"]) + out["throughput"] = best["throughput_tokens_per_sec"] + out["metric_label"] = "tokens/sec" + out["concurrency"] = best.get("client_concurrency") or best.get("concurrency") + out["peak_memory_gb"] = best.get("peak_memory_gb") + out["is_valid"] = True + + elif scenario_name == "online": + online = metrics.get("online") + if online: + qps = online.get("max_valid_qps") + if qps is not None: + out["throughput"] = qps + out["metric_label"] = "max valid QPS" + out["is_valid"] = 
True + + elif scenario_name == "interactive": + # interactive uses the same inference path as offline — reuse offline metric + offline = metrics.get("offline") + if offline: + rows = offline.get("results_by_concurrency") or offline.get("results_by_batch_size") or [] + valid = [r for r in rows if not r.get("oom") and r.get("throughput_tokens_per_sec")] + if valid: + best = max(valid, key=lambda r: r["throughput_tokens_per_sec"]) + out["throughput"] = best["throughput_tokens_per_sec"] + out["metric_label"] = "tokens/sec" + out["concurrency"] = best.get("client_concurrency") or best.get("concurrency") + out["peak_memory_gb"] = best.get("peak_memory_gb") + out["is_valid"] = True + + elif scenario_name == "sustained": + sustained = metrics.get("sustained") + if sustained: + thr = sustained.get("sustained_throughput_tokens_per_sec") + if thr is not None: + out["throughput"] = thr + out["metric_label"] = "tok/s (sustained mean)" + out["concurrency"] = sustained.get("sustained_concurrency") + out["is_valid"] = True + + elif scenario_name == "speculative": + speculative = metrics.get("speculative") + if speculative: + rows = speculative.get("results_by_concurrency") or speculative.get("results_by_batch_size") or [] + valid = [r for r in rows if not r.get("oom") and r.get("throughput_tokens_per_sec")] + if valid: + best = max(valid, key=lambda r: r["throughput_tokens_per_sec"]) + out["throughput"] = best["throughput_tokens_per_sec"] + out["metric_label"] = "tok/s (speculative)" + out["concurrency"] = best.get("client_concurrency") or best.get("concurrency") + out["peak_memory_gb"] = best.get("peak_memory_gb") + out["is_valid"] = True + + elif scenario_name == "burst": + burst = metrics.get("burst") + if burst: + ratio = burst.get("burst_degradation_ratio") + if ratio is not None: + # Invert: higher = better, same polarity as throughput + out["throughput"] = round(1.0 - ratio, 4) if ratio <= 1.0 else 0.0 + out["metric_label"] = "1 − degradation_ratio" + out["is_valid"] = True 
+ + return out + + # ── Data loading ────────────────────────────────────────────────────────────── def load_results() -> list[dict]: @@ -125,7 +201,7 @@ def load_results() -> list[dict]: if not result_path.exists(): continue try: - with open(result_path) as f: + with open(result_path, encoding='utf-8') as f: data = json.load(f) data["_tier"] = tier data["_submission_name"] = submission_dir.name @@ -133,11 +209,10 @@ def load_results() -> list[dict]: "scenarios_run" in data.get("task", {}) or "chip_counts_run" in data.get("task", {}) ) - # Load env_info.json alongside result.json (optional, best-effort) env_path = submission_dir / "env_info.json" if env_path.exists(): try: - with open(env_path) as ef: + with open(env_path, encoding='utf-8') as ef: data["_env_info"] = json.load(ef) except Exception as ee: print(f"Warning: could not load {env_path}: {ee}") @@ -163,14 +238,12 @@ def extract_detail(result: dict) -> dict: parallelism = task.get("parallelism") or {} env = result.get("_env_info") or {} - # CPU string cpu_info = env.get("cpu", {}) cpu_str = None if cpu_info.get("model"): cores = cpu_info.get("physical_cores") cpu_str = cpu_info["model"] + (f", {cores} cores" if cores else "") - # NIC string nics = env.get("network_interfaces", []) nic_str = None if nics: @@ -180,7 +253,6 @@ def extract_detail(result: dict) -> dict: names_str = ", ".join(nic_names) if nic_names else "" nic_str = f"{len(nics)}x {type_str}" + (f" ({names_str})" if names_str else "") - # Intra-node interconnect: prefer result.json, fall back to topology parse intra = chip.get("interconnect_intra_node") if not intra and env.get("accelerator_topology"): nv_matches = re.findall(r'NV(\d+)', env["accelerator_topology"]) @@ -188,7 +260,6 @@ def extract_detail(result: dict) -> dict: intra = f"NVLink {max(int(x) for x in nv_matches)} (full mesh)" return { - # Hardware "hw_chip": chip.get("name"), "hw_vendor": chip.get("vendor"), "hw_count": chip.get("count"), @@ -199,7 +270,6 @@ def 
extract_detail(result: dict) -> dict: "hw_system_memory_gb": env.get("system_memory_gb"), "hw_pcie": env.get("pcie_generation"), "hw_network": nic_str, - # Software "sw_framework": software.get("framework"), "sw_framework_version": software.get("framework_version"), "sw_driver": software.get("driver_version"), @@ -207,31 +277,27 @@ def extract_detail(result: dict) -> dict: "sw_os": software.get("os"), "sw_python": software.get("python_version"), "sw_pytorch": env.get("pytorch_version"), - # Model "model_id": model.get("model_id"), "model_revision": model.get("model_revision"), - "model_name": model.get("model_name"), # ← new - "model_note": model.get("model_note"), # ← new - "model_source": model.get("model_source"), # ← new + "model_name": model.get("model_name"), + "model_note": model.get("model_note"), + "model_source": model.get("model_source"), "model_arch": model.get("architecture"), "model_params_b": model.get("parameter_count_b"), "model_precision": model.get("precision"), "model_effective_dtype": model.get("effective_dtype"), "model_quant_method": model.get("quantization_method"), "model_format": model.get("model_format"), - # Run settings "run_scenarios": task.get("scenarios_run"), "run_chip_counts": task.get("chip_counts_run"), "run_num_runs": task.get("num_runs"), "run_tp": parallelism.get("tensor_parallel_size"), "run_pp": parallelism.get("pipeline_parallel_size"), "run_dp": parallelism.get("data_parallel_size"), - # Accuracy "acc_score": accuracy.get("subset_score"), "acc_baseline_delta": accuracy.get("baseline_delta"), "acc_valid": accuracy.get("valid"), "acc_notes": accuracy.get("notes"), - # Metadata "meta_submitted_by": meta.get("submitted_by"), "meta_submission_type": meta.get("submission_type"), "meta_date": meta.get("date"), @@ -240,10 +306,6 @@ def extract_detail(result: dict) -> dict: "meta_model_load_sec": meta.get("model_load_seconds"), "meta_start_time": meta.get("benchmark_start_time"), "meta_notes": meta.get("notes"), - # Vendor-specific 
environment fields collected by platforms/.py - # (e.g. ROCm-SMI link health, NVML clock telemetry). The modal flattens - # this dict and shows only non-null entries — different vendors record - # different keys by design and no UI tries to unify them. "env_vendor_details": env.get("vendor_details") or {}, } @@ -251,11 +313,6 @@ def extract_detail(result: dict) -> dict: # ── Implementation extraction (modal impl tab) ─────────────────────────────── def extract_impl(result: dict) -> dict | None: - """ - Load runner meta.json for the implementation_id referenced in result.json. - Returns None if implementation_id is absent or the runner folder is not found. - Fields returned match meta.json schema plus a GitHub link. - """ impl_id = result.get("implementation_id") if not impl_id: return None @@ -265,7 +322,8 @@ def extract_impl(result: dict) -> dict | None: return None try: - meta = json.loads(meta_path.read_text()) + with open(meta_path, encoding='utf-8') as f: + meta = json.load(f) except Exception: return None @@ -285,7 +343,7 @@ def extract_impl(result: dict) -> dict | None: } -# ── Visualization data extraction (modal viz tab) ───────────────────────────── +# ── Visualization data extraction ───────────────────────────────────────────── def extract_viz(result: dict, metrics: dict) -> dict: """Chart-ready data for the per-suite visualization panel.""" @@ -304,9 +362,6 @@ def _concurrency_labels(rows): def _online_block(): online = metrics.get("online", {}) qps_rows = online.get("results_by_qps", []) - # Per-QPS reliability blocks. Emitted as a parallel array so the - # frontend can render a badge next to each QPS row without joining - # by index from a separate object. return { "labels": [str(r.get("target_qps", "")) for r in qps_rows], "ttft_p50": [r.get("ttft_ms_p50") for r in qps_rows], @@ -388,8 +443,6 @@ def _speculative_block(): "mean_accepted_tokens": rm.get("mean_accepted_tokens"), } - # Per-concurrency-level offline reliability blocks. 
Parallel array to - # `throughput` and `memory_gb` so the frontend can join by row index. def _offline_reliability(rows): return [r.get("throughput_tokens_per_sec_reliability") or {} for r in rows] @@ -463,7 +516,6 @@ def _offline_reliability(rows): None ) - # ── Online cross-format data ────────────────────────────────────── online_by_precision = None q_online = metrics.get("quantization_online", {}) if q_online: @@ -479,7 +531,6 @@ def _offline_reliability(rows): "sla_met": [r.get("sla_met") for r in qps_rows], }) - # ── Sustained cross-format data ─────────────────────────────────── sustained_by_precision = None q_sus = metrics.get("quantization_sustained", {}) if q_sus: @@ -601,7 +652,6 @@ def extract_row(result: dict) -> dict: is_suite_level = result.get("_is_suite_level", False) suite_id = result.get("suite_id", "") - # ── Offline ─────────────────────────────────────────────────────────────── offline_throughput = None tokens_per_sec_per_chip = None peak_memory_gb = None @@ -620,15 +670,12 @@ def extract_row(result: dict) -> dict: valid_mem, key=lambda r: r.get("throughput_tokens_per_sec", 0) ).get("peak_memory_gb") - # ── Online ──────────────────────────────────────────────────────────────── online = metrics.get("online") online_max_qps = online.get("max_valid_qps") if online else None - # ── Interactive ─────────────────────────────────────────────────────────── interactive = metrics.get("interactive") interactive_ttft_p99 = interactive.get("ttft_ms_p99") if interactive else None - # ── Sustained ───────────────────────────────────────────────────────────── sustained_throughput = None throttle_ratio = None throttle_onset_minute = None @@ -643,7 +690,6 @@ def extract_row(result: dict) -> dict: ttft_p99_drift_ms = sustained.get("ttft_p99_drift_ms") sustained_concurrency = sustained.get("sustained_concurrency") - # ── Speculative ───────────────────────────────────────────────────────── speculative_throughput = None speculative_speedup = None 
speculative_acceptance = None @@ -659,7 +705,6 @@ def extract_row(result: dict) -> dict: if speculative_throughput and offline_throughput and offline_throughput > 0: speculative_speedup = round(speculative_throughput / offline_throughput, 3) - # ── Burst ──────────────────────────────────────────────────────────────── burst_degradation = None burst_steady_p99 = None burst_p99 = None @@ -672,7 +717,6 @@ def extract_row(result: dict) -> dict: burst_p99 = burst.get("burst_ttft_p99_ms") burst_sla_met = burst.get("sla_met_during_burst") - # ── Primary metric ──────────────────────────────────────────────────────── scenario = task.get("scenario", "offline") if is_suite_level and suite_id not in ("suite_E", "suite_C", "suite_F"): primary_metric = offline_throughput @@ -700,7 +744,6 @@ def extract_row(result: dict) -> dict: primary_metric = None primary_metric_label = None - # ── Suite E scaling ─────────────────────────────────────────────────────── scaling_efficiency_2x = None scaling_efficiency_4x = None scaling_base_throughput = None @@ -726,13 +769,12 @@ def extract_row(result: dict) -> dict: primary_metric = scaling_base_throughput primary_metric_label = "tokens/sec (1x baseline)" - # ── Suite C quantization ────────────────────────────────────────────────── quant_bf16_throughput = None quant_best_throughput = None quant_best_precision = None - quant_int8_speedup = None # W8A16 tier (best of W8A8/W8A16) - quant_int4_speedup = None # W4A16 tier - quant_quality_eff = None # best quality_efficiency across all formats + quant_int8_speedup = None + quant_int4_speedup = None + quant_quality_eff = None quantization = metrics.get("quantization") if quantization: @@ -746,23 +788,19 @@ def extract_row(result: dict) -> dict: if p == "BF16": quant_bf16_throughput = thr elif p in ("W8A8", "W8A16"): - # Use W8A16 as "int8-tier" speedup if available, fall back to W8A8 if quant_int8_speedup is None or p == "W8A16": quant_int8_speedup = spd elif p == "W4A16": quant_int4_speedup = spd 
- # Track best throughput across all precision formats if thr and (quant_best_throughput is None or thr > quant_best_throughput): quant_best_throughput = thr quant_best_precision = p - # Track best quality_efficiency across all formats if qe and (best_qe is None or qe > best_qe): best_qe = qe quant_quality_eff = qe - # Primary metric for Suite C: best throughput across all precision formats if quant_best_throughput: primary_metric = quant_best_throughput primary_metric_label = f"tokens/sec ({quant_best_precision})" @@ -770,7 +808,6 @@ def extract_row(result: dict) -> dict: primary_metric = quant_bf16_throughput primary_metric_label = "tokens/sec (BF16 baseline)" - # ── Efficiency ──────────────────────────────────────────────────────────── memory_gb_per_chip = chip.get("memory_gb", 0) memory_efficiency = ( round(offline_throughput / peak_memory_gb, 1) @@ -790,7 +827,6 @@ def extract_row(result: dict) -> dict: if offline_throughput and min_price and min_price > 0 else None ) - # ── Precision fallback detection ────────────────────────────────────────── precision = model.get("precision", "BF16") effective_dtype = model.get("effective_dtype") quantization_method = model.get("quantization_method") @@ -799,7 +835,6 @@ def extract_row(result: dict) -> dict: precision.upper() != suite_required.upper() if precision and suite_required else False ) - # Emulated flag: precision was requested but compute was in a different dtype precision_emulated = ( effective_dtype is not None and effective_dtype.replace("torch.", "") != _precision_to_dtype(precision) @@ -827,22 +862,18 @@ def extract_row(result: dict) -> dict: "architecture": model.get("architecture"), "suite": suite_id, "scenario": "all" if is_suite_level else scenario, - # Primary "primary_metric": primary_metric, "primary_metric_label": primary_metric_label, "tokens_per_sec_per_chip": tokens_per_sec_per_chip, - # Scenario metrics "offline_throughput": offline_throughput, "online_max_qps": online_max_qps, 
"interactive_ttft_p99": interactive_ttft_p99, - # Efficiency "peak_memory_gb": peak_memory_gb, "memory_utilization_pct": memory_utilization_pct, "memory_efficiency_toks_per_gb": memory_efficiency, "min_price_usd_per_hr": min_price, "cost_efficiency_toks_per_dollar_hr": cost_efficiency, "tokens_per_watt": derived.get("tokens_per_sec_per_watt"), - # Metadata "accuracy_valid": accuracy.get("valid"), "accuracy_score": accuracy.get("subset_score"), "date": meta.get("date"), @@ -852,37 +883,30 @@ def extract_row(result: dict) -> dict: "run_id": meta.get("run_id"), "run_name": meta.get("run_name"), "flagged": meta.get("flagged"), - # Suite E "scaling_efficiency_2x": scaling_efficiency_2x, "scaling_efficiency_4x": scaling_efficiency_4x, "scaling_base_throughput": scaling_base_throughput, - # Suite C "quant_bf16_throughput": quant_bf16_throughput, "quant_best_throughput": quant_best_throughput, "quant_best_precision": quant_best_precision, "quant_int8_speedup": quant_int8_speedup, "quant_int4_speedup": quant_int4_speedup, "quant_quality_eff": quant_quality_eff, - # Sustained "sustained_throughput": sustained_throughput, "throttle_ratio": throttle_ratio, "throttle_onset_minute": throttle_onset_minute, "ttft_p99_drift_ms": ttft_p99_drift_ms, "sustained_concurrency": sustained_concurrency, - # Speculative "speculative_throughput": speculative_throughput, "speculative_speedup": speculative_speedup, "speculative_acceptance": speculative_acceptance, - # Burst "burst_degradation": burst_degradation, "burst_steady_p99": burst_steady_p99, "burst_p99": burst_p99, "burst_sla_met": burst_sla_met, - # Panel data "detail": extract_detail(result), "viz": extract_viz(result, metrics), "impl": extract_impl(result), - # Implementation ID (flat, for filtering/display without loading impl) "implementation_id": result.get("implementation_id"), } @@ -890,20 +914,11 @@ def extract_row(result: dict) -> dict: # ── API generation ──────────────────────────────────────────────────────────── def 
generate_api(results: list[dict], output_dir: Path) -> None: - """ - Generate static JSON API for external tooling (OpenClaw Skill etc.). - - api/rank.json — per-submission ranking within chip+suite group - api/chips.json — chip summary list (best offline throughput) - api/index.json — chip lookup with per-suite best metrics - api/suites.json — suite metadata for discovery - """ + """Generate static JSON API for external tooling (OpenClaw Skill etc.).""" api_dir = output_dir / "api" api_dir.mkdir(exist_ok=True) - # Group by chip+suite for fair per-suite ranking by_chip_suite: dict[tuple, list] = defaultdict(list) - # Also track chip-level best across all suites for chips.json by_chip: dict[str, list] = defaultdict(list) for r in results: @@ -912,7 +927,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: submission_name = r.get("_submission_name", "unknown") tier = r.get("_tier", "community") - # Primary metric per result offline = r.get("metrics", {}).get("offline") best_thr = None if offline: @@ -923,7 +937,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: if valid: best_thr = max(row["throughput_tokens_per_sec"] for row in valid) - # Suite E fallback if best_thr is None: scaling = r.get("metrics", {}).get("scaling", {}) if scaling: @@ -934,7 +947,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: best_thr = entry.get("best_throughput_tokens_per_sec") break - # Suite C: use best quality_efficiency as primary if best_thr is None: quant = r.get("metrics", {}).get("quantization", {}) if quant: @@ -950,7 +962,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: by_chip_suite[(chip_name, suite_id)].append((submission_name, best_thr, tier)) by_chip[chip_name].append((submission_name, best_thr, suite_id, tier)) - # ── rank.json ───────────────────────────────────────────────────────────── rank_data: dict[str, dict] = {} for (chip_name, suite_id), entries in by_chip_suite.items(): sorted_entries = 
sorted(entries, key=lambda x: x[1], reverse=True) @@ -970,7 +981,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: with open(api_dir / "rank.json", "w") as f: json.dump(rank_data, f, indent=2) - # ── chips.json ──────────────────────────────────────────────────────────── chips = [] chip_bests: dict[str, float] = {} for chip_name, entries in by_chip.items(): @@ -987,8 +997,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: with open(api_dir / "chips.json", "w") as f: json.dump(chips, f, indent=2) - # ── index.json ──────────────────────────────────────────────────────────── - # Per-chip lookup with best metric per suite chip_index: dict[str, dict] = {} for chip_name in by_chip: chip_index[chip_name] = { @@ -1010,7 +1018,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: suite_entry = chip_index[chip_name]["suites"].setdefault(suite_id, {}) - # Offline throughput offline = metrics.get("offline") if offline: rows = offline.get("results_by_concurrency") or \ @@ -1023,7 +1030,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: if cur is None or thr > cur: suite_entry["best_throughput_tokens_per_sec"] = round(thr, 1) - # Online if online: qps = online.get("max_valid_qps") if qps is not None: @@ -1031,7 +1037,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: if cur is None or qps > cur: suite_entry["best_online_max_qps"] = qps - # Interactive if iv: ttft = iv.get("ttft_ms_p99") if ttft is not None: @@ -1039,7 +1044,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: if cur is None or ttft < cur: suite_entry["best_interactive_ttft_p99_ms"] = round(ttft, 1) - # Scaling (Suite E) if scaling: base_thr = ( scaling.get("base_throughput_tokens_per_sec") or @@ -1062,7 +1066,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: elif count == 4 and eff: suite_entry["best_scaling_efficiency_4x"] = eff - # Sustained if sustained: s_thr = 
sustained.get("sustained_throughput_tokens_per_sec") throttle = sustained.get("throttle_ratio") @@ -1073,7 +1076,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: if throttle is not None: suite_entry["throttle_ratio"] = throttle - # Suite C quality efficiency quant = metrics.get("quantization") if quant: qes = [(e.get("precision"), e.get("quality_efficiency")) @@ -1087,8 +1089,6 @@ def generate_api(results: list[dict], output_dir: Path) -> None: with open(api_dir / "index.json", "w") as f: json.dump(chip_index, f, indent=2) - # ── suites.json ─────────────────────────────────────────────────────────── - # Static metadata about each suite for discovery suites_meta = {} for suite_dir in sorted(Path("suites").iterdir()): if not suite_dir.is_dir(): @@ -1097,7 +1097,7 @@ def generate_api(results: list[dict], output_dir: Path) -> None: if not suite_json.exists(): continue try: - with open(suite_json) as f: + with open(suite_json, encoding='utf-8') as f: s = json.load(f) suite_id = s.get("suite_id", suite_dir.name) scenarios_cfg = s.get("scenarios", {}) @@ -1129,6 +1129,367 @@ def generate_api(results: list[dict], output_dir: Path) -> None: print(f" suites.json: {len(suites_meta)} suites documented") +# ── Distribution data generation (新增) ─────────────────────────────────────── + +def generate_distribution_data(results: list[dict], output_dir: Path) -> None: + """生成性能分布数据,用于分布图视图。 + + 为每个去重后的提交生成完整元信息(包含所有 scenario 的指标), + 支持前端按 suite / vendor / framework / model / scenario 筛选, + 并按 (chip, suite) 聚合生成分组统计数据。 + """ + + # ── 1. 
先去重(与 main() 相同的逻辑)───────────────────────────────── + _seen: dict = {} + for r in results: + meta = r.get("meta") or {} + rid = meta.get("run_id") + if not rid: + continue + suite_id = r.get("suite_id", "") + # 计算去重用的指标值 + if suite_id == "suite_C": + quant = (r.get("metrics") or {}).get("quantization", {}) + qes = [e.get("quality_efficiency") for e in quant.get("results_by_precision", []) + if e.get("quality_efficiency")] + metric = max(qes) if qes else 0 + elif suite_id == "suite_E": + scaling = (r.get("metrics") or {}).get("scaling", {}) + metric = 0 + for e in scaling.get("results_by_chip_count", []): + if e.get("chip_count") == 4: + metric = e.get("scaling_efficiency") or 0 + if not metric: + for e in scaling.get("results_by_chip_count", []): + if e.get("chip_count") == 2: + metric = e.get("scaling_efficiency") or 0 + if not metric: + metric = scaling.get("base_throughput_tokens_per_sec") or 0 + else: + offline = (r.get("metrics") or {}).get("offline", {}) + rows = offline.get("results_by_concurrency") or offline.get("results_by_batch_size") or [] + valid_rows = [row for row in rows + if not row.get("oom") and row.get("throughput_tokens_per_sec")] + metric = max((row["throughput_tokens_per_sec"] for row in valid_rows), default=0) + if rid not in _seen or metric > _seen[rid]["metric"]: + _seen[rid] = {"result": r, "metric": metric} + deduped = [entry["result"] for entry in _seen.values()] + print(f" distribution: {len(results)} raw → {len(deduped)} deduplicated results") + + # ── 2. 
构建每个提交的详细数据 ───────────────────────────────────────── + all_submissions = [] + + for r in deduped: + chip_obj = r.get("chip") or {} + chip = chip_obj.get("name", "Unknown") + chip_vendor = chip_obj.get("vendor", "") + chip_count = chip_obj.get("count", 1) + memory_gb = chip_obj.get("memory_gb", 0) + suite = r.get("suite_id", "") + + model_obj = r.get("model") or {} + model_full = model_obj.get("model_id", "") + model_short = model_full.split("/")[-1] if model_full else "" + model_params_b = model_obj.get("parameter_count_b") + precision = model_obj.get("precision", "BF16") + effective_dtype = model_obj.get("effective_dtype") + + software = r.get("software") or {} + framework = software.get("framework", "") + framework_version = software.get("framework_version", "") + + meta = r.get("meta") or {} + submission_name = r.get("_submission_name", "") + tier = r.get("_tier", "community") + submitted_by = meta.get("submitted_by", "") + date = meta.get("date", "") + reproduce_script = meta.get("reproduce_script", "") + run_id = meta.get("run_id", "") + impl_id = r.get("implementation_id", "") + + # 收集该提交跑了哪些 scenario(从 task.scenarios_run 读取) + task = r.get("task") or {} + scenarios_run = task.get("scenarios_run") or [] + is_suite_level = "scenarios_run" in task or "chip_counts_run" in task + + # 为每个 scenario 提取指标 + scenarios = {} + for sc_name in scenarios_run: + scenarios[sc_name] = _extract_scenario_metric(r, sc_name) + + # 如果 scenarios_run 为空(旧格式),至少从 offline 提取 + if not scenarios_run: + offline_metric = _extract_scenario_metric(r, "offline") + if offline_metric["is_valid"]: + scenarios["offline"] = offline_metric + online_metric = _extract_scenario_metric(r, "online") + if online_metric["is_valid"]: + scenarios["online"] = online_metric + + # 处理 Suite E(scaling):从 metrics.scaling 提取 + suite_primary_thr = None + suite_primary_label = None + suite_primary_scenario = "offline" + + if suite == "suite_E": + scaling = (r.get("metrics") or {}).get("scaling", {}) + for entry in 
scaling.get("results_by_chip_count", []): + if entry.get("chip_count") == 1: + suite_primary_thr = entry.get("best_throughput_tokens_per_sec") + suite_primary_label = "tokens/sec (1x baseline)" + suite_primary_scenario = "scaling" + break + if not suite_primary_thr: + suite_primary_thr = scaling.get("base_throughput_tokens_per_sec") + suite_primary_label = "tokens/sec (1x baseline)" + suite_primary_scenario = "scaling" + + # ── Suite C:按精度爆炸,每种精度一条独立提交 ───────────────── + if suite == "suite_C": + quant = (r.get("metrics") or {}).get("quantization", {}) + quant_online = (r.get("metrics") or {}).get("quantization_online", {}) + quant_sus = (r.get("metrics") or {}).get("quantization_sustained", {}) + prec_online = {e.get("precision",""): e.get("max_valid_qps") + for e in quant_online.get("results_by_precision", [])} + prec_sustained = {e.get("precision",""): e.get("sustained_throughput_tokens_per_sec") + for e in quant_sus.get("results_by_precision", [])} + tp = (task.get("parallelism") or {}).get("tensor_parallel_size") + + for entry in quant.get("results_by_precision", []): + prec = entry.get("precision", "") + thr = entry.get("best_throughput_tokens_per_sec") + if not thr: + continue + prec_sc = {} + prec_sc["offline"] = {"throughput": thr, "metric_label": f"tokens/sec ({prec})", + "concurrency": None, "peak_memory_gb": None, "is_valid": True} + qps = prec_online.get(prec) + if qps is not None: + prec_sc["online"] = {"throughput": qps, "metric_label": "max valid QPS", + "concurrency": None, "peak_memory_gb": None, "is_valid": True} + sus = prec_sustained.get(prec) + if sus is not None: + prec_sc["sustained"] = {"throughput": sus, "metric_label": "tok/s (sustained mean)", + "concurrency": None, "peak_memory_gb": None, "is_valid": True} + for sc_name in scenarios_run: + if sc_name not in prec_sc and sc_name != "accuracy": + m = _extract_scenario_metric(r, sc_name) + if m["is_valid"]: + prec_sc[sc_name] = m + + config = {"concurrency": None, "batch_size": None, + 
"tensor_parallel": tp, "peak_memory_gb": None} + sub = { + "id": f"{run_id or submission_name}_{prec}", + "chip": chip, "chip_vendor": chip_vendor, + "chip_count": chip_count, "memory_gb": memory_gb, + "suite": suite, "model": model_short, "model_full": model_full, + "model_params_b": model_params_b, + "precision": prec, "effective_dtype": effective_dtype, + "framework": framework, "framework_version": framework_version, + "tier": tier, "submitted_by": submitted_by, + "date": date, "reproduce_script": reproduce_script, + "runner_id": impl_id, + "scenarios": prec_sc, + "primary_scenario": "quantization", + "primary_throughput": thr, + "primary_metric_label": f"tokens/sec ({prec})", + "config": config, + } + all_submissions.append(sub) + + else: + # ── 非 Suite C:标准单条提交 ──────────────────────────── + + # 确定 primary_throughput 和 primary_scenario + primary_throughput = suite_primary_thr + primary_scenario = suite_primary_scenario + primary_label = suite_primary_label + + if primary_throughput is None: + _SCENARIO_PRIORITY = ["offline", "online", "sustained", "speculative", "burst"] + for sc in _SCENARIO_PRIORITY: + if sc in scenarios and scenarios[sc]["is_valid"]: + primary_throughput = scenarios[sc]["throughput"] + primary_scenario = sc + primary_label = scenarios[sc]["metric_label"] + break + + if primary_throughput is None: + continue + + # 构建最佳配置信息 + best_sc = scenarios.get(primary_scenario) if primary_scenario in scenarios else None + config = {} + if best_sc: + tp = (task.get("parallelism") or {}).get("tensor_parallel_size") + config = { + "concurrency": best_sc.get("concurrency"), + "batch_size": None, + "tensor_parallel": tp, + "peak_memory_gb": best_sc.get("peak_memory_gb"), + } + else: + offline = (r.get("metrics") or {}).get("offline", {}) + rows = offline.get("results_by_concurrency") or offline.get("results_by_batch_size") or [] + valid_rows = [row for row in rows + if not row.get("oom") and row.get("throughput_tokens_per_sec")] + if valid_rows: + best_row 
= max(valid_rows, key=lambda row: row["throughput_tokens_per_sec"]) + tp = (task.get("parallelism") or {}).get("tensor_parallel_size") + config = { + "concurrency": best_row.get("client_concurrency") or best_row.get("concurrency"), + "batch_size": best_row.get("batch_size"), + "tensor_parallel": tp, + "peak_memory_gb": best_row.get("peak_memory_gb"), + } + + sub = { + "id": run_id or submission_name, + "chip": chip, + "chip_vendor": chip_vendor, + "chip_count": chip_count, + "memory_gb": memory_gb, + "suite": suite, + "model": model_short, + "model_full": model_full, + "model_params_b": model_params_b, + "precision": precision, + "effective_dtype": effective_dtype, + "framework": framework, + "framework_version": framework_version, + "tier": tier, + "submitted_by": submitted_by, + "date": date, + "reproduce_script": reproduce_script, + "runner_id": impl_id, + "scenarios": scenarios, + "primary_scenario": primary_scenario, + "primary_throughput": primary_throughput, + "primary_metric_label": primary_label, + "config": config, + } + all_submissions.append(sub) + + # ── 3. 
按 (chip, suite) 分组聚合 ────────────────────────────────────── + groups: dict[tuple, dict] = defaultdict(lambda: { + "submissions": [], + "throughputs": [], + }) + for sub in all_submissions: + key = (sub["chip"], sub["suite"]) + groups[key]["submissions"].append(sub) + groups[key]["throughputs"].append(sub["primary_throughput"]) + + group_list = [] + for (chip, suite), data in groups.items(): + thr_list = sorted(data["throughputs"]) + n = len(thr_list) + median = thr_list[n // 2] + best_sub = max(data["submissions"], key=lambda s: s["primary_throughput"]) + + # 各 scenario 汇总 + scenario_summary = {} + for sub in data["submissions"]: + for sc_name, sc_info in sub["scenarios"].items(): + if sc_name not in scenario_summary: + scenario_summary[sc_name] = { + "count": 0, + "best_throughput": None, + "best_framework": "", + } + sm = scenario_summary[sc_name] + sm["count"] += 1 + if sc_info["is_valid"] and sc_info["throughput"]: + if sm["best_throughput"] is None or sc_info["throughput"] > sm["best_throughput"]: + sm["best_throughput"] = sc_info["throughput"] + sm["best_framework"] = sub["framework"] + + # 标准差 + stddev = None + if n >= 2: + stddev = round(statistics.stdev(thr_list), 2) + + group_list.append({ + "chip": chip, + "chip_vendor": best_sub["chip_vendor"], + "suite": suite, + "model": best_sub["model"], + "submission_count": n, + "best_throughput": thr_list[-1], + "median_throughput": median, + "min_throughput": thr_list[0], + "max_throughput": thr_list[-1], + "stddev_throughput": stddev, + "scenario_summary": scenario_summary, + "best_submission_id": best_sub["id"], + "best_framework": best_sub["framework"], + "best_submitted_by": best_sub["submitted_by"], + }) + + # 按厂商优先级排序,再按中位数吞吐量降序 + vendor_priority = {"NVIDIA": 1, "Nvidia": 1, "nvidia": 1, + "Huawei": 2, "华为": 2, + "AMD": 3, "amd": 3, + "Google": 4, "Apple": 5, + "Moore Threads": 6, "Iluvatar": 6, "Intel": 6} + group_list.sort(key=lambda g: ( + vendor_priority.get(g["chip_vendor"], 99), + 
-(g["median_throughput"] or 0) + )) + + # ── 4. Suite 元数据 ─────────────────────────────────────────────────── + suite_meta = _collect_suite_specs() + + # ── 5. 写入 distribution.js ─────────────────────────────────────────── + out_path = output_dir / "distribution.js" + with open(out_path, "w", encoding="utf-8") as f: + f.write("// Auto-generated by leaderboard/generate.py. Do not edit manually.\n\n") + f.write(f"const DISTRIBUTION_SUBMISSIONS = {json.dumps(all_submissions, indent=2, ensure_ascii=False)};\n") + f.write("window.DISTRIBUTION_SUBMISSIONS = DISTRIBUTION_SUBMISSIONS;\n\n") + f.write(f"const DISTRIBUTION_GROUPS = {json.dumps(group_list, indent=2, ensure_ascii=False)};\n") + f.write("window.DISTRIBUTION_GROUPS = DISTRIBUTION_GROUPS;\n\n") + f.write(f"const DISTRIBUTION_SUITE_META = {json.dumps(suite_meta, indent=2, ensure_ascii=False)};\n") + f.write("window.DISTRIBUTION_SUITE_META = DISTRIBUTION_SUITE_META;\n") + + group_count = len(group_list) + submission_count = len(all_submissions) + print(f"Distribution data written to {out_path} " + f"({group_count} groups, {submission_count} submissions).") + + +def _bust_index_cache(data_path: Path, index_path: Path) -> None: + """Rewrite + + + + + +
+ +

🚀 Serving Recipe Distribution

Each dot = one real benchmark run. Wider spread within a Suite = more optimization headroom.
As the community tunes, points shift upward — proof that sharing recipes works.

git clone https://github.com/FreedomIntelligence/AccelMark && cd AccelMark && python run.py --suite A
+
+ +

📊 Performance Distribution

+
+
+
+
+
+
+ +
Loading…
+
+Beeswarm +Scatter +Density +Heatmap +By Suite +Beeswarm — Grouped by Suite, each dot is a submission. Spread = optimization headroom.
Scatter — Throughput × QPS. Top-right = both strong.
Density — Overlap heatmap. Darker = more recipes.
Heatmap — Suite × Chip matrix. Color = throughput.
By Suite — One mini-chart per Suite. Compare per benchmark.
+Y-axis +
+ +
+ +
+ +

🖥️ Chips

Click to jump to table
+ +

📋 Recipe Overview

Best Recipe | Vendor | Chip | Suite Coverage | Recipes | Best Throughput | Headroom
Loading…
+
+ + + + + + + + \ No newline at end of file diff --git a/leaderboard/site/distribution.js b/leaderboard/site/distribution.js new file mode 100644 index 00000000..47255b81 --- /dev/null +++ b/leaderboard/site/distribution.js @@ -0,0 +1,12383 @@ +// Auto-generated by leaderboard/generate.py. Do not edit manually. + +const DISTRIBUTION_SUBMISSIONS = [ + { + "id": "2349a925", + "chip": "Apple M1", + "chip_vendor": "Apple", + "chip_count": 1, + "memory_gb": 16.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "mlx-lm", + "framework_version": "0.31.2", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-20", + "reproduce_script": "runners/apple_mlx_lm_9546b8b5/runner.py", + "runner_id": "apple_mlx_lm_9546b8b5", + "scenarios": { + "offline": { + "throughput": 51.0, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 1.24, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 51.0, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 1.24 + } + }, + { + "id": "300ac34c", + "chip": "Google TPU v5e", + "chip_vendor": "Google", + "chip_count": 1, + "memory_gb": 16.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-tpu", + "framework_version": "0.13.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/google_vllm_tpu_68cc9ffa/runner.py", + "runner_id": "google_vllm_tpu_68cc9ffa", + "scenarios": { + "offline": { + "throughput": 8127.1, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + } + }, + 
"primary_scenario": "offline", + "primary_throughput": 8127.1, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "d9b3177f", + "chip": "Google TPU v6e", + "chip_vendor": "Google", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-tpu", + "framework_version": "0.13.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/google_vllm_tpu_68cc9ffa/runner.py", + "runner_id": "google_vllm_tpu_68cc9ffa", + "scenarios": { + "offline": { + "throughput": 2775.55, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 2774.41, + "metric_label": "tok/s (speculative)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 2775.55, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "d037f60f", + "chip": "Google TPU v6e", + "chip_vendor": "Google", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-tpu", + "framework_version": "0.13.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/google_vllm_tpu_68cc9ffa/runner.py", + "runner_id": "google_vllm_tpu_68cc9ffa", + "scenarios": { + "offline": { + "throughput": 54.52, + "metric_label": "tokens/sec", + "concurrency": 1, + 
"peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 54.52, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "9318bfeb", + "chip": "Google TPU v6e", + "chip_vendor": "Google", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-tpu", + "framework_version": "0.13.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/google_vllm_tpu_68cc9ffa/runner.py", + "runner_id": "google_vllm_tpu_68cc9ffa", + "scenarios": { + "offline": { + "throughput": 6857.53, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 6857.53, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 64, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "a2777c30", + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "chip_count": 1, + "memory_gb": 64.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 1699.04, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + 
"metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1699.04, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 268.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 1718.23, + "metric_label": "tok/s (speculative)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1699.04, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "a3547ba9", + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "chip_count": 1, + "memory_gb": 64.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 53.22, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 53.22, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 53.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + 
"throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 53.22, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "bd7d8f87", + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "chip_count": 1, + "memory_gb": 64.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 4941.13, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 4941.13, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 1238.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 4941.13, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "fcb9725c", + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "chip_count": 8, + "memory_gb": 64.0, + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + 
"model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 769.88, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 2, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 53.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 769.88, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 769.88, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": null + } + }, + { + "id": "354e5562", + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "chip_count": 8, + "memory_gb": 64.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, 
+ "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 2430.05, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "d726144e", + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "chip_count": 8, + "memory_gb": 64.0, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 1631.87, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 2, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1631.87, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 226.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1631.87, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": null + } + }, + { + "id": "635ecf42", + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "chip_count": 16, + "memory_gb": 64.0, + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": 
"0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 723.06, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 2, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 53.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 723.06, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 723.06, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 16, + "peak_memory_gb": null + } + }, + { + "id": "b1fe92eb", + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "chip_count": 16, + "memory_gb": 64.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 2499.29, 
+ "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "329a2b9e", + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "chip_count": 16, + "memory_gb": 64.0, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 1638.62, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 2, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1638.62, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 262.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1638.62, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 16, + "peak_memory_gb": null + } + }, + { + "id": "74d19743", + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "chip_count": 1, + "memory_gb": 64.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": 
"runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 1888.72, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1888.72, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 376.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 1858.56, + "metric_label": "tok/s (speculative)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1888.72, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6c1e7ffe", + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "chip_count": 1, + "memory_gb": 64.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 55.0, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + 
"throughput": 55.0, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 54.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 55.0, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "8826a63d", + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "chip_count": 1, + "memory_gb": 64.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vllm-ascend", + "framework_version": "0.18.0rc1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/ascend_vllm_ascend_d4aa9fda/runner.py", + "runner_id": "ascend_vllm_ascend_d4aa9fda", + "scenarios": { + "offline": { + "throughput": 7848.84, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 7848.84, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 2217.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 7848.84, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + 
"tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b4a92b30", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2415.99, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 33.52, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 2415.99, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 33.52, + "is_valid": true + }, + "sustained": { + "throughput": 484.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 947.4, + "metric_label": "tok/s (speculative)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 2415.99, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 33.52 + } + }, + { + "id": "57cc3fdf_BF16", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + 
"model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2387.7, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 491.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2387.7, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "57cc3fdf_FP8", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2574.53, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 709.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": 
null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2574.53, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "57cc3fdf_W8A8", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3205.63, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 657.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3205.63, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "57cc3fdf_W8A16", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", 
+ "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2465.33, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 700.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2465.33, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "57cc3fdf_W4A16", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1563.39, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 813.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1563.39, + "primary_metric_label": "tokens/sec (W4A16)", + 
"config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "8e114cbe", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 56.79, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 35.46, + "is_valid": true + }, + "interactive": { + "throughput": 56.79, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 35.46, + "is_valid": true + }, + "sustained": { + "throughput": 57.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 56.79, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 35.46 + } + }, + { + "id": "fe3156b5", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": 
"runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 10805.85, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 33.24, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 10805.85, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 33.24, + "is_valid": true + }, + "sustained": { + "throughput": 3972.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 10805.85, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 33.24 + } + }, + { + "id": "14410aea", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 40.0, + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1704.71, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 30.57, + "is_valid": true + }, + "online": { + "throughput": 25, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 164.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1704.71, + 
"metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 30.57, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1704.71, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 30.57 + } + }, + { + "id": "e76a4402", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 40.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 3000.59, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "08de2dc2", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 40.0, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3217.83, + "metric_label": "tokens/sec", + 
"concurrency": 4, + "peak_memory_gb": 32.24, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 3217.83, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 32.24, + "is_valid": true + }, + "sustained": { + "throughput": 472.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 3217.83, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 32.24 + } + }, + { + "id": "8f83bfab", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 3916.69, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 3916.69, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 712.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − 
degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 3916.69, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "ed4b0557", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2700.61, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 69.33, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 2700.61, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 69.33, + "is_valid": true + }, + "sustained": { + "throughput": 551.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 999.61, + "metric_label": "tok/s (speculative)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 2700.61, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 
69.33 + } + }, + { + "id": "ffd81462_BF16", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 3888.91, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 706.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3888.91, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "ffd81462_FP8", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 4141.71, + "metric_label": "tokens/sec (FP8)", + 
"concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 438.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 4141.71, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "ffd81462_W8A8", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 3208.11, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 399.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3208.11, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "ffd81462_W8A16", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + 
"chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 3547.44, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 494.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3547.44, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "ffd81462_W4A16", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 1889.19, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, 
+ "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 437.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1889.19, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6940965a_BF16", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2666.87, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 537.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2666.87, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6940965a_FP8", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": 
"meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2823.04, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 709.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2823.04, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6940965a_W8A8", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3521.09, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 643.9, + "metric_label": "tok/s 
(sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3521.09, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6940965a_W8A16", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2661.86, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 694.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2661.86, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6940965a_W4A16", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": 
"verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1761.96, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 757.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1761.96, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "43e96189", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 65.15, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 65.15, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 58.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + 
"concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 65.15, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "7bef8eef", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 70.21, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 71.21, + "is_valid": true + }, + "interactive": { + "throughput": 70.21, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 71.21, + "is_valid": true + }, + "sustained": { + "throughput": 67.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 70.21, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 71.21 + } + }, + { + "id": "a4e6a6e4", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": 
"vLLM", + "framework_version": "0.20.1+transformers-5.8.1", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_vllm020_0f6c56e4/runner.py", + "runner_id": "nvidia_vllm020_0f6c56e4", + "scenarios": { + "offline": { + "throughput": 22884.92, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 22884.92, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 11576.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 22884.92, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 64, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "52ad2fe3", + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 11011.89, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 69.0, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 11011.89, + 
"metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 69.0, + "is_valid": true + }, + "sustained": { + "throughput": 2386.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 11011.89, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 69.0 + } + }, + { + "id": "298e6500", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2723.49, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 69.33, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 2723.49, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 69.33, + "is_valid": true + }, + "sustained": { + "throughput": 546.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 1050.9, + "metric_label": "tok/s (speculative)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + 
"primary_throughput": 2723.49, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 69.33 + } + }, + { + "id": "944773aa_BF16", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2710.52, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 534.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2710.52, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "944773aa_FP8", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": 
"nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2859.89, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 760.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2859.89, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "944773aa_W8A8", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3570.3, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 715.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3570.3, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { 
+ "id": "944773aa_W8A16", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2695.72, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 745.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2695.72, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "944773aa_W4A16", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1770.93, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + 
}, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 829.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1770.93, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "4d0e7990", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 70.34, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 71.21, + "is_valid": true + }, + "interactive": { + "throughput": 70.34, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 71.21, + "is_valid": true + }, + "sustained": { + "throughput": 67.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 70.34, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 71.21 + } + }, + { + "id": "54d0e7aa", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": 
"NVIDIA", + "chip_count": 1, + "memory_gb": 80.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 11972.12, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": 69.0, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 11972.12, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": 69.0, + "is_valid": true + }, + "sustained": { + "throughput": 2804.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 11972.12, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 64, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 69.0 + } + }, + { + "id": "de0853fa", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 80.0, + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1803.59, + "metric_label": "tokens/sec", + "concurrency": 8, + 
"peak_memory_gb": 67.58, + "is_valid": true + }, + "online": { + "throughput": 25, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 184.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1803.59, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 67.58, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1803.59, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 67.58 + } + }, + { + "id": "74d08a7a", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 80.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 3341.37, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "d31ba78b", + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 80.0, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + 
"precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3428.42, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 69.21, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 3428.42, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 69.21, + "is_valid": true + }, + "sustained": { + "throughput": 569.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 3428.42, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 69.21 + } + }, + { + "id": "e95e2caa", + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1001.2, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 19.33, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + 
"interactive": { + "throughput": 1001.2, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 19.33, + "is_valid": true + }, + "sustained": { + "throughput": 309.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1001.2, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 19.33 + } + }, + { + "id": "4955fbb1_BF16", + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 898.24, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 306.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 898.24, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "4955fbb1_FP8", + "chip": "NVIDIA GeForce 
RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1404.18, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 472.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1404.18, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "4955fbb1_W8A8", + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2065.49, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": 
"max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 475.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2065.49, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "4955fbb1_W8A16", + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1374.59, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 475.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1374.59, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "4955fbb1_W4A16", + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": 
"meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1054.17, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 588.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1054.17, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "faf550ec", + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 8961.38, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 19.03, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 8961.38, + "metric_label": "tokens/sec", 
+ "concurrency": 16, + "peak_memory_gb": 19.03, + "is_valid": true + }, + "sustained": { + "throughput": 2693.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 8961.38, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 19.03 + } + }, + { + "id": "81ca6d0e", + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "chip_count": 4, + "memory_gb": 24.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 1065.49, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "d6543f77", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1461.83, + "metric_label": "tokens/sec", + "concurrency": 128, + 
"peak_memory_gb": 19.17, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1461.83, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 19.17, + "is_valid": true + }, + "sustained": { + "throughput": 400.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1461.83, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 19.17 + } + }, + { + "id": "b59b0798_BF16", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1292.71, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 393.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1292.71, + 
"primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b59b0798_FP8", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2933.85, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 640.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2933.85, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b59b0798_W8A8", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + 
"scenarios": { + "offline": { + "throughput": 2678.33, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 567.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2678.33, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b59b0798_W8A16", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2226.02, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 628.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2226.02, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": 
"b59b0798_W4A16", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1343.61, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 854.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1343.61, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "06662a14", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 14273.14, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 18.86, + "is_valid": true + }, + "online": { + 
"throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 14273.14, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 18.86, + "is_valid": true + }, + "sustained": { + "throughput": 5995.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 14273.14, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 18.86 + } + }, + { + "id": "bba67533", + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "chip_count": 4, + "memory_gb": 24.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 1633.89, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "675e325e", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + 
"reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1452.3, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 19.21, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1452.3, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 19.21, + "is_valid": true + }, + "sustained": { + "throughput": 339.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1452.3, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 19.21 + } + }, + { + "id": "6d7e1d48_BF16", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1299.02, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 334.4, 
+ "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1299.02, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6d7e1d48_FP8", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2888.23, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 472.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2888.23, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6d7e1d48_W8A8", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": 
"0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2622.06, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 438.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2622.06, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6d7e1d48_W8A16", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2171.95, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 506.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 
2171.95, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6d7e1d48_W4A16", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1345.66, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 606.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1345.66, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b228454f", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 24.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": 
"nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 11440.55, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 18.89, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 11440.55, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 18.89, + "is_valid": true + }, + "sustained": { + "throughput": 1698.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 11440.55, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 18.89 + } + }, + { + "id": "cfd0bdc8", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 24.0, + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-21", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 442.95, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 18.89, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 104.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 442.95, + "metric_label": "tokens/sec", + "concurrency": 32, + 
"peak_memory_gb": 18.89, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 442.95, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 18.89 + } + }, + { + "id": "54dccbd0", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 24.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-21", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 1609.89, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "a4179ecc", + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 24.0, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-21", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1361.9, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 20.58, + 
"is_valid": true + }, + "online": { + "throughput": 2, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1361.9, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 20.58, + "is_valid": true + }, + "sustained": { + "throughput": 325.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1361.9, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 20.58 + } + }, + { + "id": "b8f8ed0f", + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 31.8, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.19.1rc1.dev339+gedc364896", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3487.52, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 3487.52, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 707.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": 
null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 3487.52, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "d1baa050_BF16", + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 31.8, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.19.1rc1.dev339+gedc364896", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3519.52, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 676.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3519.52, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "d1baa050_W8A16", + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 31.8, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.19.1rc1.dev339+gedc364896", + "tier": "verified", + "submitted_by": 
"JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5303.83, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 1148.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 5303.83, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "d1baa050_W4A16", + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 31.8, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.19.1rc1.dev339+gedc364896", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2934.27, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 1381.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2934.27, + 
"primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "e87e6c36", + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 31.8, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.19.1rc1.dev339+gedc364896", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 54.26, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 54.26, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 51.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 54.26, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "776d2702", + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 31.8, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.19.1rc1.dev339+gedc364896", + "tier": "verified", + "submitted_by": 
"JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 15323.19, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 15323.19, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 3941.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 15323.19, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "831c95a7", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5128.31, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 69.2, + "is_valid": true + }, + "online": { + "throughput": 25, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 5128.31, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 69.2, + "is_valid": true + }, + 
"sustained": { + "throughput": 907.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 1733.92, + "metric_label": "tok/s (speculative)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 5128.31, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 69.2 + } + }, + { + "id": "a4a8716a_BF16", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5052.11, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 5052.11, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "a4a8716a_FP8", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + 
"framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 6314.92, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 6314.92, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "a4a8716a_W8A8", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 6203.69, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 6203.69, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "a4a8716a_W8A16", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + 
"submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5369.94, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 5369.94, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "a4a8716a_W4A16", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3156.46, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3156.46, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "02748da4", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", 
+ "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 160.38, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 71.13, + "is_valid": true + }, + "interactive": { + "throughput": 160.38, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 71.13, + "is_valid": true + }, + "sustained": { + "throughput": 142.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 160.38, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 71.13 + } + }, + { + "id": "2c0b7beb", + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 79.6, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 18910.96, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 68.91, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 18910.96, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 68.91, + "is_valid": true + }, + "sustained": { + "throughput": 6144.7, + "metric_label": "tok/s 
(sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 18910.96, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 68.91 + } + }, + { + "id": "29b2ec38", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5730.96, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 123.76, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 5730.96, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 123.76, + "is_valid": true + }, + "sustained": { + "throughput": 709.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 1965.64, + "metric_label": "tok/s (speculative)", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 5730.96, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 
123.76 + } + }, + { + "id": "f07c60f8_BF16", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5333.35, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 709.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 5333.35, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "f07c60f8_FP8", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 6195.58, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + 
"online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 713.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 6195.58, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "f07c60f8_W8A8", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 6146.56, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 694.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 6146.56, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "f07c60f8_W8A16", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": 
"meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5093.12, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 708.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 5093.12, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "f07c60f8_W4A16", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3278.96, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 649.9, + "metric_label": "tok/s 
(sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3278.96, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "62a36028", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 171.93, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 125.69, + "is_valid": true + }, + "interactive": { + "throughput": 171.93, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 125.69, + "is_valid": true + }, + "sustained": { + "throughput": 132.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 170.37, + "metric_label": "tok/s (speculative)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 171.93, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 125.69 + } + }, + { + "id": "53471efa", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": 
"suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-09", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 12862.12, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 123.42, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 12862.12, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 123.42, + "is_valid": true + }, + "sustained": { + "throughput": 1425.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 12862.12, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 123.42 + } + }, + { + "id": "b727568e", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 140.4, + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-09", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3553.46, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 116.69, + "is_valid": true + }, + "online": { + 
"throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 241.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 3553.46, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 116.69, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 3553.46, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 116.69 + } + }, + { + "id": "f005e907", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 140.4, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-09", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 6560.55, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "7f7a270e", + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 140.4, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 7.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + 
"framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-14", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5334.28, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 118.32, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 5334.28, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 118.32, + "is_valid": true + }, + "sustained": { + "throughput": 591.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 5334.28, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 118.32 + } + }, + { + "id": "3f6269bb", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2297.65, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 123.81, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 2297.65, + "metric_label": "tokens/sec", + "concurrency": 
8, + "peak_memory_gb": 123.81, + "is_valid": true + }, + "sustained": { + "throughput": 486.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 783.62, + "metric_label": "tok/s (speculative)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 2297.65, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 123.81 + } + }, + { + "id": "1bcdc710_BF16", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2214.68, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 484.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2214.68, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + 
}, + { + "id": "1bcdc710_FP8", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2945.86, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 494.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2945.86, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "1bcdc710_W8A8", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3120.93, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { 
+ "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 533.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3120.93, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "1bcdc710_W8A16", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2257.42, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 645.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2257.42, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "1bcdc710_W4A16", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": 
"meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1572.76, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 648.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1572.76, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "60c91bf0", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 44.09, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 125.72, + "is_valid": true + }, + "interactive": { + "throughput": 44.09, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 125.72, + "is_valid": true + }, + "sustained": { + "throughput": 41.4, + "metric_label": "tok/s (sustained 
mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 44.09, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 125.72 + } + }, + { + "id": "1e7ed8ca", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 11716.68, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 123.52, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 11716.68, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 123.52, + "is_valid": true + }, + "sustained": { + "throughput": 1771.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 11716.68, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 123.52 + } + }, + { + "id": "76ce4cd0", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 140.4, + "suite": "suite_B", + "model": 
"Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1694.75, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 116.75, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 176.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1694.75, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 116.75, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1694.75, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 116.75 + } + }, + { + "id": "f0d031f5", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 140.4, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 
null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 2485.62, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "7bd76bb5", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 140.4, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 3757.41, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 118.44, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 3757.41, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 118.44, + "is_valid": true + }, + "sustained": { + "throughput": 561.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 3757.41, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 118.44 + } + }, + { + "id": "b991b4c1", + "chip": "NVIDIA L4", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 22.5, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": 
"0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 366.08, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 17.85, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 366.08, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 17.85, + "is_valid": true + }, + "sustained": { + "throughput": 116.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 261.34, + "metric_label": "tok/s (speculative)", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 366.08, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 17.85 + } + }, + { + "id": "d58fa923", + "chip": "NVIDIA L4", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 22.5, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 7188.02, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 17.55, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + 
"is_valid": true + }, + "interactive": { + "throughput": 7188.02, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 17.55, + "is_valid": true + }, + "sustained": { + "throughput": 2837.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 7188.02, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 17.55 + } + }, + { + "id": "125c6b61", + "chip": "NVIDIA RTX 4000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 20.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-21", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 9380.99, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 15.33, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 9380.99, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 15.33, + "is_valid": true + }, + "sustained": { + "throughput": 3880.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 9380.99, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 15.33 + } + }, + { + "id": "bd3b5d27", + "chip": "NVIDIA RTX 6000 Ada 
Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1859.98, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 40.64, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1859.98, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 40.64, + "is_valid": true + }, + "sustained": { + "throughput": 376.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 854.15, + "metric_label": "tok/s (speculative)", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1859.98, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 40.64 + } + }, + { + "id": "e60276e9_BF16", + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": 
"vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1845.04, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 371.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1845.04, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "e60276e9_FP8", + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2667.64, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 586.1, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": 
"quantization", + "primary_throughput": 2667.64, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "e60276e9_W8A8", + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2543.08, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 534.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2543.08, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "e60276e9_W8A16", + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": 
"runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 2093.62, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 584.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 2093.62, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "e60276e9_W4A16", + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1271.3, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 816.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1271.3, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + 
"batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "42ab3af7", + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 33.21, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 42.53, + "is_valid": true + }, + "interactive": { + "throughput": 33.21, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 42.53, + "is_valid": true + }, + "sustained": { + "throughput": 32.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 33.21, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 42.53 + } + }, + { + "id": "2b905f5e", + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-19", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + 
"runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 8248.35, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 40.31, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 8248.35, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 40.31, + "is_valid": true + }, + "sustained": { + "throughput": 2895.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 8248.35, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 40.31 + } + }, + { + "id": "7cd0b745", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1567.35, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 40.71, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1567.35, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": 40.71, + "is_valid": true + }, + "sustained": { + "throughput": 265.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": 
null, + "is_valid": true + }, + "speculative": { + "throughput": 657.85, + "metric_label": "tok/s (speculative)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1567.35, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 40.71 + } + }, + { + "id": "b87c1621_BF16", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1542.59, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 272.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1542.59, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b87c1621_FP8", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": 
"Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1469.08, + "metric_label": "tokens/sec (FP8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 435.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1469.08, + "primary_metric_label": "tokens/sec (FP8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b87c1621_W8A8", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A8", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1971.81, + "metric_label": "tokens/sec (W8A8)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + 
"throughput": 419.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1971.81, + "primary_metric_label": "tokens/sec (W8A8)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b87c1621_W8A16", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1469.7, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 433.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1469.7, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b87c1621_W4A16", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + 
"framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 940.08, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 541.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 940.08, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "f2197473", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 31.11, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 42.6, + "is_valid": true + }, + "interactive": { + "throughput": 31.11, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 42.6, + "is_valid": true + }, + "sustained": { + "throughput": 30.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + 
"concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 31.11, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 42.6 + } + }, + { + "id": "a33d6eb3", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 10433.7, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 40.41, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 10433.7, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": 40.41, + "is_valid": true + }, + "sustained": { + "throughput": 1917.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 10433.7, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 40.41 + } + }, + { + "id": "0981ecf7", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 48.0, + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + 
"framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 798.25, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 40.12, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 105.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 798.25, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": 40.12, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 798.25, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 40.12 + } + }, + { + "id": "334507e5", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 48.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 1929.75, + 
"primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "a8cf2a0f", + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 48.0, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-17", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1681.33, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": 41.83, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1681.33, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": 41.83, + "is_valid": true + }, + "sustained": { + "throughput": 343.0, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1681.33, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 64, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 41.83 + } + }, + { + "id": "4660bc0b", + "chip": "Tesla T4", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 15.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": 
"nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 5125.58, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 10.83, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 5125.58, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 10.83, + "is_valid": true + }, + "sustained": { + "throughput": 2006.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 5125.58, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 10.83 + } + }, + { + "id": "48261ecc", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 933.93, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 26.54, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 933.93, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 26.54, + "is_valid": true + }, + "sustained": { + "throughput": 268.3, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + 
"is_valid": true + }, + "speculative": { + "throughput": 431.31, + "metric_label": "tok/s (speculative)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 933.93, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 26.54 + } + }, + { + "id": "b957e789_FP16", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 936.85, + "metric_label": "tokens/sec (FP16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 265.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 936.85, + "primary_metric_label": "tokens/sec (FP16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "b957e789_W4A16", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_C", + "model": 
"Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "W4A16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 783.65, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 416.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 783.65, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "6eb549a8", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 15.01, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 28.46, + "is_valid": true + }, + "interactive": { + "throughput": 15.01, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": 28.46, + "is_valid": true + }, + "sustained": { + "throughput": 
14.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 15.01, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 28.46 + } + }, + { + "id": "04fce6f6", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-15", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 7870.64, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 26.25, + "is_valid": true + }, + "online": { + "throughput": 10, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 7870.64, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 26.25, + "is_valid": true + }, + "sustained": { + "throughput": 2789.7, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 7870.64, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": 26.25 + } + }, + { + "id": "48f19c22", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 32.0, + 
"suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_params_b": 70.0, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 466.49, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 26.16, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 92.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": 4, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 466.49, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": 26.16, + "is_valid": true + }, + "burst": { + "throughput": 0.0, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 466.49, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 26.16 + } + }, + { + "id": "865d778c", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 32.0, + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-16", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + 
"offline": { + "throughput": null, + "metric_label": "", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": false + } + }, + "primary_scenario": "scaling", + "primary_throughput": 1172.42, + "primary_metric_label": "tokens/sec (1x baseline)", + "config": {} + }, + { + "id": "2ef567be", + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 8, + "memory_gb": 32.0, + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "model_full": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_params_b": 46.7, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "vLLM", + "framework_version": "0.7.3", + "tier": "verified", + "submitted_by": "JuhaoLiang1997", + "date": "2026-04-18", + "reproduce_script": "runners/nvidia_vllm_47f5d58e/runner.py", + "runner_id": "nvidia_vllm_47f5d58e", + "scenarios": { + "offline": { + "throughput": 1026.12, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 27.83, + "is_valid": true + }, + "online": { + "throughput": 2, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 1026.12, + "metric_label": "tokens/sec", + "concurrency": 4, + "peak_memory_gb": 27.83, + "is_valid": true + }, + "sustained": { + "throughput": 293.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 1026.12, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 4, + "batch_size": null, + "tensor_parallel": 8, + "peak_memory_gb": 27.83 + } + }, + { + "id": "cabb7bd0", + "chip": "MTT S4000", + "chip_vendor": "Moore Threads", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "float16", + "framework": 
"vllm-musa", + "framework_version": "0.4.2", + "tier": "community", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "runner_id": "moorethreads_vllm_musa_f2f6f965", + "scenarios": { + "offline": { + "throughput": 332.62, + "metric_label": "tokens/sec", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 5, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 332.62, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 8, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "4f66d29d", + "chip": "MTT S4000", + "chip_vendor": "Moore Threads", + "chip_count": 1, + "memory_gb": 48.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "float16", + "framework": "vllm-musa", + "framework_version": "0.4.2", + "tier": "community", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py", + "runner_id": "moorethreads_vllm_musa_f2f6f965", + "scenarios": { + "offline": { + "throughput": 2004.02, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 2004.02, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 2004.02, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 64, + "batch_size": null, + "tensor_parallel": 1, + 
"peak_memory_gb": null + } + }, + { + "id": "958afbbd", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "SGLang", + "framework_version": "0.5.6", + "tier": "community", + "submitted_by": "Gong-K", + "date": "2026-05-06", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "runner_id": "nvidia_sglang_c43a8309", + "scenarios": { + "offline": { + "throughput": 3146.66, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 100, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 3146.66, + "metric_label": "tokens/sec", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 562.5, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 705.16, + "metric_label": "tok/s (speculative)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.975, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 3146.66, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 32, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "651fefa6_BF16", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8, + 
"precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "SGLang", + "framework_version": "0.5.6", + "tier": "community", + "submitted_by": "Gong-K", + "date": "2026-04-30", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "runner_id": "nvidia_sglang_c43a8309", + "scenarios": { + "offline": { + "throughput": 3160.74, + "metric_label": "tokens/sec (BF16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 50, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 558.6, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3160.74, + "primary_metric_label": "tokens/sec (BF16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "651fefa6_W8A16", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8, + "precision": "W8A16", + "effective_dtype": "bfloat16", + "framework": "SGLang", + "framework_version": "0.5.6", + "tier": "community", + "submitted_by": "Gong-K", + "date": "2026-04-30", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "runner_id": "nvidia_sglang_c43a8309", + "scenarios": { + "offline": { + "throughput": 3396.91, + "metric_label": "tokens/sec (W8A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 50, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 841.8, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": 
null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 3396.91, + "primary_metric_label": "tokens/sec (W8A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "651fefa6_W4A16", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40, + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8, + "precision": "W4A16", + "effective_dtype": "bfloat16", + "framework": "SGLang", + "framework_version": "0.5.6", + "tier": "community", + "submitted_by": "Gong-K", + "date": "2026-04-30", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "runner_id": "nvidia_sglang_c43a8309", + "scenarios": { + "offline": { + "throughput": 1817.91, + "metric_label": "tokens/sec (W4A16)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 50, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 760.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "quantization", + "primary_throughput": 1817.91, + "primary_metric_label": "tokens/sec (W4A16)", + "config": { + "concurrency": null, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "99c43b97", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40, + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "model_full": "meta-llama/Llama-3.1-8B-Instruct", + "model_params_b": 8, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "SGLang", + "framework_version": "0.5.6", + "tier": "community", + "submitted_by": "Gong-K", + "date": "2026-05-07", + 
"reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "runner_id": "nvidia_sglang_c43a8309", + "scenarios": { + "offline": { + "throughput": 59.89, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 59.89, + "metric_label": "tokens/sec", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 54.9, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 36.86, + "metric_label": "tok/s (speculative)", + "concurrency": 1, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 59.89, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 1, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "435424a8", + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 40, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "SGLang", + "framework_version": "0.5.6", + "tier": "community", + "submitted_by": "Gong-K", + "date": "2026-05-07", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "runner_id": "nvidia_sglang_c43a8309", + "scenarios": { + "offline": { + "throughput": 11509.2, + "metric_label": "tokens/sec", + "concurrency": 64, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 40, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 11509.2, + "metric_label": 
"tokens/sec", + "concurrency": 64, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 7095.4, + "metric_label": "tok/s (sustained mean)", + "concurrency": 32, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 11509.2, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 64, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "83e3ec26", + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 140.4, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "BF16", + "effective_dtype": "bfloat16", + "framework": "SGLang", + "framework_version": "0.5.6", + "tier": "community", + "submitted_by": "Gong-K", + "date": "2026-06-25", + "reproduce_script": "runners/nvidia_sglang_c43a8309/runner.py", + "runner_id": "nvidia_sglang_c43a8309", + "scenarios": { + "offline": { + "throughput": 4342.21, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 100, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 4342.21, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "sustained": { + "throughput": 1272.2, + "metric_label": "tok/s (sustained mean)", + "concurrency": 8, + "peak_memory_gb": null, + "is_valid": true + }, + "speculative": { + "throughput": 613.8, + "metric_label": "tok/s (speculative)", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "burst": { + "throughput": 0.835, + "metric_label": "1 − degradation_ratio", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 
4342.21, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "4e0e6eba", + "chip": "Tesla V100-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "model_full": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_params_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "tier": "community", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "runner_id": "nvidia_onecat_vllm_12a253c2", + "scenarios": { + "offline": { + "throughput": 671.43, + "metric_label": "tokens/sec", + "concurrency": 128, + "peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 671.43, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 128, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + }, + { + "id": "419b138c", + "chip": "Tesla V100-PCIE-32GB", + "chip_vendor": "NVIDIA", + "chip_count": 1, + "memory_gb": 32.0, + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "model_full": "Qwen/Qwen2.5-0.5B-Instruct", + "model_params_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "tier": "community", + "submitted_by": "JuhaoLiang1997", + "date": "2026-05-18", + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "runner_id": "nvidia_onecat_vllm_12a253c2", + "scenarios": { + "offline": { + "throughput": 6292.79, + "metric_label": "tokens/sec", + "concurrency": 16, + 
"peak_memory_gb": null, + "is_valid": true + }, + "online": { + "throughput": 0.0, + "metric_label": "max valid QPS", + "concurrency": null, + "peak_memory_gb": null, + "is_valid": true + }, + "interactive": { + "throughput": 6292.79, + "metric_label": "tokens/sec", + "concurrency": 16, + "peak_memory_gb": null, + "is_valid": true + } + }, + "primary_scenario": "offline", + "primary_throughput": 6292.79, + "primary_metric_label": "tokens/sec", + "config": { + "concurrency": 16, + "batch_size": null, + "tensor_parallel": 1, + "peak_memory_gb": null + } + } +]; +window.DISTRIBUTION_SUBMISSIONS = DISTRIBUTION_SUBMISSIONS; + +const DISTRIBUTION_GROUPS = [ + { + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 2, + "best_throughput": 22884.92, + "median_throughput": 22884.92, + "min_throughput": 11011.89, + "max_throughput": 22884.92, + "stddev_throughput": 8395.5, + "scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 22884.92, + "best_framework": "vLLM" + }, + "online": { + "count": 2, + "best_throughput": 40, + "best_framework": "vLLM" + }, + "interactive": { + "count": 2, + "best_throughput": 22884.92, + "best_framework": "vLLM" + }, + "sustained": { + "count": 2, + "best_throughput": 11576.2, + "best_framework": "vLLM" + } + }, + "best_submission_id": "a4e6a6e4", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 18910.96, + "median_throughput": 18910.96, + "min_throughput": 18910.96, + "max_throughput": 18910.96, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 18910.96, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 40, + "best_framework": "vLLM" + }, + "interactive": { + "count": 
1, + "best_throughput": 18910.96, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 6144.7, + "best_framework": "vLLM" + } + }, + "best_submission_id": "2c0b7beb", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 15323.19, + "median_throughput": 15323.19, + "min_throughput": 15323.19, + "max_throughput": 15323.19, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 15323.19, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 40, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 15323.19, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 3941.2, + "best_framework": "vLLM" + } + }, + "best_submission_id": "776d2702", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 14273.14, + "median_throughput": 14273.14, + "min_throughput": 14273.14, + "max_throughput": 14273.14, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 14273.14, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 40, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 14273.14, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 5995.2, + "best_framework": "vLLM" + } + }, + "best_submission_id": "06662a14", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", 
+ "submission_count": 1, + "best_throughput": 12862.12, + "median_throughput": 12862.12, + "min_throughput": 12862.12, + "max_throughput": 12862.12, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 12862.12, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 12862.12, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 1425.4, + "best_framework": "vLLM" + } + }, + "best_submission_id": "53471efa", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 11972.12, + "median_throughput": 11972.12, + "min_throughput": 11972.12, + "max_throughput": 11972.12, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 11972.12, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 11972.12, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 2804.8, + "best_framework": "vLLM" + } + }, + "best_submission_id": "54d0e7aa", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 11716.68, + "median_throughput": 11716.68, + "min_throughput": 11716.68, + "max_throughput": 11716.68, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 11716.68, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + 
"count": 1, + "best_throughput": 11716.68, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 1771.6, + "best_framework": "vLLM" + } + }, + "best_submission_id": "1e7ed8ca", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 2, + "best_throughput": 11509.2, + "median_throughput": 11509.2, + "min_throughput": 10805.85, + "max_throughput": 11509.2, + "stddev_throughput": 497.34, + "scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 11509.2, + "best_framework": "SGLang" + }, + "online": { + "count": 2, + "best_throughput": 40, + "best_framework": "vLLM" + }, + "interactive": { + "count": 2, + "best_throughput": 11509.2, + "best_framework": "SGLang" + }, + "sustained": { + "count": 2, + "best_throughput": 7095.4, + "best_framework": "SGLang" + } + }, + "best_submission_id": "435424a8", + "best_framework": "SGLang", + "best_submitted_by": "Gong-K" + }, + { + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 11440.55, + "median_throughput": 11440.55, + "min_throughput": 11440.55, + "max_throughput": 11440.55, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 11440.55, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 11440.55, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 1698.1, + "best_framework": "vLLM" + } + }, + "best_submission_id": "b228454f", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": 
"Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 10433.7, + "median_throughput": 10433.7, + "min_throughput": 10433.7, + "max_throughput": 10433.7, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 10433.7, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 10433.7, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 1917.3, + "best_framework": "vLLM" + } + }, + "best_submission_id": "a33d6eb3", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX 4000 Ada Generation", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 9380.99, + "median_throughput": 9380.99, + "min_throughput": 9380.99, + "max_throughput": 9380.99, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 9380.99, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 40, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 9380.99, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 3880.8, + "best_framework": "vLLM" + } + }, + "best_submission_id": "125c6b61", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 8961.38, + "median_throughput": 8961.38, + "min_throughput": 8961.38, + "max_throughput": 8961.38, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 8961.38, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": 
"vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 8961.38, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 2693.3, + "best_framework": "vLLM" + } + }, + "best_submission_id": "faf550ec", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 8248.35, + "median_throughput": 8248.35, + "min_throughput": 8248.35, + "max_throughput": 8248.35, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 8248.35, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 8248.35, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 2895.0, + "best_framework": "vLLM" + } + }, + "best_submission_id": "2b905f5e", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 7870.64, + "median_throughput": 7870.64, + "min_throughput": 7870.64, + "max_throughput": 7870.64, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 7870.64, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 7870.64, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 2789.7, + "best_framework": "vLLM" + } + }, + "best_submission_id": "04fce6f6", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA L4", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + 
"model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 7188.02, + "median_throughput": 7188.02, + "min_throughput": 7188.02, + "max_throughput": 7188.02, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 7188.02, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 40, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 7188.02, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 2837.3, + "best_framework": "vLLM" + } + }, + "best_submission_id": "d58fa923", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 6560.55, + "median_throughput": 6560.55, + "min_throughput": 6560.55, + "max_throughput": 6560.55, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "f005e907", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 6292.79, + "median_throughput": 6292.79, + "min_throughput": 6292.79, + "max_throughput": 6292.79, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 6292.79, + "best_framework": "1Cat-vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "interactive": { + "count": 1, + "best_throughput": 6292.79, + "best_framework": "1Cat-vLLM" + } + }, + "best_submission_id": "419b138c", + "best_framework": "1Cat-vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + 
"suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 5730.96, + "median_throughput": 5730.96, + "min_throughput": 5730.96, + "max_throughput": 5730.96, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 5730.96, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 5730.96, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 709.2, + "best_framework": "vLLM" + }, + "speculative": { + "count": 1, + "best_throughput": 1965.64, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "29b2ec38", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 6314.92, + "median_throughput": 5369.94, + "min_throughput": 3156.46, + "max_throughput": 6314.92, + "stddev_throughput": 1272.34, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 6314.92, + "best_framework": "vLLM" + } + }, + "best_submission_id": "a4a8716a_FP8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 5334.28, + "median_throughput": 5334.28, + "min_throughput": 5334.28, + "max_throughput": 5334.28, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 5334.28, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 5334.28, + 
"best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 591.5, + "best_framework": "vLLM" + } + }, + "best_submission_id": "7f7a270e", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 6195.58, + "median_throughput": 5333.35, + "min_throughput": 3278.96, + "max_throughput": 6195.58, + "stddev_throughput": 1183.88, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 6195.58, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 713.4, + "best_framework": "vLLM" + } + }, + "best_submission_id": "f07c60f8_FP8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 5128.31, + "median_throughput": 5128.31, + "min_throughput": 5128.31, + "max_throughput": 5128.31, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 5128.31, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 25, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 5128.31, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 907.1, + "best_framework": "vLLM" + }, + "speculative": { + "count": 1, + "best_throughput": 1733.92, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "831c95a7", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla T4", + "chip_vendor": "NVIDIA", + "suite": "suite_F", + "model": 
"Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 5125.58, + "median_throughput": 5125.58, + "min_throughput": 5125.58, + "max_throughput": 5125.58, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 5125.58, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 5125.58, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 2006.9, + "best_framework": "vLLM" + } + }, + "best_submission_id": "4660bc0b", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 2, + "best_throughput": 4342.21, + "median_throughput": 4342.21, + "min_throughput": 2297.65, + "max_throughput": 4342.21, + "stddev_throughput": 1445.72, + "scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 4342.21, + "best_framework": "SGLang" + }, + "online": { + "count": 2, + "best_throughput": 100, + "best_framework": "SGLang" + }, + "interactive": { + "count": 2, + "best_throughput": 4342.21, + "best_framework": "SGLang" + }, + "sustained": { + "count": 2, + "best_throughput": 1272.2, + "best_framework": "SGLang" + }, + "speculative": { + "count": 2, + "best_throughput": 783.62, + "best_framework": "vLLM" + }, + "burst": { + "count": 2, + "best_throughput": 0.835, + "best_framework": "SGLang" + } + }, + "best_submission_id": "83e3ec26", + "best_framework": "SGLang", + "best_submitted_by": "Gong-K" + }, + { + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 2, + "best_throughput": 3916.69, + "median_throughput": 3916.69, + "min_throughput": 2700.61, + "max_throughput": 3916.69, + "stddev_throughput": 859.9, + 
"scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 3916.69, + "best_framework": "vLLM" + }, + "online": { + "count": 2, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 2, + "best_throughput": 3916.69, + "best_framework": "vLLM" + }, + "sustained": { + "count": 2, + "best_throughput": 712.3, + "best_framework": "vLLM" + }, + "burst": { + "count": 2, + "best_throughput": null, + "best_framework": "" + }, + "speculative": { + "count": 1, + "best_throughput": 999.61, + "best_framework": "vLLM" + } + }, + "best_submission_id": "8f83bfab", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 3757.41, + "median_throughput": 3757.41, + "min_throughput": 3757.41, + "max_throughput": 3757.41, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 3757.41, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 3757.41, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 561.2, + "best_framework": "vLLM" + } + }, + "best_submission_id": "7bd76bb5", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 3553.46, + "median_throughput": 3553.46, + "min_throughput": 3553.46, + "max_throughput": 3553.46, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 3553.46, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + 
"best_throughput": 241.2, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 3553.46, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "b727568e", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 3, + "best_throughput": 5303.83, + "median_throughput": 3519.52, + "min_throughput": 2934.27, + "max_throughput": 5303.83, + "stddev_throughput": 1234.31, + "scenario_summary": { + "offline": { + "count": 3, + "best_throughput": 5303.83, + "best_framework": "vLLM" + }, + "online": { + "count": 3, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 3, + "best_throughput": 1381.4, + "best_framework": "vLLM" + } + }, + "best_submission_id": "d1baa050_W8A16", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 3487.52, + "median_throughput": 3487.52, + "min_throughput": 3487.52, + "max_throughput": 3487.52, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 3487.52, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "interactive": { + "count": 1, + "best_throughput": 3487.52, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 707.5, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "b8f8ed0f", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": 
"NVIDIA", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 3428.42, + "median_throughput": 3428.42, + "min_throughput": 3428.42, + "max_throughput": 3428.42, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 3428.42, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 3428.42, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 569.1, + "best_framework": "vLLM" + } + }, + "best_submission_id": "d31ba78b", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 3341.37, + "median_throughput": 3341.37, + "min_throughput": 3341.37, + "max_throughput": 3341.37, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "74d08a7a", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 3217.83, + "median_throughput": 3217.83, + "min_throughput": 3217.83, + "max_throughput": 3217.83, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 3217.83, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 3217.83, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 472.7, + "best_framework": "vLLM" + } + }, + "best_submission_id": "08de2dc2", + 
"best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 10, + "best_throughput": 4141.71, + "median_throughput": 3208.11, + "min_throughput": 1761.96, + "max_throughput": 4141.71, + "stddev_throughput": 797.93, + "scenario_summary": { + "offline": { + "count": 10, + "best_throughput": 4141.71, + "best_framework": "vLLM" + }, + "online": { + "count": 10, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 10, + "best_throughput": 757.0, + "best_framework": "vLLM" + } + }, + "best_submission_id": "ffd81462_FP8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 2, + "best_throughput": 3146.66, + "median_throughput": 3146.66, + "min_throughput": 2415.99, + "max_throughput": 3146.66, + "stddev_throughput": 516.66, + "scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 3146.66, + "best_framework": "SGLang" + }, + "online": { + "count": 2, + "best_throughput": 100, + "best_framework": "SGLang" + }, + "interactive": { + "count": 2, + "best_throughput": 3146.66, + "best_framework": "SGLang" + }, + "sustained": { + "count": 2, + "best_throughput": 562.5, + "best_framework": "SGLang" + }, + "speculative": { + "count": 2, + "best_throughput": 947.4, + "best_framework": "vLLM" + }, + "burst": { + "count": 2, + "best_throughput": 0.975, + "best_framework": "SGLang" + } + }, + "best_submission_id": "958afbbd", + "best_framework": "SGLang", + "best_submitted_by": "Gong-K" + }, + { + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 3000.59, + "median_throughput": 3000.59, + 
"min_throughput": 3000.59, + "max_throughput": 3000.59, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "e76a4402", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 2723.49, + "median_throughput": 2723.49, + "min_throughput": 2723.49, + "max_throughput": 2723.49, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 2723.49, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 2723.49, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 546.4, + "best_framework": "vLLM" + }, + "speculative": { + "count": 1, + "best_throughput": 1050.9, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "298e6500", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 3570.3, + "median_throughput": 2710.52, + "min_throughput": 1770.93, + "max_throughput": 3570.3, + "stddev_throughput": 641.09, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 3570.3, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 829.8, + "best_framework": "vLLM" + } + }, + "best_submission_id": "944773aa_W8A8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + 
"chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 8, + "best_throughput": 3396.91, + "median_throughput": 2574.53, + "min_throughput": 1563.39, + "max_throughput": 3396.91, + "stddev_throughput": 661.15, + "scenario_summary": { + "offline": { + "count": 8, + "best_throughput": 3396.91, + "best_framework": "SGLang" + }, + "online": { + "count": 8, + "best_throughput": 50, + "best_framework": "SGLang" + }, + "sustained": { + "count": 8, + "best_throughput": 841.8, + "best_framework": "SGLang" + } + }, + "best_submission_id": "651fefa6_W8A16", + "best_framework": "SGLang", + "best_submitted_by": "Gong-K" + }, + { + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 2485.62, + "median_throughput": 2485.62, + "min_throughput": 2485.62, + "max_throughput": 2485.62, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "f0d031f5", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 3120.93, + "median_throughput": 2257.42, + "min_throughput": 1572.76, + "max_throughput": 3120.93, + "stddev_throughput": 623.34, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 3120.93, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 648.8, + "best_framework": "vLLM" + } + }, + "best_submission_id": "1bcdc710_W8A8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "suite": 
"suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 2933.85, + "median_throughput": 2226.02, + "min_throughput": 1292.71, + "max_throughput": 2933.85, + "stddev_throughput": 753.22, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 2933.85, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 854.9, + "best_framework": "vLLM" + } + }, + "best_submission_id": "b59b0798_FP8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 2888.23, + "median_throughput": 2171.95, + "min_throughput": 1299.02, + "max_throughput": 2888.23, + "stddev_throughput": 725.2, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 2888.23, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 606.2, + "best_framework": "vLLM" + } + }, + "best_submission_id": "6d7e1d48_FP8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 2667.64, + "median_throughput": 2093.62, + "min_throughput": 1271.3, + "max_throughput": 2667.64, + "stddev_throughput": 563.26, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 2667.64, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 816.3, + "best_framework": "vLLM" + } + }, + "best_submission_id": "e60276e9_FP8", + 
"best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1929.75, + "median_throughput": 1929.75, + "min_throughput": 1929.75, + "max_throughput": 1929.75, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "334507e5", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1859.98, + "median_throughput": 1859.98, + "min_throughput": 1859.98, + "max_throughput": 1859.98, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1859.98, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1859.98, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 376.2, + "best_framework": "vLLM" + }, + "speculative": { + "count": 1, + "best_throughput": 854.15, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "bd3b5d27", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 1803.59, + "median_throughput": 1803.59, + "min_throughput": 1803.59, + "max_throughput": 1803.59, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1803.59, + "best_framework": "vLLM" + }, + 
"online": { + "count": 1, + "best_throughput": 25, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 184.0, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1803.59, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "de0853fa", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 1704.71, + "median_throughput": 1704.71, + "min_throughput": 1704.71, + "max_throughput": 1704.71, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1704.71, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 25, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 164.3, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1704.71, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "14410aea", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 1694.75, + "median_throughput": 1694.75, + "min_throughput": 1694.75, + "max_throughput": 1694.75, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1694.75, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 176.0, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1694.75, + 
"best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "76ce4cd0", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 1681.33, + "median_throughput": 1681.33, + "min_throughput": 1681.33, + "max_throughput": 1681.33, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1681.33, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1681.33, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 343.0, + "best_framework": "vLLM" + } + }, + "best_submission_id": "a8cf2a0f", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1633.89, + "median_throughput": 1633.89, + "min_throughput": 1633.89, + "max_throughput": 1633.89, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "bba67533", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1609.89, + "median_throughput": 1609.89, + "min_throughput": 1609.89, + "max_throughput": 1609.89, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "54dccbd0", 
+ "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1567.35, + "median_throughput": 1567.35, + "min_throughput": 1567.35, + "max_throughput": 1567.35, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1567.35, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1567.35, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 265.3, + "best_framework": "vLLM" + }, + "speculative": { + "count": 1, + "best_throughput": 657.85, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "7cd0b745", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 1971.81, + "median_throughput": 1469.7, + "min_throughput": 940.08, + "max_throughput": 1971.81, + "stddev_throughput": 366.58, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 1971.81, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 10, + "best_framework": "vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 541.0, + "best_framework": "vLLM" + } + }, + "best_submission_id": "b87c1621_W8A8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090 D", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1461.83, + "median_throughput": 1461.83, + "min_throughput": 1461.83, + 
"max_throughput": 1461.83, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1461.83, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1461.83, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 400.1, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "d6543f77", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1452.3, + "median_throughput": 1452.3, + "min_throughput": 1452.3, + "max_throughput": 1452.3, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1452.3, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "interactive": { + "count": 1, + "best_throughput": 1452.3, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 339.8, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "675e325e", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 5, + "best_throughput": 2065.49, + "median_throughput": 1374.59, + "min_throughput": 898.24, + "max_throughput": 2065.49, + "stddev_throughput": 449.06, + "scenario_summary": { + "offline": { + "count": 5, + "best_throughput": 2065.49, + "best_framework": "vLLM" + }, + "online": { + "count": 5, + "best_throughput": 5, + "best_framework": 
"vLLM" + }, + "sustained": { + "count": 5, + "best_throughput": 588.9, + "best_framework": "vLLM" + } + }, + "best_submission_id": "4955fbb1_W8A8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 1361.9, + "median_throughput": 1361.9, + "min_throughput": 1361.9, + "max_throughput": 1361.9, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1361.9, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 2, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1361.9, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 325.5, + "best_framework": "vLLM" + } + }, + "best_submission_id": "a4179ecc", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1172.42, + "median_throughput": 1172.42, + "min_throughput": 1172.42, + "max_throughput": 1172.42, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "865d778c", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1065.49, + "median_throughput": 1065.49, + "min_throughput": 1065.49, + "max_throughput": 1065.49, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "81ca6d0e", + 
"best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 1026.12, + "median_throughput": 1026.12, + "min_throughput": 1026.12, + "max_throughput": 1026.12, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1026.12, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 2, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 1026.12, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 293.9, + "best_framework": "vLLM" + } + }, + "best_submission_id": "2ef567be", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 3090", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1001.2, + "median_throughput": 1001.2, + "min_throughput": 1001.2, + "max_throughput": 1001.2, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1001.2, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "interactive": { + "count": 1, + "best_throughput": 1001.2, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 309.9, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "e95e2caa", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_C", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 2, + "best_throughput": 936.85, + "median_throughput": 936.85, + "min_throughput": 783.65, + 
"max_throughput": 936.85, + "stddev_throughput": 108.33, + "scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 936.85, + "best_framework": "vLLM" + }, + "online": { + "count": 2, + "best_throughput": null, + "best_framework": "" + }, + "sustained": { + "count": 2, + "best_throughput": 416.4, + "best_framework": "vLLM" + } + }, + "best_submission_id": "b957e789_FP16", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 933.93, + "median_throughput": 933.93, + "min_throughput": 933.93, + "max_throughput": 933.93, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 933.93, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 933.93, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 268.3, + "best_framework": "vLLM" + }, + "speculative": { + "count": 1, + "best_throughput": 431.31, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "48261ecc", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 798.25, + "median_throughput": 798.25, + "min_throughput": 798.25, + "max_throughput": 798.25, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 798.25, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 105.3, + 
"best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 798.25, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "0981ecf7", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 671.43, + "median_throughput": 671.43, + "min_throughput": 671.43, + "max_throughput": 671.43, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 671.43, + "best_framework": "1Cat-vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "4e0e6eba", + "best_framework": "1Cat-vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 466.49, + "median_throughput": 466.49, + "min_throughput": 466.49, + "max_throughput": 466.49, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 466.49, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "sustained": { + "count": 1, + "best_throughput": 92.8, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 466.49, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "48f19c22", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA GeForce RTX 4090", + "chip_vendor": "NVIDIA", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 442.95, + 
"median_throughput": 442.95, + "min_throughput": 442.95, + "max_throughput": 442.95, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 442.95, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "sustained": { + "count": 1, + "best_throughput": 104.5, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 442.95, + "best_framework": "vLLM" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "cfd0bdc8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA L4", + "chip_vendor": "NVIDIA", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 366.08, + "median_throughput": 366.08, + "min_throughput": 366.08, + "max_throughput": 366.08, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 366.08, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "interactive": { + "count": 1, + "best_throughput": 366.08, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 116.6, + "best_framework": "vLLM" + }, + "speculative": { + "count": 1, + "best_throughput": 261.34, + "best_framework": "vLLM" + } + }, + "best_submission_id": "b991b4c1", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H200", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 171.93, + "median_throughput": 171.93, + "min_throughput": 171.93, + "max_throughput": 171.93, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 171.93, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + 
"best_throughput": 171.93, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 132.9, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + }, + "speculative": { + "count": 1, + "best_throughput": 170.37, + "best_framework": "vLLM" + } + }, + "best_submission_id": "62a36028", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H100 80GB HBM3", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 160.38, + "median_throughput": 160.38, + "min_throughput": 160.38, + "max_throughput": 160.38, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 160.38, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 160.38, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 142.6, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "02748da4", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A800-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 70.34, + "median_throughput": 70.34, + "min_throughput": 70.34, + "max_throughput": 70.34, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 70.34, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 70.34, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 67.0, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "4d0e7990", + "best_framework": "vLLM", + "best_submitted_by": 
"JuhaoLiang1997" + }, + { + "chip": "NVIDIA A100-SXM4-80GB", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 2, + "best_throughput": 70.21, + "median_throughput": 70.21, + "min_throughput": 65.15, + "max_throughput": 70.21, + "stddev_throughput": 3.58, + "scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 70.21, + "best_framework": "vLLM" + }, + "interactive": { + "count": 2, + "best_throughput": 70.21, + "best_framework": "vLLM" + }, + "sustained": { + "count": 2, + "best_throughput": 67.1, + "best_framework": "vLLM" + }, + "online": { + "count": 2, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "7bef8eef", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA A100-SXM4-40GB", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 2, + "best_throughput": 59.89, + "median_throughput": 59.89, + "min_throughput": 56.79, + "max_throughput": 59.89, + "stddev_throughput": 2.19, + "scenario_summary": { + "offline": { + "count": 2, + "best_throughput": 59.89, + "best_framework": "SGLang" + }, + "interactive": { + "count": 2, + "best_throughput": 59.89, + "best_framework": "SGLang" + }, + "sustained": { + "count": 2, + "best_throughput": 57.0, + "best_framework": "vLLM" + }, + "online": { + "count": 2, + "best_throughput": null, + "best_framework": "" + }, + "speculative": { + "count": 1, + "best_throughput": 36.86, + "best_framework": "SGLang" + } + }, + "best_submission_id": "99c43b97", + "best_framework": "SGLang", + "best_submitted_by": "Gong-K" + }, + { + "chip": "NVIDIA GeForce RTX 5090", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 54.26, + "median_throughput": 54.26, + "min_throughput": 54.26, + "max_throughput": 54.26, + "stddev_throughput": null, + "scenario_summary": { + 
"offline": { + "count": 1, + "best_throughput": 54.26, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 54.26, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 51.5, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "e87e6c36", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA H20-3e", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 44.09, + "median_throughput": 44.09, + "min_throughput": 44.09, + "max_throughput": 44.09, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 44.09, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 44.09, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 41.4, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "60c91bf0", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX 6000 Ada Generation", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 33.21, + "median_throughput": 33.21, + "min_throughput": 33.21, + "max_throughput": 33.21, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 33.21, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 33.21, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 32.3, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "42ab3af7", + "best_framework": "vLLM", + 
"best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "NVIDIA RTX A6000", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 31.11, + "median_throughput": 31.11, + "min_throughput": 31.11, + "max_throughput": 31.11, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 31.11, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 31.11, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 30.5, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "f2197473", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Tesla V100S-PCIE-32GB", + "chip_vendor": "NVIDIA", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 15.01, + "median_throughput": 15.01, + "min_throughput": 15.01, + "max_throughput": 15.01, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 15.01, + "best_framework": "vLLM" + }, + "interactive": { + "count": 1, + "best_throughput": 15.01, + "best_framework": "vLLM" + }, + "sustained": { + "count": 1, + "best_throughput": 14.9, + "best_framework": "vLLM" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "6eb549a8", + "best_framework": "vLLM", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 7848.84, + "median_throughput": 7848.84, + "min_throughput": 7848.84, + "max_throughput": 7848.84, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 7848.84, + 
"best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 7848.84, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 2217.9, + "best_framework": "vllm-ascend" + } + }, + "best_submission_id": "8826a63d", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 4941.13, + "median_throughput": 4941.13, + "min_throughput": 4941.13, + "max_throughput": 4941.13, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 4941.13, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 10, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 4941.13, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 1238.9, + "best_framework": "vllm-ascend" + } + }, + "best_submission_id": "bd7d8f87", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 2499.29, + "median_throughput": 2499.29, + "min_throughput": 2499.29, + "max_throughput": 2499.29, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "b1fe92eb", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "suite": "suite_E", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 2430.05, + 
"median_throughput": 2430.05, + "min_throughput": 2430.05, + "max_throughput": 2430.05, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "354e5562", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1888.72, + "median_throughput": 1888.72, + "min_throughput": 1888.72, + "max_throughput": 1888.72, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1888.72, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 1888.72, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 376.6, + "best_framework": "vllm-ascend" + }, + "speculative": { + "count": 1, + "best_throughput": 1858.56, + "best_framework": "vllm-ascend" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "74d19743", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 1699.04, + "median_throughput": 1699.04, + "min_throughput": 1699.04, + "max_throughput": 1699.04, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1699.04, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 1699.04, + "best_framework": "vllm-ascend" + }, + "sustained": { + 
"count": 1, + "best_throughput": 268.0, + "best_framework": "vllm-ascend" + }, + "speculative": { + "count": 1, + "best_throughput": 1718.23, + "best_framework": "vllm-ascend" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "a2777c30", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 1638.62, + "median_throughput": 1638.62, + "min_throughput": 1638.62, + "max_throughput": 1638.62, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1638.62, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 2, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 1638.62, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 262.2, + "best_framework": "vllm-ascend" + } + }, + "best_submission_id": "329a2b9e", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "suite": "suite_G", + "model": "Mixtral-8x7B-Instruct-v0.1", + "submission_count": 1, + "best_throughput": 1631.87, + "median_throughput": 1631.87, + "min_throughput": 1631.87, + "max_throughput": 1631.87, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 1631.87, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 2, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 1631.87, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 226.6, + "best_framework": "vllm-ascend" + } + }, + "best_submission_id": "d726144e", + "best_framework": 
"vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 769.88, + "median_throughput": 769.88, + "min_throughput": 769.88, + "max_throughput": 769.88, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 769.88, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 2, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 53.2, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 769.88, + "best_framework": "vllm-ascend" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "fcb9725c", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "suite": "suite_B", + "model": "Meta-Llama-3-70B-Instruct", + "submission_count": 1, + "best_throughput": 723.06, + "median_throughput": 723.06, + "min_throughput": 723.06, + "max_throughput": 723.06, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 723.06, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": 2, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 53.5, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 723.06, + "best_framework": "vllm-ascend" + }, + "burst": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "635ecf42", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend Ascend910", + "chip_vendor": "Huawei", + "suite": "suite_D", + "model": 
"Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 55.0, + "median_throughput": 55.0, + "min_throughput": 55.0, + "max_throughput": 55.0, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 55.0, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 55.0, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 54.2, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "6c1e7ffe", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Huawei Ascend 910B2", + "chip_vendor": "Huawei", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 53.22, + "median_throughput": 53.22, + "min_throughput": 53.22, + "max_throughput": 53.22, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 53.22, + "best_framework": "vllm-ascend" + }, + "interactive": { + "count": 1, + "best_throughput": 53.22, + "best_framework": "vllm-ascend" + }, + "sustained": { + "count": 1, + "best_throughput": 53.2, + "best_framework": "vllm-ascend" + }, + "online": { + "count": 1, + "best_throughput": null, + "best_framework": "" + } + }, + "best_submission_id": "a3547ba9", + "best_framework": "vllm-ascend", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Google TPU v5e", + "chip_vendor": "Google", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 8127.1, + "median_throughput": 8127.1, + "min_throughput": 8127.1, + "max_throughput": 8127.1, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 8127.1, + "best_framework": "vllm-tpu" + } + }, + "best_submission_id": "300ac34c", + "best_framework": "vllm-tpu", + 
"best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Google TPU v6e", + "chip_vendor": "Google", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 6857.53, + "median_throughput": 6857.53, + "min_throughput": 6857.53, + "max_throughput": 6857.53, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 6857.53, + "best_framework": "vllm-tpu" + } + }, + "best_submission_id": "9318bfeb", + "best_framework": "vllm-tpu", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Google TPU v6e", + "chip_vendor": "Google", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 2775.55, + "median_throughput": 2775.55, + "min_throughput": 2775.55, + "max_throughput": 2775.55, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 2775.55, + "best_framework": "vllm-tpu" + }, + "speculative": { + "count": 1, + "best_throughput": 2774.41, + "best_framework": "vllm-tpu" + } + }, + "best_submission_id": "d9b3177f", + "best_framework": "vllm-tpu", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Google TPU v6e", + "chip_vendor": "Google", + "suite": "suite_D", + "model": "Llama-3.1-8B-Instruct", + "submission_count": 1, + "best_throughput": 54.52, + "median_throughput": 54.52, + "min_throughput": 54.52, + "max_throughput": 54.52, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 54.52, + "best_framework": "vllm-tpu" + } + }, + "best_submission_id": "d037f60f", + "best_framework": "vllm-tpu", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "Apple M1", + "chip_vendor": "Apple", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 51.0, + "median_throughput": 51.0, + "min_throughput": 51.0, + "max_throughput": 51.0, + "stddev_throughput": null, + 
"scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 51.0, + "best_framework": "mlx-lm" + } + }, + "best_submission_id": "2349a925", + "best_framework": "mlx-lm", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "MTT S4000", + "chip_vendor": "Moore Threads", + "suite": "suite_F", + "model": "Qwen2.5-0.5B-Instruct", + "submission_count": 1, + "best_throughput": 2004.02, + "median_throughput": 2004.02, + "min_throughput": 2004.02, + "max_throughput": 2004.02, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 2004.02, + "best_framework": "vllm-musa" + }, + "online": { + "count": 1, + "best_throughput": 40, + "best_framework": "vllm-musa" + }, + "interactive": { + "count": 1, + "best_throughput": 2004.02, + "best_framework": "vllm-musa" + } + }, + "best_submission_id": "4f66d29d", + "best_framework": "vllm-musa", + "best_submitted_by": "JuhaoLiang1997" + }, + { + "chip": "MTT S4000", + "chip_vendor": "Moore Threads", + "suite": "suite_A", + "model": "Meta-Llama-3-8B-Instruct", + "submission_count": 1, + "best_throughput": 332.62, + "median_throughput": 332.62, + "min_throughput": 332.62, + "max_throughput": 332.62, + "stddev_throughput": null, + "scenario_summary": { + "offline": { + "count": 1, + "best_throughput": 332.62, + "best_framework": "vllm-musa" + }, + "online": { + "count": 1, + "best_throughput": 5, + "best_framework": "vllm-musa" + } + }, + "best_submission_id": "cabb7bd0", + "best_framework": "vllm-musa", + "best_submitted_by": "JuhaoLiang1997" + } +]; +window.DISTRIBUTION_GROUPS = DISTRIBUTION_GROUPS; + +const DISTRIBUTION_SUITE_META = { + "suite_A": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "dataset": "sharegpt_standard_v1", + "precision_required": "BF16", + "allowed_precisions": [ + "BF16", + "FP16" + ], + "max_model_len": 4096, + "concurrency_levels": [ + 8, + 32, + 128 + ], + 
"online_qps_levels": [ + 5, + 25, + 100 + ], + "online_sla_ttft_ms": 500, + "input_tokens_p50": 280, + "output_tokens_p50": 310, + "scenarios_default": [ + "accuracy", + "offline", + "online" + ], + "scenarios_extra": [ + "interactive", + "sustained", + "speculative", + "burst" + ] + }, + "suite_B": { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_revision": "50fd307e57011801c7833c87efa1984ddf2db42f", + "dataset": "sharegpt_standard_v1", + "precision_required": "BF16", + "allowed_precisions": [ + "BF16", + "FP16" + ], + "max_model_len": 4096, + "concurrency_levels": [ + 8, + 32, + 128 + ], + "online_qps_levels": [ + 2, + 5, + 10, + 25 + ], + "online_sla_ttft_ms": 1000, + "input_tokens_p50": 280, + "output_tokens_p50": 310, + "scenarios_default": [ + "accuracy", + "offline", + "online" + ], + "scenarios_extra": [ + "sustained", + "interactive", + "burst" + ] + }, + "suite_C": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dataset": "sharegpt_standard_v1", + "precision_required": "BF16", + "allowed_precisions": [ + "BF16", + "FP16" + ], + "max_model_len": 4096, + "concurrency_levels": [ + 1, + 4, + 16, + 64 + ], + "online_qps_levels": [ + 5, + 10, + 25, + 50 + ], + "online_sla_ttft_ms": 500, + "scenarios_default": [ + "accuracy", + "offline" + ], + "scenarios_extra": [ + "online", + "sustained" + ] + }, + "suite_D": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "model_revision": "0e9e39f249a16976918f6564b8830bc894c89659", + "dataset": "sharegpt_longctx_v1", + "precision_required": "BF16", + "allowed_precisions": [ + "BF16", + "FP16" + ], + "max_model_len": 30208, + "concurrency_levels": [ + 1, + 4 + ], + "online_qps_levels": [ + 0.5, + 1, + 2 + ], + "online_sla_ttft_ms": 5000, + "input_tokens_p50": 28650, + "output_tokens_p50": 256, + "scenarios_default": [ + "accuracy", + "offline" + ], + "scenarios_extra": [ + "interactive", + "sustained", + "online" + ] + }, + 
"suite_E": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "dataset": "sharegpt_standard_v1", + "precision_required": "BF16", + "allowed_precisions": [ + "BF16", + "FP16" + ], + "max_model_len": 4096, + "concurrency_levels": [ + 8, + 32, + 128 + ], + "input_tokens_p50": 280, + "output_tokens_p50": 310, + "scenarios_default": [ + "accuracy", + "offline" + ] + }, + "suite_F": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "dataset": "sharegpt_edge_v1", + "precision_required": "BF16", + "allowed_precisions": [ + "FP16", + "BF16" + ], + "max_model_len": 2048, + "concurrency_levels": [ + 4, + 16, + 64 + ], + "online_qps_levels": [ + 10, + 40 + ], + "online_sla_ttft_ms": 500, + "input_tokens_p50": 95, + "output_tokens_p50": 150, + "scenarios_default": [ + "accuracy", + "offline", + "online", + "interactive" + ], + "scenarios_extra": [ + "sustained" + ] + }, + "suite_G": { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_revision": "eba92302a2861cdc0098cc54bc9f17cb2c47eb61", + "dataset": "sharegpt_standard_v1", + "precision_required": "BF16", + "allowed_precisions": [ + "BF16", + "FP16" + ], + "max_model_len": 4096, + "concurrency_levels": [ + 4, + 16, + 64 + ], + "online_qps_levels": [ + 2, + 10, + 40 + ], + "online_sla_ttft_ms": 500, + "input_tokens_p50": 280, + "output_tokens_p50": 310, + "scenarios_default": [ + "accuracy", + "offline", + "online" + ], + "scenarios_extra": [ + "interactive", + "sustained" + ] + } +}; +window.DISTRIBUTION_SUITE_META = DISTRIBUTION_SUITE_META;