Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/scripts/run-sql-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ if [[ -n "$df_formats" ]]; then
# shellcheck disable=SC2086
target/release_debug/datafusion-bench "$subcommand" \
-d gh-json \
--show-session-metrics \
--formats "$df_formats" \
$opts \
-o df-results.json
Expand All @@ -118,6 +119,7 @@ if [[ -n "$ddb_formats" ]]; then
# shellcheck disable=SC2086
target/release_debug/duckdb-bench "$subcommand" \
-d gh-json \
--show-session-metrics \
--formats "$ddb_formats" \
$opts \
--delete-duckdb-database \
Expand All @@ -131,6 +133,7 @@ if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/l
# shellcheck disable=SC2086
target/release_debug/lance-bench "$subcommand" \
-d gh-json \
--show-session-metrics \
$opts \
-o lance-results.json

Expand Down
5 changes: 4 additions & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
think you've finished work.
* run `cargo xtask public-api` to re-generate the public API lock files. Please do this every time you reach a stopping
point or think you've finished work.
* exception: for trivial, isolated changes that cannot affect formatting or linting (for example constant-only edits
with no import/signature/control-flow changes), skip full-workspace `fmt`/`clippy`/`public-api` sweeps unless
explicitly requested; prefer targeted validation (for example crate-level `cargo check`/tests) only.
* you can try running
`cargo fix --lib --allow-dirty --allow-staged && cargo clippy --fix --lib --allow-dirty --allow-staged` to
automatically fix many minor errors.
Expand Down Expand Up @@ -64,4 +67,4 @@

## Commits

* All commits must be signed off by the committers in the form `Signed-off-by: "COMMITTER" <COMMITTER_EMAIL>`.
* All commits must be signed off by the committers in the form `Signed-off-by: "COMMITTER" <COMMITTER_EMAIL>`.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions bench-orchestrator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ vx-bench run <benchmark> [options]
- `--iterations, -i`: Iterations per query (default: 5)
- `--label, -l`: Label for this run (useful for later reference)
- `--track-memory`: Enable memory usage tracking
- `--allocator`: Host allocator mode: `pooled`, `default`, or `ab` (run both in one run)
- `--build/--no-build`: Build binaries before running (default: build)

### `compare` - Compare Results
Expand All @@ -71,9 +72,9 @@ vx-bench compare [options]
- `--format`: Filter results to a specific format
- `--threshold`: Significance threshold (default: 0.10 = 10%)

**Within-run comparison** (`--run`): Compares different engine:format combinations within a single run. Output shows one row per query, with columns for each engine:format combo.
**Within-run comparison** (`--run`): Compares different engine:format (or engine:format:allocator) combinations within a single run. Output shows one row per query, with columns for each combo.

**Multi-run comparison** (`--runs`): Compares the same benchmarks across multiple runs. Output shows one row per (query, engine, format) combination, with columns for each run.
**Multi-run comparison** (`--runs`): Compares the same benchmarks across multiple runs. Output shows one row per (query, engine, format[, allocator]) combination, with columns for each run.

### `list` - List Benchmark Runs

Expand Down
73 changes: 57 additions & 16 deletions bench-orchestrator/bench_orchestrator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""CLI for benchmark orchestration."""

from datetime import datetime, timedelta
from typing import Annotated
from typing import Annotated, Literal

import pandas as pd
import typer
Expand Down Expand Up @@ -76,19 +76,48 @@ def run(
iterations: Annotated[int, typer.Option("--iterations", "-i", help="Iterations per query")] = 5,
label: Annotated[str | None, typer.Option("--label", "-l", help="Label for this run")] = None,
track_memory: Annotated[bool, typer.Option("--track-memory", help="Track memory usage")] = False,
show_session_metrics: Annotated[
bool, typer.Option("--show-session-metrics", help="Print session metrics after each engine run")
] = False,
samply: Annotated[bool, typer.Option("--samply", help="Record a profile using samply")] = False,
sample_rate: Annotated[int, typer.Option("--sample-rate", help="Sample rate to run samply with")] = None,
tracing: Annotated[bool, typer.Option("--tracing", help="Record a trace for use with perfetto")] = False,
build: Annotated[bool, typer.Option("--build/--no-build", help="Build binaries before running")] = True,
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Log underlying commands")] = False,
options: Annotated[list[str] | None, typer.Option("--opt", help="Engine or benchmark specific options")] = None,
allocator: Annotated[
str,
typer.Option(
"--allocator",
help="Host allocator mode: pooled, default, or ab (run both and compare)",
),
] = "pooled",
) -> None:
"""Run benchmarks with specified configuration."""
engines = parse_engines(engine)
formats = parse_formats(format)
query_list = parse_queries(queries)
exclude_list = parse_queries(exclude_queries)
bench_opts = {}
allocator_mode_value = allocator.strip().lower()
if allocator_mode_value not in {"pooled", "default", "ab"}:
console.print("[red]--allocator must be one of: pooled, default, ab[/red]")
raise typer.Exit(1)

allocator_mode: Literal["pooled", "default", "ab"]
if allocator_mode_value == "default":
allocator_mode = "default"
elif allocator_mode_value == "pooled":
allocator_mode = "pooled"
else:
allocator_mode = "ab"

allocator_variants: list[Literal["pooled", "default"]]
if allocator_mode == "ab":
allocator_variants = ["default", "pooled"]
else:
allocator_variants = [allocator_mode]

# Build options dict
if options:
for opt in options:
Expand All @@ -108,6 +137,7 @@ def run(
label=label,
options=bench_opts,
track_memory=track_memory,
allocator_mode=allocator_mode,
)

# Validate configuration
Expand Down Expand Up @@ -135,6 +165,7 @@ def run(
console.print(f" Engines: {', '.join(e.value for e in engines)}")
console.print(f" Formats: {', '.join(f.value for f in formats)}")
console.print(f" Iterations: {iterations}")
console.print(f" Allocator: {allocator_mode}")
if label:
console.print(f" Label: {label}")
console.print()
Expand All @@ -153,20 +184,27 @@ def run(
executor = BenchmarkExecutor(binary_paths[eng], eng, verbose=verbose)

try:
results = executor.run(
benchmark=benchmark,
formats=engine_formats,
queries=query_list,
exclude_queries=exclude_list,
iterations=iterations,
options=bench_opts,
track_memory=track_memory,
samply=samply,
sample_rate=sample_rate,
tracing=tracing,
on_result=ctx.write_raw_json,
)
console.print(f"[green]{eng.value}: {len(results)} results[/green]")
total_results = 0
for allocator_variant in allocator_variants:
results = executor.run(
benchmark=benchmark,
formats=engine_formats,
queries=query_list,
exclude_queries=exclude_list,
iterations=iterations,
options=bench_opts,
track_memory=track_memory,
show_session_metrics=show_session_metrics,
samply=samply,
sample_rate=sample_rate,
tracing=tracing,
allocator_mode=allocator_variant,
on_result=ctx.write_raw_json,
)
total_results += len(results)
console.print(f"[green]{eng.value} ({allocator_variant}): {len(results)} results[/green]")
if allocator_mode == "ab":
console.print(f"[green]{eng.value}: {total_results} total results[/green]")
except RuntimeError as e:
console.print(f"[red]{eng.value} failed: {e}[/red]")

Expand Down Expand Up @@ -321,7 +359,10 @@ def compare(
console.print(f"[red]{e}[/red]")
raise typer.Exit(1)

table = pivot_comparison_table(pivot, threshold, row_keys=["query", "engine", "format"])
row_keys = ["query", "engine", "format"]
if "allocator" in pivot.df.columns:
row_keys.append("allocator")
table = pivot_comparison_table(pivot, threshold, row_keys=row_keys)
console.print(table)
return

Expand Down
33 changes: 25 additions & 8 deletions bench-orchestrator/bench_orchestrator/comparison/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def extract_target_fields(df: pd.DataFrame) -> pd.DataFrame:
if isinstance(first_target, dict):
df["engine"] = df["target"].apply(lambda t: t.get("engine", "") if isinstance(t, dict) else "")
df["format"] = df["target"].apply(lambda t: t.get("format", "") if isinstance(t, dict) else "")
df["allocator"] = df["target"].apply(lambda t: t.get("allocator", "") if isinstance(t, dict) else "")

# Extract query number from name if present
if "name" in df.columns:
Expand Down Expand Up @@ -196,9 +197,15 @@ def compare_within_run(
if filter_format is not None and "format" in df.columns:
df = df[df["format"] == filter_format]

# Find unique engine:format combinations
combos_df = df.groupby(["engine", "format"]).size().reset_index()[["engine", "format"]]
columns = [f"{row['engine']}:{row['format']}" for _, row in combos_df.iterrows()]
has_allocator = "allocator" in df.columns and df["allocator"].nunique(dropna=True) > 1

# Find unique engine:format[:allocator] combinations.
combo_keys = ["engine", "format"] + (["allocator"] if has_allocator else [])
combos_df = df.groupby(combo_keys).size().reset_index()[combo_keys]
if has_allocator:
columns = [f"{row['engine']}:{row['format']}:{row['allocator']}" for _, row in combos_df.iterrows()]
else:
columns = [f"{row['engine']}:{row['format']}" for _, row in combos_df.iterrows()]

if len(columns) < 2:
raise ValueError("Need at least 2 engine:format combinations to compare")
Expand All @@ -208,10 +215,17 @@ def compare_within_run(
baseline_engine = combos_df.iloc[0]["engine"]
baseline_format = combos_df.iloc[0]["format"]

baseline_key = f"{baseline_engine}:{baseline_format}"
if has_allocator:
baseline_allocator = combos_df.iloc[0]["allocator"]
baseline_key = f"{baseline_engine}:{baseline_format}:{baseline_allocator}"
else:
baseline_key = f"{baseline_engine}:{baseline_format}"

# Create engine:format column
df["combo"] = df["engine"] + ":" + df["format"]
# Create combo column.
if has_allocator:
df["combo"] = df["engine"] + ":" + df["format"] + ":" + df["allocator"]
else:
df["combo"] = df["engine"] + ":" + df["format"]

# Pivot to get queries as rows, combos as columns
pivot = df.pivot_table(index="query", columns="combo", values="value", aggfunc="mean")
Expand Down Expand Up @@ -268,8 +282,11 @@ def compare_runs(
# Combine all runs
combined: pd.DataFrame = pd.concat(processed, ignore_index=True)

# Pivot to get (query, engine, format) as rows, runs as columns
pivot = combined.pivot_table(index=["query", "engine", "format"], columns="run", values="value", aggfunc="mean")
# Pivot to get (query, engine, format[, allocator]) as rows, runs as columns.
index = ["query", "engine", "format"]
if "allocator" in combined.columns and combined["allocator"].nunique(dropna=True) > 1:
index.append("allocator")
pivot = combined.pivot_table(index=index, columns="run", values="value", aggfunc="mean")

# Deduplicate labels while preserving order (two runs can share a label).
unique_labels = list(dict.fromkeys(labels))
Expand Down
2 changes: 2 additions & 0 deletions bench-orchestrator/bench_orchestrator/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Literal


class Engine(Enum):
Expand Down Expand Up @@ -78,6 +79,7 @@ class RunConfig:
label: str | None = None
options: dict[str, str] = field(default_factory=dict)
track_memory: bool = False
allocator_mode: Literal["pooled", "default", "ab"] = "pooled"

def validate(self) -> list[str]:
"""Validate the configuration and return any warnings."""
Expand Down
24 changes: 22 additions & 2 deletions bench-orchestrator/bench_orchestrator/runner/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

"""Benchmark binary execution."""

import json
import os
import subprocess
from collections.abc import Callable
from pathlib import Path
from typing import final
from typing import Literal, final

from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
Expand Down Expand Up @@ -34,9 +36,11 @@ def run(
iterations: int = 5,
options: dict[str, str] | None = None,
track_memory: bool = False,
show_session_metrics: bool = False,
samply: bool = False,
sample_rate: int | None = None,
tracing: bool = False,
allocator_mode: Literal["pooled", "default"] = "pooled",
on_result: Callable[[str], None] | None = None,
) -> list[str]:
"""
Expand All @@ -50,6 +54,8 @@ def run(
iterations: Number of runs per query
options: Additional options (e.g., scale_factor)
track_memory: Enable memory tracking
show_session_metrics: Print session metrics at the end of the benchmark process
allocator_mode: Host allocator mode ("pooled" or "default")
on_result: Callback for each result line (for streaming)

Returns:
Expand All @@ -73,6 +79,8 @@ def run(
cmd.extend(["--exclude-queries", ",".join(map(str, exclude_queries))])
if track_memory:
cmd.append("--track-memory")
if show_session_metrics:
cmd.append("--show-session-metrics")
if tracing:
cmd.append("--tracing")
if options:
Expand All @@ -83,7 +91,7 @@ def run(
cmd = ["--"] + cmd
cmd_prefix = ["samply", "record"]
if sample_rate:
cmd = cmd_prefix + ["--rate", sample_rate] + cmd
cmd = cmd_prefix + ["--rate", str(sample_rate)] + cmd
else:
cmd = cmd_prefix + cmd

Expand All @@ -94,6 +102,9 @@ def run(
if self.verbose:
console.print(f"[dim]$ {' '.join(cmd)}[/dim]")

env = os.environ.copy()
env["VORTEX_HOST_ALLOCATOR"] = allocator_mode

results = []

with Progress(
Expand All @@ -108,11 +119,20 @@ def run(
cmd,
stdout=subprocess.PIPE,
text=True,
env=env,
)

for line in iter(process.stdout.readline, ""):
line = line.strip()
if line:
if line.startswith("{"):
try:
payload = json.loads(line)
if isinstance(payload.get("target"), dict):
payload["target"]["allocator"] = allocator_mode
line = json.dumps(payload)
except json.JSONDecodeError:
pass
results.append(line)
if on_result:
on_result(line)
Expand Down
5 changes: 4 additions & 1 deletion bench-orchestrator/bench_orchestrator/storage/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,10 @@ def create_run(self, config: RunConfig, build_config: BuildConfig) -> Iterator[R
timestamp=datetime.now(),
label=config.label,
benchmark=config.benchmark.value,
dataset_config=config.options,
dataset_config={
**config.options,
"allocator_mode": config.allocator_mode,
},
engines=[e.value for e in config.engines],
formats=[f.value for f in config.formats],
queries=config.queries or [],
Expand Down
Loading
Loading