Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ timeout = 3600
# Environment variables for PyO3. Ensures reproducible builds and avoids spurious recompilations.
[env]
PYO3_ENVIRONMENT_SIGNATURE = { value = "cpython-3.11-64bit", force = true }
# Default benchmark variant. A plain `cargo bench` runs every benchmark once on
# the host; the CodSpeed workflow overrides this per leg to gate and name
# feature-set-specific runs. Not forced, so a workflow-provided value wins.
BENCH_VARIANT = "local"

[cache.global-clean]
# Anything older than this duration will be deleted in the source cache.
Expand Down
7 changes: 5 additions & 2 deletions .github/actions/system-info/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@ runs:
echo "=== CPU Model ==="
lscpu
echo "=== /proc/cpuinfo summary ==="
grep -m1 "model name" /proc/cpuinfo
# x86 exposes "model name"; aarch64 has no such line (the model is shown
# by lscpu above), so a missing match must not fail the step.
grep -m1 "model name" /proc/cpuinfo || true
nproc
echo "=== CPU Flags ==="
grep -m1 "flags" /proc/cpuinfo
# x86 exposes "flags"; aarch64 exposes "Features".
grep -m1 -E "^(flags|Features)" /proc/cpuinfo || true

echo "=== Memory ==="
free -h
Expand Down
52 changes: 52 additions & 0 deletions .github/workflows/codspeed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ jobs:
- name: Build benchmarks
env:
RUSTFLAGS: "-C target-feature=+avx2"
# Benchmarks gated on a CPU feature set (e.g. bit_transpose) run their
# `simulation`-tagged variants here, in simulation mode. Walltime
# variants run in the dedicated bench-codspeed-bittranspose job.
BENCH_VARIANT: simulation
run: cargo codspeed build ${{ matrix.features }} $(printf -- '-p %s ' ${{ matrix.packages }}) --profile bench
- name: Run benchmarks
uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
Expand All @@ -66,6 +70,54 @@ jobs:
token: ${{ secrets.CODSPEED_TOKEN }}
mode: "simulation"

bench-codspeed-bittranspose:
# Walltime measurements for the bit_transpose benchmark on real silicon, one
# leg per architecture. Each leg builds with its own target features and sets
# BENCH_VARIANT, which both gates the benchmarks (via `ignore`) and prefixes
# their names so x86_64 and aarch64 results do not collide in CodSpeed. The
# simulation variant of these benchmarks runs in the bench-codspeed job above.
if: github.repository == 'vortex-data/vortex'
strategy:
fail-fast: false
matrix:
include:
- leg: x86_64
runner: amd64-medium
image: ubuntu24-full-x64-pre-v2
rustflags: "-C target-feature=+avx2"
- leg: aarch64
runner: arm64-medium
image: ubuntu24-full-arm64-pre-v2
rustflags: "-C target-feature=+neon"
name: "Benchmark bit_transpose with Codspeed (${{ matrix.leg }})"
timeout-minutes: 30
runs-on: >-
runs-on=${{ github.run_id }}/runner=${{ matrix.runner }}/image=${{ matrix.image }}/tag=bench-bittranspose-${{ matrix.leg }}
steps:
- uses: runs-on/action@v2
with:
sccache: s3
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
- name: Setup benchmark environment
run: sudo bash scripts/setup-benchmark.sh
- uses: ./.github/actions/setup-prebuild
- uses: ./.github/actions/system-info
- name: Install Codspeed
uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995
with:
tool: cargo-codspeed
- name: Build benchmarks
env:
RUSTFLAGS: ${{ matrix.rustflags }}
BENCH_VARIANT: ${{ matrix.leg }}
run: cargo codspeed build -m walltime -p vortex-fastlanes --features _test-harness --bench bit_transpose --profile bench
- name: Run benchmarks
uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
with:
run: bash scripts/bench-taskset.sh cargo codspeed run
token: ${{ secrets.CODSPEED_TOKEN }}
mode: "walltime"

bench-codspeed-cuda:
if: github.repository == 'vortex-data/vortex'
strategy:
Expand Down
91 changes: 75 additions & 16 deletions encodings/fastlanes/benches/bit_transpose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use divan::Bencher;
use vortex_fastlanes::bit_transpose::scalar::transpose_bits_scalar;
use vortex_fastlanes::bit_transpose::scalar::untranspose_bits_scalar;

mod shared;

// Benchmark binary entry point: all work is delegated to `divan::main()`.
fn main() {
divan::main();
}
Expand All @@ -23,9 +25,15 @@ const BATCH_SIZE: usize = 1000;

// ============================================================================
// Transpose: single array
//
// Scalar benchmarks are architecture-neutral, so they run on every leg as a
// per-architecture baseline.
// ============================================================================

#[divan::bench]
#[divan::bench(
name = variant!("transpose_scalar"),
ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
)]
fn transpose_scalar(bencher: Bencher) {
let input = generate_test_data(42);

Expand All @@ -40,7 +48,10 @@ fn transpose_scalar(bencher: Bencher) {
// Transpose: throughput (1000 arrays)
// ============================================================================

#[divan::bench]
#[divan::bench(
name = variant!("transpose_scalar_throughput"),
ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
)]
fn transpose_scalar_throughput(bencher: Bencher) {
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();

Expand All @@ -57,7 +68,10 @@ fn transpose_scalar_throughput(bencher: Bencher) {
// Untranspose: single array
// ============================================================================

#[divan::bench]
#[divan::bench(
name = variant!("untranspose_scalar"),
ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
)]
fn untranspose_scalar(bencher: Bencher) {
let input = generate_test_data(42);

Expand All @@ -72,7 +86,10 @@ fn untranspose_scalar(bencher: Bencher) {
// Untranspose: throughput (1000 arrays)
// ============================================================================

#[divan::bench]
#[divan::bench(
name = variant!("untranspose_scalar_throughput"),
ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
)]
fn untranspose_scalar_throughput(bencher: Bencher) {
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();

Expand All @@ -87,6 +104,10 @@ fn untranspose_scalar_throughput(bencher: Bencher) {

// ============================================================================
// x86_64 benchmarks
//
// BMI2 and VBMI share the `x86_64` walltime leg (and the `simulation` leg); the
// `#[target_feature]` intrinsics are selected at runtime via `has_bmi2` /
// `has_vbmi`, so a single x86 build covers both.
// ============================================================================

#[cfg(target_arch = "x86_64")]
Expand All @@ -104,7 +125,10 @@ mod x86 {

// --- Transpose: single array ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("transpose_bmi2"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn transpose_bmi2(bencher: Bencher) {
if !has_bmi2() {
return;
Expand All @@ -119,7 +143,10 @@ mod x86 {
});
}

#[divan::bench]
#[divan::bench(
name = crate::variant!("transpose_vbmi"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn transpose_vbmi(bencher: Bencher) {
if !has_vbmi() {
return;
Expand All @@ -136,7 +163,10 @@ mod x86 {

// --- Untranspose: single array ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("untranspose_bmi2"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn untranspose_bmi2(bencher: Bencher) {
if !has_bmi2() {
return;
Expand All @@ -151,7 +181,10 @@ mod x86 {
});
}

#[divan::bench]
#[divan::bench(
name = crate::variant!("untranspose_vbmi"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn untranspose_vbmi(bencher: Bencher) {
if !has_vbmi() {
return;
Expand All @@ -168,7 +201,10 @@ mod x86 {

// --- Transpose: throughput (1000 arrays) ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("transpose_bmi2_throughput"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn transpose_bmi2_throughput(bencher: Bencher) {
if !has_bmi2() {
return;
Expand All @@ -185,7 +221,10 @@ mod x86 {
});
}

#[divan::bench]
#[divan::bench(
name = crate::variant!("transpose_vbmi_throughput"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn transpose_vbmi_throughput(bencher: Bencher) {
if !has_vbmi() {
return;
Expand All @@ -204,7 +243,10 @@ mod x86 {

// --- Untranspose: throughput (1000 arrays) ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("untranspose_bmi2_throughput"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn untranspose_bmi2_throughput(bencher: Bencher) {
if !has_bmi2() {
return;
Expand All @@ -221,7 +263,10 @@ mod x86 {
});
}

#[divan::bench]
#[divan::bench(
name = crate::variant!("untranspose_vbmi_throughput"),
ignore = crate::ignore_unless_variant!(simulation, x86_64),
)]
fn untranspose_vbmi_throughput(bencher: Bencher) {
if !has_vbmi() {
return;
Expand All @@ -241,6 +286,8 @@ mod x86 {

// ============================================================================
// aarch64 benchmarks
//
// NEON has its own walltime leg; the scalar baselines above also run there.
// ============================================================================

#[cfg(target_arch = "aarch64")]
Expand All @@ -254,7 +301,10 @@ mod aarch64 {

// --- Transpose: single array ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("transpose_neon"),
ignore = crate::ignore_unless_variant!(aarch64),
)]
fn transpose_neon(bencher: Bencher) {
let input = generate_test_data(42);

Expand All @@ -267,7 +317,10 @@ mod aarch64 {

// --- Untranspose: single array ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("untranspose_neon"),
ignore = crate::ignore_unless_variant!(aarch64),
)]
fn untranspose_neon(bencher: Bencher) {
let input = generate_test_data(42);

Expand All @@ -280,7 +333,10 @@ mod aarch64 {

// --- Transpose: throughput (1000 arrays) ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("transpose_neon_throughput"),
ignore = crate::ignore_unless_variant!(aarch64),
)]
fn transpose_neon_throughput(bencher: Bencher) {
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();

Expand All @@ -295,7 +351,10 @@ mod aarch64 {

// --- Untranspose: throughput (1000 arrays) ---

#[divan::bench]
#[divan::bench(
name = crate::variant!("untranspose_neon_throughput"),
ignore = crate::ignore_unless_variant!(aarch64),
)]
fn untranspose_neon_throughput(bencher: Bencher) {
let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();

Expand Down
62 changes: 62 additions & 0 deletions encodings/fastlanes/benches/shared/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Shared helpers for the fastlanes benchmark binaries.
//!
//! Pulled into a bench via `mod shared;`. The macros here give every benchmark
//! a single source of truth for *which* CPU feature set / architecture it is
//! measured under, driven by the compile-time `BENCH_VARIANT` environment
//! variable:
//!
//! * A plain `cargo bench` leaves `BENCH_VARIANT` at its `local` default (set in
//! `.cargo/config.toml`), so every benchmark runs once on the host.
//! * The CodSpeed workflow sets `BENCH_VARIANT` to exactly one variant per CI
//! leg (e.g. `simulation`, `x86_64`, `aarch64`), which both prefixes the
//! benchmark name and gates whether the benchmark runs.

/// Prefix a benchmark name with the active build variant.
///
/// Expands to a `&'static str` of the form `<BENCH_VARIANT>::<name>`, where
/// `BENCH_VARIANT` is read from the environment at compile time. The prefix
/// keeps measurements taken under different builds or architectures from
/// colliding in CodSpeed (most importantly the architecture-neutral scalar
/// benchmarks, which run on every leg).
///
/// `$name` must be a literal because `concat!` only accepts literals.
///
/// Compilation fails when `BENCH_VARIANT` is not defined. The custom message
/// says where the default is expected to come from, so the failure is
/// actionable instead of a bare "not defined at compile time".
#[macro_export]
macro_rules! variant {
    ($name:literal) => {
        concat!(
            env!(
                "BENCH_VARIANT",
                "BENCH_VARIANT is not set at compile time; its `local` default comes from `.cargo/config.toml`, so run cargo from inside the repository or export BENCH_VARIANT explicitly"
            ),
            "::",
            $name
        )
    };
}

/// Translate a variant identifier into the string tag it is compared against.
///
/// Only the identifiers listed below are accepted. Anything else matches no
/// arm and is rejected at compile time, so a misspelled variant in a benchmark
/// attribute cannot quietly gate the benchmark on a tag that never runs.
///
/// When introducing a new variant, add an arm here *and* a matching CI leg.
#[macro_export]
macro_rules! variant_tag {
    (aarch64) => { "aarch64" };
    (simulation) => { "simulation" };
    (x86_64) => { "x86_64" };
}

/// divan `ignore` expression: evaluates to `true` (skip this benchmark)
/// *unless* we are running locally (`BENCH_VARIANT=local`, the default) or the
/// active variant is one of the listed variants.
///
/// CI sets `BENCH_VARIANT` to exactly one variant per leg; locally it defaults
/// to `local`, so every benchmark runs.
///
/// Accepts one or more comma-separated variant identifiers, with an optional
/// trailing comma. Each identifier is mapped through [`variant_tag!`], so an
/// unknown identifier is a compile error. The gate itself is an OR-chain of
/// `==` comparisons between the active variant and those tags.
///
/// Compilation fails when `BENCH_VARIANT` is not defined. The custom message
/// says where the default is expected to come from, so the failure is
/// actionable instead of a bare "not defined at compile time".
#[macro_export]
macro_rules! ignore_unless_variant {
    ($($v:ident),+ $(,)?) => {{
        let active = env!(
            "BENCH_VARIANT",
            "BENCH_VARIANT is not set at compile time; its `local` default comes from `.cargo/config.toml`, so run cargo from inside the repository or export BENCH_VARIANT explicitly"
        );
        // Run (i.e. do not ignore) when local or when any listed tag matches.
        !(active == "local" $(|| active == $crate::variant_tag!($v))+)
    }};
}
Loading