diff --git a/.cargo/config.toml b/.cargo/config.toml index 466422a80b1..e0f96fb234e 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -18,6 +18,10 @@ timeout = 3600 # Environment variables for PyO3. Ensures reproducible builds and avoids spurious recompilations. [env] PYO3_ENVIRONMENT_SIGNATURE = { value = "cpython-3.11-64bit", force = true } +# Default benchmark variant. A plain `cargo bench` runs every benchmark once on +# the host; the CodSpeed workflow overrides this per leg to gate and name +# feature-set-specific runs. Not forced, so a workflow-provided value wins. +BENCH_VARIANT = "local" [cache.global-clean] # Anything older than this duration will be deleted in the source cache. diff --git a/.github/actions/system-info/action.yml b/.github/actions/system-info/action.yml index 7ca1d99b7ea..c08abf1e851 100644 --- a/.github/actions/system-info/action.yml +++ b/.github/actions/system-info/action.yml @@ -10,10 +10,13 @@ runs: echo "=== CPU Model ===" lscpu echo "=== /proc/cpuinfo summary ===" - grep -m1 "model name" /proc/cpuinfo + # x86 exposes "model name"; aarch64 has no such line (the model is shown + # by lscpu above), so a missing match must not fail the step. + grep -m1 "model name" /proc/cpuinfo || true nproc echo "=== CPU Flags ===" - grep -m1 "flags" /proc/cpuinfo + # x86 exposes "flags"; aarch64 exposes "Features". + grep -m1 -E "^(flags|Features)" /proc/cpuinfo || true echo "=== Memory ===" free -h diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 50a5cbaa75a..6815e056a96 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -58,6 +58,10 @@ jobs: - name: Build benchmarks env: RUSTFLAGS: "-C target-feature=+avx2" + # Benchmarks gated on a CPU feature set (e.g. bit_transpose) run their + # `simulation`-tagged variants here, in simulation mode. Walltime + # variants run in the dedicated bench-codspeed-bittranspose job. + BENCH_VARIANT: simulation run: cargo codspeed build ${{ matrix.features }} $(printf -- '-p %s ' ${{ matrix.packages }}) --profile bench - name: Run benchmarks uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2 @@ -66,6 +70,54 @@ jobs: token: ${{ secrets.CODSPEED_TOKEN }} mode: "simulation" + bench-codspeed-bittranspose: + # Walltime measurements for the bit_transpose benchmark on real silicon, one + # leg per architecture. Each leg builds with its own target features and sets + # BENCH_VARIANT, which both gates the benchmarks (via `ignore`) and prefixes + # their names so x86_64 and aarch64 results do not collide in CodSpeed. The + # simulation variant of these benchmarks runs in the bench-codspeed job above. + if: github.repository == 'vortex-data/vortex' + strategy: + fail-fast: false + matrix: + include: + - leg: x86_64 + runner: amd64-medium + image: ubuntu24-full-x64-pre-v2 + rustflags: "-C target-feature=+avx2" + - leg: aarch64 + runner: arm64-medium + image: ubuntu24-full-arm64-pre-v2 + rustflags: "-C target-feature=+neon" + name: "Benchmark bit_transpose with Codspeed (${{ matrix.leg }})" + timeout-minutes: 30 + runs-on: >- + runs-on=${{ github.run_id }}/runner=${{ matrix.runner }}/image=${{ matrix.image }}/tag=bench-bittranspose-${{ matrix.leg }} + steps: + - uses: runs-on/action@v2 + with: + sccache: s3 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - name: Setup benchmark environment + run: sudo bash scripts/setup-benchmark.sh + - uses: ./.github/actions/setup-prebuild + - uses: ./.github/actions/system-info + - name: Install Codspeed + uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995 + with: + tool: cargo-codspeed + - name: Build benchmarks + env: + RUSTFLAGS: ${{ matrix.rustflags }} + BENCH_VARIANT: ${{ matrix.leg }} + run: cargo codspeed build -m walltime -p vortex-fastlanes --features _test-harness --bench bit_transpose --profile bench + - name: Run benchmarks + uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2 + with: + run: bash scripts/bench-taskset.sh cargo codspeed run + token: ${{ secrets.CODSPEED_TOKEN }} + mode: "walltime" + bench-codspeed-cuda: if: github.repository == 'vortex-data/vortex' strategy: diff --git a/encodings/fastlanes/benches/bit_transpose.rs b/encodings/fastlanes/benches/bit_transpose.rs index 08c3ffb12e5..ac38c2c8b02 100644 --- a/encodings/fastlanes/benches/bit_transpose.rs +++ b/encodings/fastlanes/benches/bit_transpose.rs @@ -5,6 +5,8 @@ use divan::Bencher; use vortex_fastlanes::bit_transpose::scalar::transpose_bits_scalar; use vortex_fastlanes::bit_transpose::scalar::untranspose_bits_scalar; +mod shared; + fn main() { divan::main(); } @@ -23,9 +25,15 @@ const BATCH_SIZE: usize = 1000; // ============================================================================ // Transpose: single array +// +// Scalar benchmarks are architecture-neutral, so they run on every leg as a +// per-architecture baseline. // ============================================================================ -#[divan::bench] +#[divan::bench( + name = variant!("transpose_scalar"), + ignore = ignore_unless_variant!(simulation, x86_64, aarch64), +)] fn transpose_scalar(bencher: Bencher) { let input = generate_test_data(42); @@ -40,7 +48,10 @@ fn transpose_scalar(bencher: Bencher) { // Transpose: throughput (1000 arrays) // ============================================================================ -#[divan::bench] +#[divan::bench( + name = variant!("transpose_scalar_throughput"), + ignore = ignore_unless_variant!(simulation, x86_64, aarch64), +)] fn transpose_scalar_throughput(bencher: Bencher) { let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect(); @@ -57,7 +68,10 @@ fn transpose_scalar_throughput(bencher: Bencher) { // Untranspose: single array // ============================================================================ -#[divan::bench] +#[divan::bench( + name = variant!("untranspose_scalar"), + ignore = ignore_unless_variant!(simulation, x86_64, aarch64), +)] fn untranspose_scalar(bencher: Bencher) { let input = generate_test_data(42); @@ -72,7 +86,10 @@ fn untranspose_scalar(bencher: Bencher) { // Untranspose: throughput (1000 arrays) // ============================================================================ -#[divan::bench] +#[divan::bench( + name = variant!("untranspose_scalar_throughput"), + ignore = ignore_unless_variant!(simulation, x86_64, aarch64), +)] fn untranspose_scalar_throughput(bencher: Bencher) { let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect(); @@ -87,6 +104,10 @@ fn untranspose_scalar_throughput(bencher: Bencher) { // ============================================================================ // x86_64 benchmarks +// +// BMI2 and VBMI share the `x86_64` walltime leg (and the `simulation` leg); the +// `#[target_feature]` intrinsics are selected at runtime via `has_bmi2` / +// `has_vbmi`, so a single x86 build covers both. // ============================================================================ #[cfg(target_arch = "x86_64")] @@ -104,7 +125,10 @@ mod x86 { // --- Transpose: single array --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("transpose_bmi2"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn transpose_bmi2(bencher: Bencher) { if !has_bmi2() { return; @@ -119,7 +143,10 @@ mod x86 { }); } - #[divan::bench] + #[divan::bench( + name = crate::variant!("transpose_vbmi"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn transpose_vbmi(bencher: Bencher) { if !has_vbmi() { return; @@ -136,7 +163,10 @@ mod x86 { // --- Untranspose: single array --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("untranspose_bmi2"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn untranspose_bmi2(bencher: Bencher) { if !has_bmi2() { return; @@ -151,7 +181,10 @@ mod x86 { }); } - #[divan::bench] + #[divan::bench( + name = crate::variant!("untranspose_vbmi"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn untranspose_vbmi(bencher: Bencher) { if !has_vbmi() { return; @@ -168,7 +201,10 @@ mod x86 { // --- Transpose: throughput (1000 arrays) --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("transpose_bmi2_throughput"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn transpose_bmi2_throughput(bencher: Bencher) { if !has_bmi2() { return; @@ -185,7 +221,10 @@ mod x86 { }); } - #[divan::bench] + #[divan::bench( + name = crate::variant!("transpose_vbmi_throughput"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn transpose_vbmi_throughput(bencher: Bencher) { if !has_vbmi() { return; @@ -204,7 +243,10 @@ mod x86 { // --- Untranspose: throughput (1000 arrays) --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("untranspose_bmi2_throughput"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn untranspose_bmi2_throughput(bencher: Bencher) { if !has_bmi2() { return; @@ -221,7 +263,10 @@ mod x86 { }); } - #[divan::bench] + #[divan::bench( + name = crate::variant!("untranspose_vbmi_throughput"), + ignore = crate::ignore_unless_variant!(simulation, x86_64), + )] fn untranspose_vbmi_throughput(bencher: Bencher) { if !has_vbmi() { return; @@ -241,6 +286,8 @@ mod x86 { // ============================================================================ // aarch64 benchmarks +// +// NEON has its own walltime leg; the scalar baselines above also run there. // ============================================================================ #[cfg(target_arch = "aarch64")] @@ -254,7 +301,10 @@ mod aarch64 { // --- Transpose: single array --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("transpose_neon"), + ignore = crate::ignore_unless_variant!(aarch64), + )] fn transpose_neon(bencher: Bencher) { let input = generate_test_data(42); @@ -267,7 +317,10 @@ mod aarch64 { // --- Untranspose: single array --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("untranspose_neon"), + ignore = crate::ignore_unless_variant!(aarch64), + )] fn untranspose_neon(bencher: Bencher) { let input = generate_test_data(42); @@ -280,7 +333,10 @@ mod aarch64 { // --- Transpose: throughput (1000 arrays) --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("transpose_neon_throughput"), + ignore = crate::ignore_unless_variant!(aarch64), + )] fn transpose_neon_throughput(bencher: Bencher) { let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect(); @@ -295,7 +351,10 @@ mod aarch64 { // --- Untranspose: throughput (1000 arrays) --- - #[divan::bench] + #[divan::bench( + name = crate::variant!("untranspose_neon_throughput"), + ignore = crate::ignore_unless_variant!(aarch64), + )] fn untranspose_neon_throughput(bencher: Bencher) { let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect(); diff --git a/encodings/fastlanes/benches/shared/mod.rs b/encodings/fastlanes/benches/shared/mod.rs new file mode 100644 index 00000000000..d7cef76b2ef --- /dev/null +++ b/encodings/fastlanes/benches/shared/mod.rs @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Shared helpers for the fastlanes benchmark binaries. +//! +//! Pulled into a bench via `mod shared;`. The macros here give every benchmark +//! a single source of truth for *which* CPU feature set / architecture it is +//! measured under, driven by the compile-time `BENCH_VARIANT` environment +//! variable: +//! +//! * A plain `cargo bench` leaves `BENCH_VARIANT` at its `local` default (set in +//! `.cargo/config.toml`), so every benchmark runs once on the host. +//! * The CodSpeed workflow sets `BENCH_VARIANT` to exactly one variant per CI +//! leg (e.g. `simulation`, `x86_64`, `aarch64`), which both prefixes the +//! benchmark name and gates whether the benchmark runs. + +/// Prefix a benchmark name with the active build variant. +/// +/// `BENCH_VARIANT` is read at compile time. The prefix keeps measurements taken +/// under different builds or architectures from colliding in CodSpeed (most +/// importantly the architecture-neutral scalar benchmarks, which run on every +/// leg). +#[macro_export] +macro_rules! variant { + ($name:literal) => { + concat!(env!("BENCH_VARIANT"), "::", $name) + }; +} + +/// Map a known variant identifier to its string tag. +/// +/// Adding a new variant? Add an arm here *and* a matching CI leg. An unknown +/// identifier fails to compile, so benchmark tags can't silently typo into a +/// variant that never runs. +#[macro_export] +macro_rules! variant_tag { + (simulation) => { + "simulation" + }; + (x86_64) => { + "x86_64" + }; + (aarch64) => { + "aarch64" + }; +} + +/// divan `ignore` expression: skip this benchmark *unless* we are running +/// locally (`BENCH_VARIANT=local`, the default) or the active variant is one of +/// the listed feature sets. CI sets `BENCH_VARIANT` to exactly one variant per +/// leg; locally it defaults to `local`, so every benchmark runs. +/// +/// The gate is an OR-chain of `==` rather than `matches!` because +/// [`variant_tag!`] expands to a string literal, which is not valid in +/// `matches!` pattern position. +#[macro_export] +macro_rules! ignore_unless_variant { + ($($v:ident),+ $(,)?) => {{ + let active = env!("BENCH_VARIANT"); + !(active == "local" $(|| active == $crate::variant_tag!($v))+) + }}; +}