From 10939a6117f1eaef46d7b88513dafcc0b0ac236a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 16:46:17 +0000 Subject: [PATCH] bench: sweep bit-packed compare across all int types and bit widths Add `bitpack_compare_sweep`, which exercises the public `array.binary(rhs, op)` compare-against-constant path over all eight integer types and every valid bit width (64Ki in-range elements per case, no patches). It isolates the unpack + per-element compare kernel so a kernel change shows up as a CodSpeed diff. Signed-off-by: Joe Isaacs --- encodings/fastlanes/Cargo.toml | 4 + .../benches/bitpack_compare_sweep.rs | 111 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 encodings/fastlanes/benches/bitpack_compare_sweep.rs diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 08c96c481d7..e0aa5cbb724 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -64,6 +64,10 @@ required-features = ["_test-harness"] name = "bitpack_compare" harness = false +[[bench]] +name = "bitpack_compare_sweep" +harness = false + [[bench]] name = "cast_bitpacked" harness = false diff --git a/encodings/fastlanes/benches/bitpack_compare_sweep.rs b/encodings/fastlanes/benches/bitpack_compare_sweep.rs new file mode 100644 index 00000000000..3eb0ba3b9a2 --- /dev/null +++ b/encodings/fastlanes/benches/bitpack_compare_sweep.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Sweeps the public `BitPackedArray` compare-against-constant path (`array.binary(rhs, op)`) over +//! every integer type and every valid bit width, so a kernel change shows up as a CodSpeed diff. +//! +//! The array holds in-range values (no patches, no out-of-range fast path), so each iteration runs +//! the full unpack + per-element compare kernel. +//! +//! Run with `cargo bench -p vortex-fastlanes --bench bitpack_compare_sweep`. 
+ +#![expect(clippy::unwrap_used)] +#![expect(clippy::cast_possible_truncation)] + +use divan::Bencher; +use divan::counter::ItemsCount; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::NativePType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_fastlanes::BitPackedData; + +fn main() { + divan::main(); +} + +/// Number of elements per benchmarked array (64 full FastLanes blocks). +const LEN: usize = 64 * 1024; + +/// Operator under test. `Lt` exercises the full unpack + per-element comparison path. +const OP: Operator = Operator::Lt; + +/// Integer types we can build packed arrays for in the benchmark. +trait BenchInt: NativePType + Copy + Into { + /// Build an in-range value from a small counter. + fn from_counter(v: u64) -> Self; +} + +macro_rules! impl_bench_int { + ($($T:ty),+) => { + $(impl BenchInt for $T { + #[inline] + fn from_counter(v: u64) -> Self { + v as $T + } + })+ + }; +} + +impl_bench_int!(u8, u16, u32, u64, i8, i16, i32, i64); + +/// Encode `LEN` in-range values of type `T` at the given bit width, returning the packed array, a +/// mid-range constant to compare against, and an execution context. 
+fn setup(width: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let cap = 1u64 << width; + let buf: BufferMut = (0..LEN) + .map(|i| T::from_counter((i as u64) % cap)) + .collect(); + let array = BitPackedData::encode( + &PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array(), + width as u8, + &mut ctx, + ) + .unwrap() + .into_array(); + let rhs = ConstantArray::new(T::from_counter(cap / 2), LEN).into_array(); + (array, rhs, ctx) +} + +/// Generate a compare benchmark over every valid bit width for one type. Valid widths are +/// `1..native_bits` - bit-packing requires the target width to be strictly narrower than the type. +macro_rules! bench_type { + ($modname:ident, $T:ty, $native_bits:expr) => { + mod $modname { + use super::*; + + #[divan::bench(args = 1..$native_bits)] + fn compare(bencher: Bencher, width: usize) { + let (array, rhs, mut ctx) = setup::<$T>(width); + bencher.counter(ItemsCount::new(LEN)).bench_local(|| { + array + .clone() + .binary(rhs.clone(), OP) + .unwrap() + .execute::(&mut ctx) + .unwrap() + }); + } + } + }; +} + +bench_type!(u8, u8, 8usize); +bench_type!(u16, u16, 16usize); +bench_type!(u32, u32, 32usize); +bench_type!(u64, u64, 64usize); +bench_type!(i8, i8, 8usize); +bench_type!(i16, i16, 16usize); +bench_type!(i32, i32, 32usize); +bench_type!(i64, i64, 64usize);