From 10939a6117f1eaef46d7b88513dafcc0b0ac236a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 16:46:17 +0000 Subject: [PATCH 1/3] bench: sweep bit-packed compare across all int types and bit widths Add `bitpack_compare_sweep`, which exercises the public `array.binary(rhs, op)` compare-against-constant path over all eight integer types and every valid bit width (64Ki in-range elements per case, no patches). It isolates the `` unpack + per-element compare kernel so a kernel change shows up as a CodSpeed diff. Signed-off-by: Joe Isaacs --- encodings/fastlanes/Cargo.toml | 4 + .../benches/bitpack_compare_sweep.rs | 111 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 encodings/fastlanes/benches/bitpack_compare_sweep.rs diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 08c96c481d7..e0aa5cbb724 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -64,6 +64,10 @@ required-features = ["_test-harness"] name = "bitpack_compare" harness = false +[[bench]] +name = "bitpack_compare_sweep" +harness = false + [[bench]] name = "cast_bitpacked" harness = false diff --git a/encodings/fastlanes/benches/bitpack_compare_sweep.rs b/encodings/fastlanes/benches/bitpack_compare_sweep.rs new file mode 100644 index 00000000000..3eb0ba3b9a2 --- /dev/null +++ b/encodings/fastlanes/benches/bitpack_compare_sweep.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Sweeps the public `BitPackedArray` compare-against-constant path (`array.binary(rhs, op)`) over +//! every integer type and every valid bit width, so a kernel change shows up as a CodSpeed diff. +//! +//! The array holds in-range values (no patches, no out-of-range fast path), so each iteration runs +//! the full unpack + per-element compare kernel that backs ``. +//! +//! Run with `cargo bench -p vortex-fastlanes --bench bitpack_compare_sweep`. + +#![expect(clippy::unwrap_used)] +#![expect(clippy::cast_possible_truncation)] + +use divan::Bencher; +use divan::counter::ItemsCount; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::NativePType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_fastlanes::BitPackedData; + +fn main() { + divan::main(); +} + +/// Number of elements per benchmarked array (64 full FastLanes blocks). +const LEN: usize = 64 * 1024; + +/// Operator under test. `Lt` exercises the full unpack + per-element comparison path. +const OP: Operator = Operator::Lt; + +/// Integer types we can build packed arrays for in the benchmark. +trait BenchInt: NativePType + Copy + Into { + /// Build an in-range value from a small counter. + fn from_counter(v: u64) -> Self; +} + +macro_rules! impl_bench_int { + ($($T:ty),+) => { + $(impl BenchInt for $T { + #[inline] + fn from_counter(v: u64) -> Self { + v as $T + } + })+ + }; +} + +impl_bench_int!(u8, u16, u32, u64, i8, i16, i32, i64); + +/// Encode `LEN` in-range values of type `T` at the given bit width, returning the packed array, a +/// mid-range constant to compare against, and an execution context. +fn setup(width: usize) -> (ArrayRef, ArrayRef, ExecutionCtx) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let cap = 1u64 << width; + let buf: BufferMut = (0..LEN) + .map(|i| T::from_counter((i as u64) % cap)) + .collect(); + let array = BitPackedData::encode( + &PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array(), + width as u8, + &mut ctx, + ) + .unwrap() + .into_array(); + let rhs = ConstantArray::new(T::from_counter(cap / 2), LEN).into_array(); + (array, rhs, ctx) +} + +/// Generate a compare benchmark over every valid bit width for one type. Valid widths are +/// `1..native_bits` - bit-packing requires the target width to be strictly narrower than the type. +macro_rules! bench_type { + ($modname:ident, $T:ty, $native_bits:expr) => { + mod $modname { + use super::*; + + #[divan::bench(args = 1..$native_bits)] + fn compare(bencher: Bencher, width: usize) { + let (array, rhs, mut ctx) = setup::<$T>(width); + bencher.counter(ItemsCount::new(LEN)).bench_local(|| { + array + .clone() + .binary(rhs.clone(), OP) + .unwrap() + .execute::(&mut ctx) + .unwrap() + }); + } + } + }; +} + +bench_type!(u8, u8, 8usize); +bench_type!(u16, u16, 16usize); +bench_type!(u32, u32, 32usize); +bench_type!(u64, u64, 64usize); +bench_type!(i8, i8, 8usize); +bench_type!(i16, i16, 16usize); +bench_type!(i32, i32, 32usize); +bench_type!(i64, i64, 64usize); From 48da899354e048d5e0d73bce36c2be6afc30716b Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 16:47:23 +0000 Subject: [PATCH 2/3] perf(fastlanes): fuse bit-packed compare into a transposed mask + untranspose Replace the unpack-then-compare streaming kernel for compare-against-constant with the FastLanes fused `unpack_cmp`: compare each value as it is unpacked, accumulating results straight into a transposed 1024-bit mask (`[u64; 16]`, one register-resident word per lane - no `[bool; 1024]`/`[T; 1024]` scratch), then a single SIMD `untranspose_bits` per block rotates the mask into logical row order, copied directly into the output bit buffer. Inline patches are spliced in afterwards; sliced (offset != 0) arrays fall back to the scalar streaming predicate. This requires the in-development FastLanes (PR #141 fused mask + PR #145 width-generic BMI2/VBMI untranspose), pinned via a git patch until released. Benchmarked end-to-end through the public compare path (`bitpack_compare_sweep`, 64Ki elements, all integer types and bit widths): fused beats the streaming baseline for every type and width - i8/u8 ~6.2-7.7x i16/u16 ~4.5-6.0x i32/u32 ~1.9-4.3x i64/u64 ~1.2-1.9x Signed-off-by: Joe Isaacs --- Cargo.lock | 10 +- Cargo.toml | 5 + .../src/bitpacking/compute/compare.rs | 42 ++++-- .../src/bitpacking/compute/compare_fused.rs | 136 ++++++++++++++++++ .../fastlanes/src/bitpacking/compute/mod.rs | 1 + 5 files changed, 182 insertions(+), 12 deletions(-) create mode 100644 encodings/fastlanes/src/bitpacking/compute/compare_fused.rs diff --git a/Cargo.lock b/Cargo.lock index ec89bf1161f..1dd83daba52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1645,6 +1645,12 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core_detect" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8f80099a98041a3d1622845c271458a2d73e688351bf3cb999266764b81d48" + [[package]] name = "cpubits" version = "0.1.1" @@ -3146,11 +3152,11 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastlanes" version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414cb755aee48ff7b0907995d2949c68c8c17900970076dff6a808e18e592d71" +source = "git+https://github.com/spiraldb/fastlanes?rev=6c10ea72cf693a17e994aa6401604ebedbeda453#6c10ea72cf693a17e994aa6401604ebedbeda453" dependencies = [ "arrayref", "const_for", + "core_detect", "num-traits", "paste", "seq-macro", diff --git a/Cargo.toml b/Cargo.toml index 9700d8d78ed..8dcbfadfe08 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -412,3 +412,8 @@ debug = false debug-assertions = false strip = "debuginfo" incremental = false + +# Pin to the in-development FastLanes branch (PR #141 fused [u64;16] cmp mask + +# PR #145 width-generic BMI/VBMI untranspose) until a release is cut. +[patch.crates-io] +fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" } diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs index f5c5c81c5cb..4e6755b9c36 100644 --- a/encodings/fastlanes/src/bitpacking/compute/compare.rs +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -8,6 +8,9 @@ //! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched //! indices with `predicate(patch_value)`. +use fastlanes::BitPacking; +use fastlanes::BitPackingCompare; +use fastlanes::FastLanesComparable; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -20,7 +23,8 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::BitPacked; -use crate::bitpacking::compute::stream_predicate::stream_predicate; +use crate::bitpacking::compute::compare_fused::stream_compare_fused; +use crate::unpack_iter::BitPacked as BitPackedIter; impl CompareKernel for BitPacked { fn compare( @@ -55,6 +59,15 @@ impl CompareKernel for BitPacked { } } +/// Compare every value against the constant via the fused FastLanes `unpack_cmp` kernel. +/// +/// `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive between +/// kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. +/// +/// The fused kernel (compare straight into a transposed 1024-bit mask, then a single SIMD +/// untranspose into logical row order) beats the unpack-then-compare streaming baseline for every +/// integer type and bit width - see `benches/bitpack_compare_fused.rs` (~6-7x for 8-bit lanes +/// down to ~1.2-1.9x for 64-bit lanes), so it is used unconditionally. fn compare_constant_typed( lhs: ArrayView<'_, BitPacked>, rhs: T, @@ -63,19 +76,28 @@ fn compare_constant_typed( ctx: &mut ExecutionCtx, ) -> VortexResult where - T: NativePType + Copy + crate::unpack_iter::BitPacked, + T: NativePType + BitPackedIter + FastLanesComparable, + ::Bitpacked: BitPacking + NativePType + BitPackingCompare, { - // `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive - // between kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. match operator { - CompareOperator::Eq => stream_predicate::(lhs, nullability, |v| v.is_eq(rhs), ctx), + CompareOperator::Eq => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_eq(b), ctx) + } CompareOperator::NotEq => { - stream_predicate::(lhs, nullability, |v| !v.is_eq(rhs), ctx) + stream_compare_fused::(lhs, rhs, nullability, |a, b| !a.is_eq(b), ctx) + } + CompareOperator::Lt => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_lt(b), ctx) + } + CompareOperator::Lte => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_le(b), ctx) + } + CompareOperator::Gt => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_gt(b), ctx) + } + CompareOperator::Gte => { + stream_compare_fused::(lhs, rhs, nullability, |a, b| a.is_ge(b), ctx) } - CompareOperator::Lt => stream_predicate::(lhs, nullability, |v| v.is_lt(rhs), ctx), - CompareOperator::Lte => stream_predicate::(lhs, nullability, |v| v.is_le(rhs), ctx), - CompareOperator::Gt => stream_predicate::(lhs, nullability, |v| v.is_gt(rhs), ctx), - CompareOperator::Gte => stream_predicate::(lhs, nullability, |v| v.is_ge(rhs), ctx), } } diff --git a/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs new file mode 100644 index 00000000000..053b63e6b3b --- /dev/null +++ b/encodings/fastlanes/src/bitpacking/compute/compare_fused.rs @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Fused compare kernel for [`BitPackedArray`] against a constant. +//! +//! Where [`super::stream_predicate`] unpacks a full 1024-element FastLanes block into a scratch +//! buffer and *then* folds a predicate over it, this path hands the comparison down into the +//! FastLanes [`BitPackingCompare::unchecked_unpack_cmp`] kernel, which compares each value against +//! the constant *as it is unpacked*, accumulating the boolean results straight into a 1024-bit +//! mask (`[u64; 16]`) in transposed FastLanes lane order - one register-resident word per lane, no +//! `[bool; 1024]` or `[T; 1024]` scratch. A single SIMD [`untranspose_bits`] per block then rotates +//! that mask into logical row order, which is copied directly into the output bit buffer. +//! +//! Only the full-chunk fast path uses the fused kernel. Sliced arrays (non-zero block offset) fall +//! back to the scalar streaming predicate, and inline patches are spliced in afterwards by +//! overwriting the bits at the patched indices with `cmp(patch_value, rhs)`. + +use fastlanes::BitPacking; +use fastlanes::BitPackingCompare; +use fastlanes::FastLanesComparable; +use fastlanes::untranspose_bits; +use num_traits::AsPrimitive; +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_unsigned_integer_ptype; +use vortex_buffer::BitBufferMut; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; + +use super::stream_predicate::stream_predicate; +use crate::BitPacked; +use crate::BitPackedArrayExt; +use crate::unpack_iter::BitPacked as BitPackedIter; + +const CHUNK_SIZE: usize = 1024; +/// `u64` words spanning one FastLanes block (1024 bits / 64). +const WORDS_PER_CHUNK: usize = CHUNK_SIZE / u64::BITS as usize; + +/// Compare the unpacked values of a [`BitPackedArray`] against `rhs` using the fused FastLanes +/// `unpack_cmp` kernel, producing a [`BoolArray`]. +/// +/// `cmp(value, rhs)` defines the predicate; it must be the total-order comparison matching the +/// requested operator (e.g. `|a, b| a.is_lt(b)`). +pub(super) fn stream_compare_fused( + array: ArrayView<'_, BitPacked>, + rhs: T, + nullability: Nullability, + cmp: F, + ctx: &mut ExecutionCtx, +) -> VortexResult +where + T: NativePType + BitPackedIter + FastLanesComparable, + ::Bitpacked: BitPacking + NativePType + BitPackingCompare, + F: Fn(T, T) -> bool + Copy, +{ + let len = array.len(); + let bit_width = BitPackedArrayExt::bit_width(&array) as usize; + let offset = BitPackedArrayExt::offset(&array) as usize; + + // The fused kernel consumes whole 1024-element blocks at a fixed packed width. A non-zero + // block offset (from slicing) or a degenerate width has no clean full-chunk form, so defer + // to the scalar streaming predicate, which handles every layout. + if offset != 0 || len == 0 || bit_width == 0 { + return stream_predicate::(array, nullability, move |v| cmp(v, rhs), ctx); + } + + let packed = BitPackedArrayExt::packed_slice::<::Bitpacked>(&array); + let elems_per_chunk = 128 * bit_width / size_of::<::Bitpacked>(); + let num_chunks = len.div_ceil(CHUNK_SIZE); + + let mut words: BufferMut = BufferMut::zeroed(len.div_ceil(u64::BITS as usize)); + { + let words = words.as_mut_slice(); + // Per block: fuse compare into a transposed 1024-bit mask, then untranspose into logical + // row order. The packed buffer is zero-padded out to a whole final block, so every chunk - + // including the trailing partial one - has exactly `elems_per_chunk` packed values; we just + // copy fewer than 16 words out of the last block's untransposed mask. + let mut transposed = [0u64; WORDS_PER_CHUNK]; + let mut logical = [0u64; WORDS_PER_CHUNK]; + for chunk in 0..num_chunks { + let packed_chunk = &packed[chunk * elems_per_chunk..][..elems_per_chunk]; + // SAFETY: `packed_chunk` is exactly `128 * bit_width / size_of::()` elements and + // `bit_width <= U::T`, satisfying `unchecked_unpack_cmp`'s contract. + unsafe { + <::Bitpacked as BitPackingCompare>::unchecked_unpack_cmp::< + T, + _, + >(bit_width, packed_chunk, &mut transposed, cmp, rhs); + } + untranspose_bits::<::Bitpacked>(&transposed, &mut logical); + + let block_start = chunk * CHUNK_SIZE; + let block_bits = (len - block_start).min(CHUNK_SIZE); + let word_off = chunk * WORDS_PER_CHUNK; + let n_words = block_bits.div_ceil(u64::BITS as usize); + words[word_off..][..n_words].copy_from_slice(&logical[..n_words]); + } + + // Patched indices hold placeholder packed values, so their fused result is meaningless; + // overwrite each with the comparison against the real patch value. + if let Some(p) = array.patches() { + let p_idx = p.indices().clone().execute::(ctx)?; + let p_val = p.values().clone().execute::(ctx)?; + let p_off = p.offset(); + match_each_unsigned_integer_ptype!(p_idx.ptype(), |I| { + let indices = p_idx.as_slice::(); + let values = p_val.as_slice::(); + for (&global, &value) in indices.iter().zip(values) { + let global: usize = global.as_(); + set_bit(words, global - p_off, cmp(value, rhs)); + } + }); + } + } + + let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), 0, len); + let validity = array.validity()?.union_nullability(nullability); + Ok(BoolArray::new(bits.freeze(), validity).into_array()) +} + +/// Branchlessly write a single bit in a packed `u64` word buffer: clear the bit, then OR in the +/// new value. Avoids a data-dependent branch per patch in the patch-fixup loop, and touches the +/// target word through a single bounds-checked `&mut`. +#[inline] +fn set_bit(words: &mut [u64], idx: usize, value: bool) { + let shift = idx % u64::BITS as usize; + let mask = 1u64 << shift; + let word = &mut words[idx / u64::BITS as usize]; + *word = (*word & !mask) | (u64::from(value) << shift); +} diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs index 518f8319eb1..06a4b4597b0 100644 --- a/encodings/fastlanes/src/bitpacking/compute/mod.rs +++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs @@ -4,6 +4,7 @@ mod between; mod cast; mod compare; +mod compare_fused; mod filter; pub(crate) mod is_constant; mod slice; From 08ed4a4a033adf2558850ab904deb130de5a3197 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 4 Jun 2026 10:31:27 +0000 Subject: [PATCH 3/3] ci(wasm): pin in-development FastLanes in the excluded wasm-test workspace wasm-test is excluded from the workspace, so it does not inherit the root [patch.crates-io] and was building vortex-fastlanes against published fastlanes 0.5.0 (old `[bool;1024]` unpack_cmp, no `untranspose_bits`) -> compile error in compare_fused.rs. Add the matching git `rev` pin here. Temporary, like the root pin: both are removed when a FastLanes release is cut and the version is bumped. Signed-off-by: Joe Isaacs --- wasm-test/Cargo.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/wasm-test/Cargo.toml b/wasm-test/Cargo.toml index 100c15970b0..20cf5f26cfd 100644 --- a/wasm-test/Cargo.toml +++ b/wasm-test/Cargo.toml @@ -16,3 +16,8 @@ vortex = { path = "../vortex", default-features = false } inherits = "dev" debug = "line-tables-only" incremental = false + +# wasm-test is excluded from the workspace, so it does not inherit the root +# [patch.crates-io]; pin the in-development FastLanes here too until a release is cut. +[patch.crates-io] +fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "6c10ea72cf693a17e994aa6401604ebedbeda453" }