From 5aa6e21d2cafa5b56945863d6a3d20256d3db514 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 18:07:39 +0000 Subject: [PATCH 1/7] Fused delta(for(bitpacking)) decode + bench Wire the new `fastlanes::Delta::unfor_undelta_pack` kernel into delta decompression. When a DeltaArray's `deltas` child is a FoR array (unsigned reference) wrapping a BitPacked array stored as full, zero-offset chunks with no patches, `delta_decompress` now takes a fully fused fast path (`try_fused_for_bitpacking` -> `decompress_fused`) that unpacks, applies the frame-of-reference, and inverts the delta encoding in a single pass per chunk before untransposing. All other shapes fall back to the existing generic path. A round-trip test builds the stack from non-strictly-increasing (monotone non-decreasing) u32/u64 columns and asserts the fused path is actually taken. The `delta_for_bitpack` divan bench compares the fused decode against an unfused baseline (materialize the FoR(bitpacked) deltas, then generic delta decode). On non-decreasing columns the fused path is ~1.3-2.0x faster, with the gap widening at larger sizes and for u64. A local-dev `[patch.crates-io]` points fastlanes at the sibling checkout that carries the kernel; it would be replaced by a published version bump. 
Signed-off-by: Joe Isaacs --- Cargo.lock | 2 - Cargo.toml | 6 + encodings/fastlanes/Cargo.toml | 4 + .../fastlanes/benches/delta_for_bitpack.rs | 131 ++++++++++++++++++ .../src/delta/array/delta_compress.rs | 63 +++++++++ .../src/delta/array/delta_decompress.rs | 129 +++++++++++++++++ 6 files changed, 333 insertions(+), 2 deletions(-) create mode 100644 encodings/fastlanes/benches/delta_for_bitpack.rs diff --git a/Cargo.lock b/Cargo.lock index ec89bf1161f..eedea62a562 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3146,8 +3146,6 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastlanes" version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414cb755aee48ff7b0907995d2949c68c8c17900970076dff6a808e18e592d71" dependencies = [ "arrayref", "const_for", diff --git a/Cargo.toml b/Cargo.toml index 9700d8d78ed..26585e812de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -412,3 +412,9 @@ debug = false debug-assertions = false strip = "debuginfo" incremental = false + +# Local development patch: build against the sibling fastlanes checkout which carries the +# `Delta::unfor_undelta_pack` fused kernel (branch claude/delta-bitpacking-fastlanes-V6mTZ). +# This would be replaced by a version bump once that kernel is published to crates.io. 
+[patch.crates-io] +fastlanes = { path = "../fastlanes" } diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 08c96c481d7..07c4fa2834b 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -64,6 +64,10 @@ required-features = ["_test-harness"] name = "bitpack_compare" harness = false +[[bench]] +name = "delta_for_bitpack" +harness = false + [[bench]] name = "cast_bitpacked" harness = false diff --git a/encodings/fastlanes/benches/delta_for_bitpack.rs b/encodings/fastlanes/benches/delta_for_bitpack.rs new file mode 100644 index 00000000000..0dd8af3e92d --- /dev/null +++ b/encodings/fastlanes/benches/delta_for_bitpack.rs @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compare decoding a `delta(for(bitpacking))` stack two ways: +//! * `fused` — the fused `Delta::unfor_undelta_pack` kernel (one pass over the packed buffer). +//! * `unfused` — materialize the FoR(bitpacked) deltas child to a primitive buffer, then run the +//! generic delta decode over it (two passes, two intermediate buffers). +//! +//! Both decode the same non-strictly-increasing (monotone non-decreasing) integer column. +//! +//! Run with `cargo bench -p vortex-fastlanes --bench delta_for_bitpack`. 
+ +#![expect(clippy::unwrap_used)] +#![expect(clippy::cast_possible_truncation)] + +use divan::Bencher; +use divan::counter::ItemsCount; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::primitive::PrimitiveArrayExt; +use vortex_array::match_each_unsigned_integer_ptype; +use vortex_fastlanes::bitpack_compress::bitpack_encode; +use vortex_fastlanes::{Delta, FoR, FoRArrayExt, delta_compress}; + +fn main() { + divan::main(); +} + +// Exact multiples of 1024 so the deltas bit-pack without a zero-padding wrap. +const LENS: &[usize] = &[64 * 1024, 1024 * 1024]; + +/// Build the `delta(for(bitpacking))` stack and return both the fused root array and the pieces +/// needed to reconstruct an unfused decode (the bases child and the FoR(bitpacked) deltas child). +fn build(values: PrimitiveArray) -> (ArrayRef, ArrayRef, ArrayRef, usize, ExecutionCtx) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let len = values.len(); + + let (bases, deltas) = delta_compress(&values, &mut ctx).unwrap(); + let for_deltas = FoR::encode(deltas).unwrap(); + let reference = for_deltas.reference_scalar().clone(); + let for_encoded = for_deltas + .encoded() + .clone() + .execute::(&mut ctx) + .unwrap(); + + // Smallest width that captures every value, so bit-packing introduces no patches. 
+ let unsigned = for_encoded.ptype().to_unsigned(); + let bit_width = match_each_unsigned_integer_ptype!(unsigned, |T| { + let reinterpreted = for_encoded.reinterpret_cast(unsigned); + let max = reinterpreted + .as_slice::() + .iter() + .copied() + .max() + .unwrap_or_default(); + (T::BITS - max.leading_zeros()) as u8 + }); + let bitpacked = bitpack_encode(&for_encoded, bit_width, None, &mut ctx).unwrap(); + + let bases = bases.into_array(); + let for_child = FoR::try_new(bitpacked.into_array(), reference) + .unwrap() + .into_array(); + let fused = Delta::try_new(bases.clone(), for_child.clone(), 0, len) + .unwrap() + .into_array(); + (fused, bases, for_child, len, ctx) +} + +fn u32_non_decreasing(len: usize) -> PrimitiveArray { + PrimitiveArray::from_iter((0..len as u32).map(|i| i / 4)) +} + +fn u64_non_decreasing(len: usize) -> PrimitiveArray { + PrimitiveArray::from_iter((0..len as u64).map(|i| (i / 6) * 3)) +} + +#[divan::bench(args = LENS)] +fn fused_u32(bencher: Bencher, len: usize) { + let (fused, _, _, n, mut ctx) = build(u32_non_decreasing(len)); + bencher + .counter(ItemsCount::new(n)) + .bench_local(|| fused.clone().execute::(&mut ctx).unwrap()); +} + +#[divan::bench(args = LENS)] +fn unfused_u32(bencher: Bencher, len: usize) { + let (_, bases, for_child, n, mut ctx) = build(u32_non_decreasing(len)); + bencher.counter(ItemsCount::new(n)).bench_local(|| { + // Pass 1: unpack + un-FoR the deltas into a materialized primitive buffer. + let deltas = for_child + .clone() + .execute::(&mut ctx) + .unwrap(); + // Pass 2: generic delta decode (un-delta + untranspose) over the materialized deltas. 
+ Delta::try_new(bases.clone(), deltas.into_array(), 0, n) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + }); +} + +#[divan::bench(args = LENS)] +fn fused_u64(bencher: Bencher, len: usize) { + let (fused, _, _, n, mut ctx) = build(u64_non_decreasing(len)); + bencher + .counter(ItemsCount::new(n)) + .bench_local(|| fused.clone().execute::(&mut ctx).unwrap()); +} + +#[divan::bench(args = LENS)] +fn unfused_u64(bencher: Bencher, len: usize) { + let (_, bases, for_child, n, mut ctx) = build(u64_non_decreasing(len)); + bencher.counter(ItemsCount::new(n)).bench_local(|| { + let deltas = for_child + .clone() + .execute::(&mut ctx) + .unwrap(); + Delta::try_new(bases.clone(), deltas.into_array(), 0, n) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + }); +} diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs index e35778ad29e..7e195ad2276 100644 --- a/encodings/fastlanes/src/delta/array/delta_compress.rs +++ b/encodings/fastlanes/src/delta/array/delta_compress.rs @@ -105,20 +105,83 @@ mod tests { use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::assert_arrays_eq; + use vortex_array::match_each_unsigned_integer_ptype; use vortex_array::session::ArraySession; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_session::VortexSession; use crate::Delta; + use crate::FoR; use crate::bitpack_compress::bitpack_encode; use crate::delta::array::delta_decompress::delta_decompress; use crate::delta_compress; + use crate::r#for::FoRArrayExt; static SESSION: LazyLock = LazyLock::new(|| VortexSession::empty().with::()); + /// Build a `delta(for(bitpacking))` stack from `array`: delta-encode, then FoR + bit-pack the + /// resulting deltas. This is the exact tree the fused decode path in `delta_decompress` + /// recognizes. 
+ fn build_delta_for_bitpacked( + array: &PrimitiveArray, + ctx: &mut vortex_array::ExecutionCtx, + ) -> VortexResult { + let (bases, deltas) = delta_compress(array, ctx)?; + let for_deltas = FoR::encode(deltas)?; + let reference = for_deltas.reference_scalar().clone(); + let for_encoded = for_deltas + .encoded() + .clone() + .execute::(ctx)?; + // Pick the smallest width that captures every value so bit-packing introduces no patches, + // keeping the array on the fused decode path. + let unsigned = for_encoded.ptype().to_unsigned(); + let bit_width = match_each_unsigned_integer_ptype!(unsigned, |T| { + let reinterpreted = for_encoded.reinterpret_cast(unsigned); + let max = reinterpreted + .as_slice::() + .iter() + .copied() + .max() + .unwrap_or_default(); + (T::BITS - max.leading_zeros()) as u8 + }); + let bitpacked = bitpack_encode(&for_encoded, bit_width, None, ctx)?; + let fused_for = FoR::try_new(bitpacked.into_array(), reference)?; + Delta::try_new(bases.into_array(), fused_for.into_array(), 0, array.len()) + } + + /// Non-strictly-increasing (monotone non-decreasing) integer columns. Consecutive equal runs + /// make many deltas zero, so the per-lane FoR reference over the deltas is small and the deltas + /// bit-pack tightly — exactly the shape that produces a delta(for(bitpacking)) stack. + /// + /// Lengths are exact multiples of 1024 so there is no zero-padding tail. (Padding can make a + /// lane straddle the real/zero boundary, producing a wrapping delta that forces full width.) 
+ #[rstest] + #[case::u32_non_decreasing((0u32..20_480).map(|i| i / 3).collect())] + #[case::u64_non_decreasing((0u64..20_480).map(|i| (i / 5) * 2).collect())] + #[case::u32_long_runs((0u32..20_480).map(|i| i / 100).collect())] + fn fused_for_bitpacking_roundtrip(#[case] array: PrimitiveArray) -> VortexResult<()> { + use crate::delta::array::delta_decompress::try_fused_for_bitpacking; + + let mut ctx = SESSION.create_execution_ctx(); + let stack = build_delta_for_bitpacked(&array, &mut ctx)?; + + // The stack must take the fused decode path, not silently fall back to the generic one. + assert!( + try_fused_for_bitpacking(&stack, &mut ctx)?.is_some(), + "delta(for(bitpacking)) must be recognized by the fused decode path" + ); + + let decompressed = stack.into_array().execute::(&mut ctx)?; + assert_arrays_eq!(decompressed, array); + Ok(()) + } + #[rstest] #[case::u32((0u32..10_000).collect())] #[case::u8((0..10_000).map(|i| (i % (u8::MAX as i32)) as u8).collect())] diff --git a/encodings/fastlanes/src/delta/array/delta_decompress.rs b/encodings/fastlanes/src/delta/array/delta_decompress.rs index fe2567e63c7..948d562c4e0 100644 --- a/encodings/fastlanes/src/delta/array/delta_decompress.rs +++ b/encodings/fastlanes/src/delta/array/delta_decompress.rs @@ -15,16 +15,27 @@ use vortex_array::dtype::NativePType; use vortex_array::match_each_unsigned_integer_ptype; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +use vortex_error::VortexExpect; use vortex_error::VortexResult; +use crate::BitPacked; +use crate::BitPackedArrayExt; use crate::DeltaArray; +use crate::FoR; use crate::bit_transpose::untranspose_validity; use crate::delta::array::DeltaArrayExt; +use crate::r#for::FoRArrayExt; pub fn delta_decompress( array: &DeltaArray, ctx: &mut ExecutionCtx, ) -> VortexResult { + // Fast path: a fully fused `delta(for(bitpacking))` decode that unpacks, applies the + // frame-of-reference, and inverts the delta encoding in a single pass over the packed buffer. 
+ if let Some(decoded) = try_fused_for_bitpacking(array, ctx)? { + return Ok(decoded); + } + let bases = array.bases().clone().execute::(ctx)?; let deltas = array.deltas().clone().execute::(ctx)?; @@ -52,6 +63,124 @@ pub fn delta_decompress( Ok(decoded.reinterpret_cast(original_ptype)) } +/// Attempts the fused `delta(for(bitpacking))` decode. +/// +/// Returns `Some` when the `deltas` child is a [`FoR`] array with an unsigned reference wrapping a +/// [`BitPacked`] array stored as full, zero-offset chunks with no patches. In that case the packed +/// deltas are unpacked, FoR-decoded, and un-delta'd in a single pass via +/// [`Delta::unchecked_unfor_undelta_pack`]. Otherwise returns `None` so the caller falls back to the +/// generic path. +pub(crate) fn try_fused_for_bitpacking( + array: &DeltaArray, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let Some(for_) = array.deltas().as_opt::() else { + return Ok(None); + }; + // The fused kernel works in unsigned wrapping arithmetic; a signed reference would need a + // bit-reinterpret that the generic path already handles correctly. + if !for_.reference_scalar().dtype().is_unsigned_int() { + return Ok(None); + } + let Some(bp) = for_.encoded().as_opt::() else { + return Ok(None); + }; + // Patches and sliced (non-zero offset) bit-packed children are left to the generic path. 
+ if bp.patches().is_some() || bp.offset() != 0 { + return Ok(None); + } + + let bases = array.bases().clone().execute::(ctx)?; + + let start = array.offset(); + let end = start + array.len(); + + let validity = untranspose_validity(&bp.validity()?, ctx)?; + let validity = validity.slice(start..end)?; + + let original_ptype = for_.ptype(); + let unsigned_ptype = original_ptype.to_unsigned(); + let bases = bases.reinterpret_cast(unsigned_ptype); + + let decoded = match_each_unsigned_integer_ptype!(unsigned_ptype, |T| { + const LANES: usize = T::LANES; + + let reference = for_ + .reference_scalar() + .as_primitive() + .as_::() + .vortex_expect("FoR reference must be non-null and unsigned"); + let packed = bp.packed_slice::(); + + let buffer = decompress_fused::( + bases.as_slice(), + packed, + bp.bit_width() as usize, + reference, + bp.len(), + ); + let buffer = buffer.slice(start..end); + + PrimitiveArray::new(buffer, validity) + }); + + Ok(Some(decoded.reinterpret_cast(original_ptype))) +} + +/// Fused low-level decode of bit-packed, FoR-encoded deltas. +/// +/// `packed` holds `num_values / 1024` chunks each of `128 * bit_width / size_of::()` packed +/// words. Each chunk is unpacked, FoR-decoded (wrapping-add `reference`) and un-delta'd in a single +/// pass, then untransposed back into logical order. 
+pub(crate) fn decompress_fused( + bases: &[T], + packed: &[T], + bit_width: usize, + reference: T, + num_values: usize, +) -> Buffer +where + T: NativePType + Delta + Transpose, +{ + debug_assert!( + num_values.is_multiple_of(1024), + "bit-packed deltas must be padded to a multiple of 1024" + ); + let num_chunks = num_values / 1024; + let elems_per_chunk = 128 * bit_width / size_of::(); + debug_assert_eq!(packed.len(), num_chunks * elems_per_chunk); + assert!(bases.len() >= num_chunks * LANES); + + let mut output = BufferMut::with_capacity(num_values); + let (output_chunks, _) = output.spare_capacity_mut().as_chunks_mut::<1024>(); + + let mut transposed: [T; 1024] = [T::default(); 1024]; + for (i, output_chunk) in output_chunks.iter_mut().enumerate() { + let packed_chunk = &packed[i * elems_per_chunk..(i + 1) * elems_per_chunk]; + let base = &bases[i * LANES..(i + 1) * LANES]; + + // SAFETY: `packed_chunk` has length `128 * bit_width / size_of::()`, `base` has length + // `LANES`, and `transposed` has length 1024, satisfying the kernel's contract. + unsafe { + Delta::unchecked_unfor_undelta_pack( + bit_width, + packed_chunk, + reference, + base, + &mut transposed, + ); + } + + Transpose::untranspose(&transposed, unsafe { + mem::transmute::<&mut [MaybeUninit; 1024], &mut [T; 1024]>(output_chunk) + }); + } + + unsafe { output.set_len(num_values) }; + + output.freeze() +} + /// Performs the low-level delta decompression on primitive values. /// /// All chunks must be full 1024-element chunks (deltas length must be a multiple of 1024). From ba828aec09852f2dc69274a3e793f592ece4cf32 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 18:27:07 +0000 Subject: [PATCH 2/7] Gate fused delta(for(bitpacking)) decode behind unstable_encodings Put the fused decode fast path (`try_fused_for_bitpacking` / `decompress_fused`), its imports, the round-trip test, and the bench behind a new `unstable_encodings` feature on vortex-fastlanes that enables `fastlanes/unstable`. 
With the feature off (the default) the kernel is compiled out entirely, so there is no `.text` cost; vortex-btrblocks' existing `unstable_encodings` feature now propagates it. Signed-off-by: Joe Isaacs --- encodings/fastlanes/Cargo.toml | 6 +++++- encodings/fastlanes/src/delta/array/delta_compress.rs | 5 +++++ .../fastlanes/src/delta/array/delta_decompress.rs | 10 ++++++---- vortex-btrblocks/Cargo.toml | 1 + 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 07c4fa2834b..d344063bc81 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -36,10 +36,13 @@ rand = { workspace = true } rstest = { workspace = true } vortex-alp = { path = "../alp" } vortex-array = { workspace = true, features = ["_test-harness"] } -vortex-fastlanes = { path = ".", features = ["_test-harness"] } +vortex-fastlanes = { path = ".", features = ["_test-harness", "unstable_encodings"] } [features] _test-harness = ["dep:rand"] +# Unstable encodings/decoders with no stability guarantee. Enables the fused +# delta(for(bitpacking)) decode kernel from the fastlanes crate. 
+unstable_encodings = ["fastlanes/unstable"] [[bench]] name = "bitpacking_take" @@ -67,6 +70,7 @@ harness = false [[bench]] name = "delta_for_bitpack" harness = false +required-features = ["unstable_encodings"] [[bench]] name = "cast_bitpacked" diff --git a/encodings/fastlanes/src/delta/array/delta_compress.rs b/encodings/fastlanes/src/delta/array/delta_compress.rs index 7e195ad2276..1db5b88ca5a 100644 --- a/encodings/fastlanes/src/delta/array/delta_compress.rs +++ b/encodings/fastlanes/src/delta/array/delta_compress.rs @@ -107,6 +107,7 @@ mod tests { use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::assert_arrays_eq; + #[cfg(feature = "unstable_encodings")] use vortex_array::match_each_unsigned_integer_ptype; use vortex_array::session::ArraySession; use vortex_error::VortexExpect; @@ -114,10 +115,12 @@ mod tests { use vortex_session::VortexSession; use crate::Delta; + #[cfg(feature = "unstable_encodings")] use crate::FoR; use crate::bitpack_compress::bitpack_encode; use crate::delta::array::delta_decompress::delta_decompress; use crate::delta_compress; + #[cfg(feature = "unstable_encodings")] use crate::r#for::FoRArrayExt; static SESSION: LazyLock = @@ -126,6 +129,7 @@ mod tests { /// Build a `delta(for(bitpacking))` stack from `array`: delta-encode, then FoR + bit-pack the /// resulting deltas. This is the exact tree the fused decode path in `delta_decompress` /// recognizes. + #[cfg(feature = "unstable_encodings")] fn build_delta_for_bitpacked( array: &PrimitiveArray, ctx: &mut vortex_array::ExecutionCtx, @@ -161,6 +165,7 @@ mod tests { /// /// Lengths are exact multiples of 1024 so there is no zero-padding tail. (Padding can make a /// lane straddle the real/zero boundary, producing a wrapping delta that forces full width.) 
+ #[cfg(feature = "unstable_encodings")] #[rstest] #[case::u32_non_decreasing((0u32..20_480).map(|i| i / 3).collect())] #[case::u64_non_decreasing((0u64..20_480).map(|i| (i / 5) * 2).collect())] diff --git a/encodings/fastlanes/src/delta/array/delta_decompress.rs b/encodings/fastlanes/src/delta/array/delta_decompress.rs index 948d562c4e0..612d3ca1a52 100644 --- a/encodings/fastlanes/src/delta/array/delta_decompress.rs +++ b/encodings/fastlanes/src/delta/array/delta_decompress.rs @@ -15,16 +15,15 @@ use vortex_array::dtype::NativePType; use vortex_array::match_each_unsigned_integer_ptype; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +#[cfg(feature = "unstable_encodings")] use vortex_error::VortexExpect; use vortex_error::VortexResult; -use crate::BitPacked; -use crate::BitPackedArrayExt; use crate::DeltaArray; -use crate::FoR; use crate::bit_transpose::untranspose_validity; use crate::delta::array::DeltaArrayExt; -use crate::r#for::FoRArrayExt; +#[cfg(feature = "unstable_encodings")] +use crate::{BitPacked, BitPackedArrayExt, FoR, r#for::FoRArrayExt}; pub fn delta_decompress( array: &DeltaArray, @@ -32,6 +31,7 @@ pub fn delta_decompress( ) -> VortexResult { // Fast path: a fully fused `delta(for(bitpacking))` decode that unpacks, applies the // frame-of-reference, and inverts the delta encoding in a single pass over the packed buffer. + #[cfg(feature = "unstable_encodings")] if let Some(decoded) = try_fused_for_bitpacking(array, ctx)? { return Ok(decoded); } @@ -70,6 +70,7 @@ pub fn delta_decompress( /// deltas are unpacked, FoR-decoded, and un-delta'd in a single pass via /// [`Delta::unchecked_unfor_undelta_pack`]. Otherwise returns `None` so the caller falls back to the /// generic path. 
+#[cfg(feature = "unstable_encodings")] pub(crate) fn try_fused_for_bitpacking( array: &DeltaArray, ctx: &mut ExecutionCtx, @@ -132,6 +133,7 @@ pub(crate) fn try_fused_for_bitpacking( /// `packed` holds `num_values / 1024` chunks each of `128 * bit_width / size_of::()` packed /// words. Each chunk is unpacked, FoR-decoded (wrapping-add `reference`) and un-delta'd in a single /// pass, then untransposed back into logical order. +#[cfg(feature = "unstable_encodings")] pub(crate) fn decompress_fused( bases: &[T], packed: &[T], diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 1adb6508828..8dd7a6c92e0 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -53,6 +53,7 @@ unstable_encodings = [ "dep:vortex-tensor", "dep:vortex-onpair", "vortex-zstd?/unstable_encodings", + "vortex-fastlanes/unstable_encodings", ] pco = ["dep:pco", "dep:vortex-pco"] zstd = ["dep:vortex-zstd"] From bdd3669cce48262310aa526331ae32a4c4351d6f Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 18:33:59 +0000 Subject: [PATCH 3/7] Patch fastlanes via git branch instead of local path The `[patch.crates-io]` previously pointed at a sibling `../fastlanes` checkout, which does not exist in CI and broke workspace resolution for every job. Point it at the pushed fastlanes branch (spiraldb/fastlanes#140) so the workspace resolves and both default and all-features builds compile. To be replaced by a published fastlanes version bump once that PR merges. 
Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + Cargo.toml | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eedea62a562..91475e4d825 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3146,6 +3146,7 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastlanes" version = "0.5.0" +source = "git+https://github.com/spiraldb/fastlanes?branch=claude%2Fdelta-bitpacking-fastlanes-V6mTZ#e64ed124d7ea907660e7f0630a9b9d73ab26e655" dependencies = [ "arrayref", "const_for", diff --git a/Cargo.toml b/Cargo.toml index 26585e812de..b1b76a8d109 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -413,8 +413,8 @@ debug-assertions = false strip = "debuginfo" incremental = false -# Local development patch: build against the sibling fastlanes checkout which carries the -# `Delta::unfor_undelta_pack` fused kernel (branch claude/delta-bitpacking-fastlanes-V6mTZ). -# This would be replaced by a version bump once that kernel is published to crates.io. +# Temporary patch: build against the fastlanes branch carrying the unstable +# `Delta::unfor_undelta_pack` fused kernel (spiraldb/fastlanes#140). Replace with a published +# fastlanes version bump once that PR merges and releases. [patch.crates-io] -fastlanes = { path = "../fastlanes" } +fastlanes = { git = "https://github.com/spiraldb/fastlanes", branch = "claude/delta-bitpacking-fastlanes-V6mTZ" } From 485b71c3e3f20b6240812511b7439b407722f325 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 18:39:25 +0000 Subject: [PATCH 4/7] Apply nightly rustfmt import granularity Split the combined `use` statements into one item per line and regroup, matching the repo's nightly rustfmt config (imports_granularity = "Item", group_imports = "StdExternalCrate"). No functional change. 
Signed-off-by: Joe Isaacs --- encodings/fastlanes/benches/delta_for_bitpack.rs | 5 ++++- encodings/fastlanes/src/delta/array/delta_decompress.rs | 8 +++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/encodings/fastlanes/benches/delta_for_bitpack.rs b/encodings/fastlanes/benches/delta_for_bitpack.rs index 0dd8af3e92d..eebb4d56a28 100644 --- a/encodings/fastlanes/benches/delta_for_bitpack.rs +++ b/encodings/fastlanes/benches/delta_for_bitpack.rs @@ -23,8 +23,11 @@ use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::match_each_unsigned_integer_ptype; +use vortex_fastlanes::Delta; +use vortex_fastlanes::FoR; +use vortex_fastlanes::FoRArrayExt; use vortex_fastlanes::bitpack_compress::bitpack_encode; -use vortex_fastlanes::{Delta, FoR, FoRArrayExt, delta_compress}; +use vortex_fastlanes::delta_compress; fn main() { divan::main(); diff --git a/encodings/fastlanes/src/delta/array/delta_decompress.rs b/encodings/fastlanes/src/delta/array/delta_decompress.rs index 612d3ca1a52..42ab63cb654 100644 --- a/encodings/fastlanes/src/delta/array/delta_decompress.rs +++ b/encodings/fastlanes/src/delta/array/delta_decompress.rs @@ -19,11 +19,17 @@ use vortex_buffer::BufferMut; use vortex_error::VortexExpect; use vortex_error::VortexResult; +#[cfg(feature = "unstable_encodings")] +use crate::BitPacked; +#[cfg(feature = "unstable_encodings")] +use crate::BitPackedArrayExt; use crate::DeltaArray; +#[cfg(feature = "unstable_encodings")] +use crate::FoR; use crate::bit_transpose::untranspose_validity; use crate::delta::array::DeltaArrayExt; #[cfg(feature = "unstable_encodings")] -use crate::{BitPacked, BitPackedArrayExt, FoR, r#for::FoRArrayExt}; +use crate::r#for::FoRArrayExt; pub fn delta_decompress( array: &DeltaArray, From 1d5c569e88b727584e8f513770b475e2a78e9512 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 20:05:54 +0000 Subject: [PATCH 5/7] Track 
fastlanes feature rename to delta_for_bitpacking Point `unstable_encodings` at `fastlanes/delta_for_bitpacking` and bump the patched fastlanes git revision accordingly. Signed-off-by: Joe Isaacs --- Cargo.lock | 2 +- encodings/fastlanes/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 91475e4d825..17503ef3967 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3146,7 +3146,7 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastlanes" version = "0.5.0" -source = "git+https://github.com/spiraldb/fastlanes?branch=claude%2Fdelta-bitpacking-fastlanes-V6mTZ#e64ed124d7ea907660e7f0630a9b9d73ab26e655" +source = "git+https://github.com/spiraldb/fastlanes?branch=claude%2Fdelta-bitpacking-fastlanes-V6mTZ#267717cd72e8b6f0ed0e5321ae3fc785fa433058" dependencies = [ "arrayref", "const_for", diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index d344063bc81..7fe3dc69305 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -42,7 +42,7 @@ vortex-fastlanes = { path = ".", features = ["_test-harness", "unstable_encoding _test-harness = ["dep:rand"] # Unstable encodings/decoders with no stability guarantee. Enables the fused # delta(for(bitpacking)) decode kernel from the fastlanes crate. -unstable_encodings = ["fastlanes/unstable"] +unstable_encodings = ["fastlanes/delta_for_bitpacking"] [[bench]] name = "bitpacking_take" From 767e03bdf2692ff5eda92a00a1afbf9bdce172e1 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 20:31:03 +0000 Subject: [PATCH 6/7] Bench fused vs real generic Vortex decode on the same array Expose `delta_decompress` / `delta_decompress_generic` under the `_test-harness` feature and rewrite the bench so both arms call the real decode entry points on the identical delta(for(bitpacking)) array: `fused` (the unfor_undelta_pack fast path) vs `current` (the pre-fusion generic decode). 
The previous baseline reused a cached intermediate and understated the gap; the cold-vs-cold comparison shows ~4.6x (u32 64Ki) to ~6.9x (u64 1Mi), dominated by avoiding the intermediate FoR-decoded PrimitiveArray materialization rather than kernel speed. Signed-off-by: Joe Isaacs --- encodings/fastlanes/Cargo.toml | 2 +- .../fastlanes/benches/delta_for_bitpack.rs | 77 ++++++++----------- .../src/delta/array/delta_decompress.rs | 10 +++ encodings/fastlanes/src/delta/mod.rs | 4 + 4 files changed, 45 insertions(+), 48 deletions(-) diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml index 7fe3dc69305..3f467461f11 100644 --- a/encodings/fastlanes/Cargo.toml +++ b/encodings/fastlanes/Cargo.toml @@ -70,7 +70,7 @@ harness = false [[bench]] name = "delta_for_bitpack" harness = false -required-features = ["unstable_encodings"] +required-features = ["unstable_encodings", "_test-harness"] [[bench]] name = "cast_bitpacked" diff --git a/encodings/fastlanes/benches/delta_for_bitpack.rs b/encodings/fastlanes/benches/delta_for_bitpack.rs index eebb4d56a28..950ba840d7e 100644 --- a/encodings/fastlanes/benches/delta_for_bitpack.rs +++ b/encodings/fastlanes/benches/delta_for_bitpack.rs @@ -1,21 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Compare decoding a `delta(for(bitpacking))` stack two ways: -//! * `fused` — the fused `Delta::unfor_undelta_pack` kernel (one pass over the packed buffer). -//! * `unfused` — materialize the FoR(bitpacked) deltas child to a primitive buffer, then run the -//! generic delta decode over it (two passes, two intermediate buffers). +//! A/B decode of a `delta(for(bitpacking))` column, both arms going through the real Vortex decode +//! entry points on the *same* array: +//! * `fused` — `delta_decompress` with the fused `unfor_undelta_pack` fast path. +//! * `current` — `delta_decompress_generic`, the path Vortex took before the fused kernel: +//! 
materialize the FoR(bitpacked) deltas child, then un-delta + untranspose. //! -//! Both decode the same non-strictly-increasing (monotone non-decreasing) integer column. +//! The column is non-strictly-increasing (monotone non-decreasing) so it compresses as +//! delta(for(bitpacking)). //! -//! Run with `cargo bench -p vortex-fastlanes --bench delta_for_bitpack`. +//! Run with `cargo bench -p vortex-fastlanes --bench delta_for_bitpack +//! --features unstable_encodings,_test-harness`. #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] use divan::Bencher; use divan::counter::ItemsCount; -use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; @@ -24,10 +26,13 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::primitive::PrimitiveArrayExt; use vortex_array::match_each_unsigned_integer_ptype; use vortex_fastlanes::Delta; +use vortex_fastlanes::DeltaArray; use vortex_fastlanes::FoR; use vortex_fastlanes::FoRArrayExt; use vortex_fastlanes::bitpack_compress::bitpack_encode; use vortex_fastlanes::delta_compress; +use vortex_fastlanes::delta_decompress; +use vortex_fastlanes::delta_decompress_generic; fn main() { divan::main(); @@ -36,9 +41,8 @@ fn main() { // Exact multiples of 1024 so the deltas bit-pack without a zero-padding wrap. const LENS: &[usize] = &[64 * 1024, 1024 * 1024]; -/// Build the `delta(for(bitpacking))` stack and return both the fused root array and the pieces -/// needed to reconstruct an unfused decode (the bases child and the FoR(bitpacked) deltas child). -fn build(values: PrimitiveArray) -> (ArrayRef, ArrayRef, ArrayRef, usize, ExecutionCtx) { +/// Build the `delta(for(bitpacking))` stack for `values`. 
+fn build(values: PrimitiveArray) -> (DeltaArray, usize, ExecutionCtx) { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let len = values.len(); @@ -65,14 +69,11 @@ fn build(values: PrimitiveArray) -> (ArrayRef, ArrayRef, ArrayRef, usize, Execut }); let bitpacked = bitpack_encode(&for_encoded, bit_width, None, &mut ctx).unwrap(); - let bases = bases.into_array(); let for_child = FoR::try_new(bitpacked.into_array(), reference) .unwrap() .into_array(); - let fused = Delta::try_new(bases.clone(), for_child.clone(), 0, len) - .unwrap() - .into_array(); - (fused, bases, for_child, len, ctx) + let array = Delta::try_new(bases.into_array(), for_child, 0, len).unwrap(); + (array, len, ctx) } fn u32_non_decreasing(len: usize) -> PrimitiveArray { @@ -85,50 +86,32 @@ fn u64_non_decreasing(len: usize) -> PrimitiveArray { #[divan::bench(args = LENS)] fn fused_u32(bencher: Bencher, len: usize) { - let (fused, _, _, n, mut ctx) = build(u32_non_decreasing(len)); + let (array, n, mut ctx) = build(u32_non_decreasing(len)); bencher .counter(ItemsCount::new(n)) - .bench_local(|| fused.clone().execute::(&mut ctx).unwrap()); + .bench_local(|| delta_decompress(&array, &mut ctx).unwrap()); } #[divan::bench(args = LENS)] -fn unfused_u32(bencher: Bencher, len: usize) { - let (_, bases, for_child, n, mut ctx) = build(u32_non_decreasing(len)); - bencher.counter(ItemsCount::new(n)).bench_local(|| { - // Pass 1: unpack + un-FoR the deltas into a materialized primitive buffer. - let deltas = for_child - .clone() - .execute::(&mut ctx) - .unwrap(); - // Pass 2: generic delta decode (un-delta + untranspose) over the materialized deltas. 
- Delta::try_new(bases.clone(), deltas.into_array(), 0, n) - .unwrap() - .into_array() - .execute::(&mut ctx) - .unwrap() - }); +fn current_u32(bencher: Bencher, len: usize) { + let (array, n, mut ctx) = build(u32_non_decreasing(len)); + bencher + .counter(ItemsCount::new(n)) + .bench_local(|| delta_decompress_generic(&array, &mut ctx).unwrap()); } #[divan::bench(args = LENS)] fn fused_u64(bencher: Bencher, len: usize) { - let (fused, _, _, n, mut ctx) = build(u64_non_decreasing(len)); + let (array, n, mut ctx) = build(u64_non_decreasing(len)); bencher .counter(ItemsCount::new(n)) - .bench_local(|| fused.clone().execute::(&mut ctx).unwrap()); + .bench_local(|| delta_decompress(&array, &mut ctx).unwrap()); } #[divan::bench(args = LENS)] -fn unfused_u64(bencher: Bencher, len: usize) { - let (_, bases, for_child, n, mut ctx) = build(u64_non_decreasing(len)); - bencher.counter(ItemsCount::new(n)).bench_local(|| { - let deltas = for_child - .clone() - .execute::(&mut ctx) - .unwrap(); - Delta::try_new(bases.clone(), deltas.into_array(), 0, n) - .unwrap() - .into_array() - .execute::(&mut ctx) - .unwrap() - }); +fn current_u64(bencher: Bencher, len: usize) { + let (array, n, mut ctx) = build(u64_non_decreasing(len)); + bencher + .counter(ItemsCount::new(n)) + .bench_local(|| delta_decompress_generic(&array, &mut ctx).unwrap()); } diff --git a/encodings/fastlanes/src/delta/array/delta_decompress.rs b/encodings/fastlanes/src/delta/array/delta_decompress.rs index 42ab63cb654..07d7514a3ab 100644 --- a/encodings/fastlanes/src/delta/array/delta_decompress.rs +++ b/encodings/fastlanes/src/delta/array/delta_decompress.rs @@ -42,6 +42,16 @@ pub fn delta_decompress( return Ok(decoded); } + delta_decompress_generic(array, ctx) +} + +/// The generic delta decode: fully materialize the `deltas` child, then invert the delta encoding +/// (un-delta + untranspose). 
This is the path taken for every stack that the fused fast path does +/// not recognize, and the one Vortex used before the fused `delta(for(bitpacking))` kernel existed. +pub fn delta_decompress_generic( + array: &DeltaArray, + ctx: &mut ExecutionCtx, +) -> VortexResult<PrimitiveArray> { let bases = array.bases().clone().execute::<PrimitiveArray>(ctx)?; let deltas = array.deltas().clone().execute::<PrimitiveArray>(ctx)?; diff --git a/encodings/fastlanes/src/delta/mod.rs b/encodings/fastlanes/src/delta/mod.rs index 52ea9b33574..e5774f7f654 100644 --- a/encodings/fastlanes/src/delta/mod.rs +++ b/encodings/fastlanes/src/delta/mod.rs @@ -4,6 +4,10 @@ mod array; pub use array::DeltaData; pub use array::delta_compress::delta_compress; +// Exposed for benchmarks: decode entry points so a bench can A/B the fused fast path against the +// generic (pre-fusion) decode on the same array. +#[cfg(feature = "_test-harness")] +pub use array::delta_decompress::{delta_decompress, delta_decompress_generic}; mod compute; From 1565f71b2aec486ad2a9e07f8c82c495ea65310c Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 20:37:04 +0000 Subject: [PATCH 7/7] Pin fastlanes patch to rev 267717c Reference the exact fastlanes revision (spiraldb/fastlanes#140) instead of the branch for reproducibility. 
Signed-off-by: Joe Isaacs --- Cargo.lock | 2 +- Cargo.toml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 17503ef3967..e66d4c48bc4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3146,7 +3146,7 @@ checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "fastlanes" version = "0.5.0" -source = "git+https://github.com/spiraldb/fastlanes?branch=claude%2Fdelta-bitpacking-fastlanes-V6mTZ#267717cd72e8b6f0ed0e5321ae3fc785fa433058" +source = "git+https://github.com/spiraldb/fastlanes?rev=267717cd72e8b6f0ed0e5321ae3fc785fa433058#267717cd72e8b6f0ed0e5321ae3fc785fa433058" dependencies = [ "arrayref", "const_for", diff --git a/Cargo.toml b/Cargo.toml index b1b76a8d109..72d24a43ad3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -413,8 +413,8 @@ debug-assertions = false strip = "debuginfo" incremental = false -# Temporary patch: build against the fastlanes branch carrying the unstable -# `Delta::unfor_undelta_pack` fused kernel (spiraldb/fastlanes#140). Replace with a published -# fastlanes version bump once that PR merges and releases. +# Temporary patch: build against the fastlanes revision carrying the `delta_for_bitpacking` +# fused `Delta::unfor_undelta_pack` kernel (spiraldb/fastlanes#140, rev 267717c). Replace with a +# published fastlanes version bump once that PR merges and releases. [patch.crates-io] -fastlanes = { git = "https://github.com/spiraldb/fastlanes", branch = "claude/delta-bitpacking-fastlanes-V6mTZ" } +fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "267717cd72e8b6f0ed0e5321ae3fc785fa433058" }