Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -412,3 +412,9 @@ debug = false
debug-assertions = false
strip = "debuginfo"
incremental = false

# Temporary patch: build against the fastlanes revision that adds the `delta_for_bitpacking`
# feature and its fused `Delta::unfor_undelta_pack` kernel (spiraldb/fastlanes#140, rev 267717c).
# Replace with a published fastlanes version bump once that PR merges and releases.
[patch.crates-io]
fastlanes = { git = "https://github.com/spiraldb/fastlanes", rev = "267717cd72e8b6f0ed0e5321ae3fc785fa433058" }
10 changes: 9 additions & 1 deletion encodings/fastlanes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,13 @@ rand = { workspace = true }
rstest = { workspace = true }
vortex-alp = { path = "../alp" }
vortex-array = { workspace = true, features = ["_test-harness"] }
vortex-fastlanes = { path = ".", features = ["_test-harness"] }
vortex-fastlanes = { path = ".", features = ["_test-harness", "unstable_encodings"] }

[features]
_test-harness = ["dep:rand"]
# Unstable encodings/decoders with no stability guarantee. Enables the fused
# delta(for(bitpacking)) decode kernel from the fastlanes crate.
unstable_encodings = ["fastlanes/delta_for_bitpacking"]

[[bench]]
name = "bitpacking_take"
Expand All @@ -64,6 +67,11 @@ required-features = ["_test-harness"]
name = "bitpack_compare"
harness = false

[[bench]]
name = "delta_for_bitpack"
harness = false
required-features = ["unstable_encodings", "_test-harness"]

[[bench]]
name = "cast_bitpacked"
harness = false
Expand Down
117 changes: 117 additions & 0 deletions encodings/fastlanes/benches/delta_for_bitpack.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! A/B decode of a `delta(for(bitpacking))` column, both arms going through the real Vortex decode
//! entry points on the *same* array:
//! * `fused` — `delta_decompress` with the fused `unfor_undelta_pack` fast path.
//! * `current` — `delta_decompress_generic`, the path Vortex took before the fused kernel:
//! materialize the FoR(bitpacked) deltas child, then un-delta + untranspose.
//!
//! The input columns are monotone non-decreasing (increasing, but not strictly), the shape that
//! suits a delta(for(bitpacking)) stack.
//!
//! Run with `cargo bench -p vortex-fastlanes --bench delta_for_bitpack
//! --features unstable_encodings,_test-harness`.

#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

use divan::Bencher;
use divan::counter::ItemsCount;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::LEGACY_SESSION;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::primitive::PrimitiveArrayExt;
use vortex_array::match_each_unsigned_integer_ptype;
use vortex_fastlanes::Delta;
use vortex_fastlanes::DeltaArray;
use vortex_fastlanes::FoR;
use vortex_fastlanes::FoRArrayExt;
use vortex_fastlanes::bitpack_compress::bitpack_encode;
use vortex_fastlanes::delta_compress;
use vortex_fastlanes::delta_decompress;
use vortex_fastlanes::delta_decompress_generic;

// Benchmark entry point: delegates to `divan::main`. The `[[bench]]` target for this file sets
// `harness = false`, so this `main` is the binary's entry point.
fn main() {
divan::main();
}

// Input lengths passed to every benchmark via `args = LENS` (64 Ki and 1 Mi elements).
// Both are exact multiples of 1024 so the deltas bit-pack without a zero-padding wrap.
const LENS: &[usize] = &[64 * 1024, 1024 * 1024];

/// Assemble the `delta(for(bitpacking))` stack for `values`.
///
/// The input is delta-encoded, the deltas are FoR-encoded, and the FoR residuals are bit-packed
/// at the narrowest width that fits the largest residual. Returns the resulting [`DeltaArray`],
/// the number of input elements, and the execution context that was used to build it.
///
/// Panics (via `unwrap`) if any encoding step fails; this is benchmark setup code.
fn build(values: PrimitiveArray) -> (DeltaArray, usize, ExecutionCtx) {
    let len = values.len();
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // Delta layer: bases plus the deltas that feed the FoR layer.
    let (bases, deltas) = delta_compress(&values, &mut ctx).unwrap();

    // FoR layer: remember the reference and materialize the residuals as a primitive array.
    let for_deltas = FoR::encode(deltas).unwrap();
    let reference = for_deltas.reference_scalar().clone();
    let residuals = for_deltas
        .encoded()
        .clone()
        .execute::<PrimitiveArray>(&mut ctx)
        .unwrap();

    // Bit-packing layer: the narrowest width that still holds the largest residual, so
    // bit-packing introduces no patches.
    let unsigned = residuals.ptype().to_unsigned();
    let bit_width = match_each_unsigned_integer_ptype!(unsigned, |T| {
        let as_unsigned = residuals.reinterpret_cast(unsigned);
        let widest = as_unsigned
            .as_slice::<T>()
            .iter()
            .copied()
            .max()
            .unwrap_or_default();
        let used_bits = T::BITS - widest.leading_zeros();
        used_bits as u8
    });
    let bitpacked = bitpack_encode(&residuals, bit_width, None, &mut ctx).unwrap();

    // Re-wrap the packed residuals in FoR, then in Delta, to form the full stack.
    let for_child = FoR::try_new(bitpacked.into_array(), reference)
        .unwrap()
        .into_array();
    let stack = Delta::try_new(bases.into_array(), for_child, 0, len).unwrap();

    (stack, len, ctx)
}

/// Produce a `u32` column of `len` values where element `i` is `i / 4`, i.e. a monotone
/// non-decreasing sequence in which every value repeats four times.
fn u32_non_decreasing(len: usize) -> PrimitiveArray {
    let upper = len as u32;
    let values = (0..upper).map(|i| i / 4);
    PrimitiveArray::from_iter(values)
}

/// Produce a `u64` column of `len` values where element `i` is `(i / 6) * 3`, i.e. a monotone
/// non-decreasing sequence that steps by 3 after every run of six equal values.
fn u64_non_decreasing(len: usize) -> PrimitiveArray {
    let upper = len as u64;
    let values = (0..upper).map(|i| (i / 6) * 3);
    PrimitiveArray::from_iter(values)
}

/// `fused` arm, `u32` input: decode the stack through `delta_decompress`.
#[divan::bench(args = LENS)]
fn fused_u32(bencher: Bencher, len: usize) {
    let (stack, items, mut ctx) = build(u32_non_decreasing(len));
    // Report throughput in decoded items per iteration.
    let bencher = bencher.counter(ItemsCount::new(items));
    bencher.bench_local(|| delta_decompress(&stack, &mut ctx).unwrap());
}

/// `current` arm, `u32` input: decode the same stack through `delta_decompress_generic`.
#[divan::bench(args = LENS)]
fn current_u32(bencher: Bencher, len: usize) {
    let (stack, items, mut ctx) = build(u32_non_decreasing(len));
    // Report throughput in decoded items per iteration.
    let bencher = bencher.counter(ItemsCount::new(items));
    bencher.bench_local(|| delta_decompress_generic(&stack, &mut ctx).unwrap());
}

/// `fused` arm, `u64` input: decode the stack through `delta_decompress`.
#[divan::bench(args = LENS)]
fn fused_u64(bencher: Bencher, len: usize) {
    let (stack, items, mut ctx) = build(u64_non_decreasing(len));
    // Report throughput in decoded items per iteration.
    let bencher = bencher.counter(ItemsCount::new(items));
    bencher.bench_local(|| delta_decompress(&stack, &mut ctx).unwrap());
}

/// `current` arm, `u64` input: decode the same stack through `delta_decompress_generic`.
#[divan::bench(args = LENS)]
fn current_u64(bencher: Bencher, len: usize) {
    let (stack, items, mut ctx) = build(u64_non_decreasing(len));
    // Report throughput in decoded items per iteration.
    let bencher = bencher.counter(ItemsCount::new(items));
    bencher.bench_local(|| delta_decompress_generic(&stack, &mut ctx).unwrap());
}
68 changes: 68 additions & 0 deletions encodings/fastlanes/src/delta/array/delta_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,20 +105,88 @@ mod tests {
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::primitive::PrimitiveArrayExt;
use vortex_array::assert_arrays_eq;
#[cfg(feature = "unstable_encodings")]
use vortex_array::match_each_unsigned_integer_ptype;
use vortex_array::session::ArraySession;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_session::VortexSession;

use crate::Delta;
#[cfg(feature = "unstable_encodings")]
use crate::FoR;
use crate::bitpack_compress::bitpack_encode;
use crate::delta::array::delta_decompress::delta_decompress;
use crate::delta_compress;
#[cfg(feature = "unstable_encodings")]
use crate::r#for::FoRArrayExt;

static SESSION: LazyLock<VortexSession> =
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());

/// Wrap `array` in a `delta(for(bitpacking))` stack: delta-encode it, FoR-encode the deltas, and
/// bit-pack the FoR residuals. This is the tree shape the fused decode path in
/// `delta_decompress` is meant to recognize.
///
/// Any failure from the individual encoding steps is propagated to the caller.
#[cfg(feature = "unstable_encodings")]
fn build_delta_for_bitpacked(
    array: &PrimitiveArray,
    ctx: &mut vortex_array::ExecutionCtx,
) -> VortexResult<crate::DeltaArray> {
    let (bases, deltas) = delta_compress(array, ctx)?;

    // FoR layer: keep the reference and materialize the residuals as a primitive array.
    let for_deltas = FoR::encode(deltas)?;
    let reference = for_deltas.reference_scalar().clone();
    let residuals = for_deltas
        .encoded()
        .clone()
        .execute::<PrimitiveArray>(ctx)?;

    // Use the narrowest width that still holds the largest residual, so bit-packing introduces
    // no patches and the array stays on the fused decode path.
    let unsigned = residuals.ptype().to_unsigned();
    let bit_width = match_each_unsigned_integer_ptype!(unsigned, |T| {
        let as_unsigned = residuals.reinterpret_cast(unsigned);
        let widest = as_unsigned
            .as_slice::<T>()
            .iter()
            .copied()
            .max()
            .unwrap_or_default();
        let used_bits = T::BITS - widest.leading_zeros();
        used_bits as u8
    });

    let bitpacked = bitpack_encode(&residuals, bit_width, None, ctx)?;
    let fused_for = FoR::try_new(bitpacked.into_array(), reference)?;

    let bases_child = bases.into_array();
    let deltas_child = fused_for.into_array();
    Delta::try_new(bases_child, deltas_child, 0, array.len())
}

/// Round-trips monotone non-decreasing integer columns through a `delta(for(bitpacking))` stack
/// and checks that decoding goes through the fused path.
///
/// Runs of equal values make many deltas zero, so the per-lane FoR reference over the deltas is
/// small and the deltas bit-pack tightly, which is the shape that yields this stack.
///
/// Every case has a length that is an exact multiple of 1024, so there is no zero-padding tail.
/// (Padding can make a lane straddle the real/zero boundary, producing a wrapping delta that
/// forces full width.)
#[cfg(feature = "unstable_encodings")]
#[rstest]
#[case::u32_non_decreasing((0u32..20_480).map(|i| i / 3).collect())]
#[case::u64_non_decreasing((0u64..20_480).map(|i| (i / 5) * 2).collect())]
#[case::u32_long_runs((0u32..20_480).map(|i| i / 100).collect())]
fn fused_for_bitpacking_roundtrip(#[case] array: PrimitiveArray) -> VortexResult<()> {
    use crate::delta::array::delta_decompress::try_fused_for_bitpacking;

    let mut ctx = SESSION.create_execution_ctx();
    let stack = build_delta_for_bitpacked(&array, &mut ctx)?;

    // Guard against a silent fallback to the generic decoder: the fused path must accept the
    // stack.
    let took_fused_path = try_fused_for_bitpacking(&stack, &mut ctx)?.is_some();
    assert!(
        took_fused_path,
        "delta(for(bitpacking)) must be recognized by the fused decode path"
    );

    // Full decode must reproduce the original column.
    let decompressed = stack.into_array().execute::<PrimitiveArray>(&mut ctx)?;
    assert_arrays_eq!(decompressed, array);
    Ok(())
}

#[rstest]
#[case::u32((0u32..10_000).collect())]
#[case::u8((0..10_000).map(|i| (i % (u8::MAX as i32)) as u8).collect())]
Expand Down
Loading
Loading