From 9e26f484e79be985df3b7dbdb03ddde46a9d50bd Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 2 Jun 2026 10:58:58 +0000 Subject: [PATCH 1/3] u Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/builder.rs | 3 + vortex-btrblocks/src/schemes/integer/delta.rs | 177 ++++++++++++++++++ vortex-btrblocks/src/schemes/integer/mod.rs | 4 + .../schemes/integer/scheme_selection_tests.rs | 61 +++++- 4 files changed, 244 insertions(+), 1 deletion(-) create mode 100644 vortex-btrblocks/src/schemes/integer/delta.rs diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 61c40341dbc..61a88b21f2c 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -41,6 +41,9 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ &integer::RunEndScheme, &integer::SequenceScheme, &integer::IntRLEScheme, + // Prefer all other schemes above delta, for now (since it's slower to decompress). + #[cfg(feature = "unstable_encodings")] + &integer::DeltaScheme, //////////////////////////////////////////////////////////////////////////////////////////////// // Float schemes. //////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/vortex-btrblocks/src/schemes/integer/delta.rs b/vortex-btrblocks/src/schemes/integer/delta.rs new file mode 100644 index 00000000000..d356f9fe1fe --- /dev/null +++ b/vortex-btrblocks/src/schemes/integer/delta.rs @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! FastLanes Delta integer encoding. 
+ +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_compressor::builtins::BinaryDictScheme; +use vortex_compressor::builtins::FloatDictScheme; +use vortex_compressor::builtins::IntDictScheme; +use vortex_compressor::builtins::StringDictScheme; +use vortex_compressor::estimate::CompressionEstimate; +use vortex_compressor::estimate::DeferredEstimate; +use vortex_compressor::estimate::EstimateScore; +use vortex_compressor::estimate::EstimateVerdict; +use vortex_compressor::scheme::AncestorExclusion; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; +use vortex_error::VortexResult; +use vortex_fastlanes::Delta; + +use crate::ArrayAndStats; +use crate::CascadingCompressor; +use crate::CompressorContext; +use crate::GenerateStatsOptions; +use crate::Scheme; +use crate::SchemeExt; + +/// FastLanes Delta encoding for smooth / near-monotone integers. +/// +/// Delta replaces each value with its difference from an earlier value (at the FastLanes lane +/// stride), so a later cascade layer (FoR / BitPacking) packs the smaller residuals. It only +/// pays off when those residuals span meaningfully fewer bits than the values themselves. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct DeltaScheme; + +/// Multiplicative penalty applied to Delta's estimated compression ratio. +/// +/// Unlike FoR/BitPacking, Delta breaks random access and adds a prefix-sum decode pass, and it +/// carries a structural sign bit on its residuals. We therefore require Delta to be meaningfully +/// (~10%) smaller than the best alternative before it wins, rather than picking it for a +/// single-bit gain. This factor encodes that "delta tax". +const DELTA_PENALTY: f64 = 0.9; + +/// Minimum length before Delta is worth considering (one FastLanes chunk). 
+const MIN_DELTA_LEN: usize = 1024; + +impl Scheme for DeltaScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.delta" + } + + fn matches(&self, canonical: &Canonical) -> bool { + canonical.dtype().is_int() + } + + fn num_children(&self) -> usize { + 2 + } + + /// Delta-encode the data at most once per path: exclude Delta from the subtrees of both the + /// bases and the deltas children so we never delta-encode data that was already delta-encoded. + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: DeltaScheme.id(), + children: ChildSelection::All, + }] + } + + /// Delta over dictionary codes just adds indirection: codes are compact integers with no + /// monotone structure, so (like FoR/Sequence) skip the codes child. + fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: BinaryDictScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + + fn expected_compression_ratio( + &self, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + // Delta only pays off if a later cascade layer (FoR/BitPacking) packs the residuals. + if compress_ctx.finished_cascading() { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); + } + // Too short to transpose into FastLanes chunks meaningfully. + if data.array_len() < MIN_DELTA_LEN { + return CompressionEstimate::Verdict(EstimateVerdict::Skip); + } + + // Estimating Delta needs the real transposed-delta span, so defer to a callback that + // delta-encodes the array and measures the residual range. 
+ CompressionEstimate::Deferred(DeferredEstimate::Callback(Box::new( + |_compressor, data, best_so_far, _ctx, exec_ctx| { + let primitive = data.array().clone().execute::(exec_ctx)?; + let full_width = primitive.ptype().bit_width() as f64; + + // Delta's best case is residuals collapsing to a single bit. If even that, after + // the penalty, can't beat the incumbent, skip before doing the encode work. + let threshold = best_so_far.and_then(EstimateScore::finite_ratio); + if threshold.is_some_and(|t| full_width * DELTA_PENALTY <= t) { + return Ok(EstimateVerdict::Skip); + } + + // Measure the actual FastLanes transposed-delta span. This is the lane-stride + // difference that gets bit-packed, not the lag-1 difference (which the transpose + // makes optimistic), so it is what truly drives the compressed size. + let (_bases, deltas) = vortex_fastlanes::delta_compress(&primitive, exec_ctx)?; + let delta_stats = + ArrayAndStats::new(deltas.into_array(), GenerateStatsOptions::default()); + let span = delta_stats.integer_stats(exec_ctx).erased().max_minus_min(); + + // Bits needed to FoR-pack the residuals. A zero span means constant deltas, which + // SequenceScheme already captures more cheaply, so defer to it. 
+ let delta_bits = match span.checked_ilog2() { + Some(l) => (l + 1) as f64, + None => return Ok(EstimateVerdict::Skip), + }; + + let ratio = full_width / delta_bits * DELTA_PENALTY; + if ratio <= 1.0 { + return Ok(EstimateVerdict::Skip); + } + Ok(EstimateVerdict::Ratio(ratio)) + }, + ))) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let primitive = data.array().clone().execute::(exec_ctx)?; + let len = primitive.len(); + let (bases, deltas) = vortex_fastlanes::delta_compress(&primitive, exec_ctx)?; + + let compressed_bases = compressor.compress_child( + &bases.into_array(), + &compress_ctx, + self.id(), + 0, + exec_ctx, + )?; + let compressed_deltas = compressor.compress_child( + &deltas.into_array(), + &compress_ctx, + self.id(), + 1, + exec_ctx, + )?; + + Delta::try_new(compressed_bases, compressed_deltas, 0, len).map(IntoArray::into_array) + } +} diff --git a/vortex-btrblocks/src/schemes/integer/mod.rs b/vortex-btrblocks/src/schemes/integer/mod.rs index aed29f1ad3d..abe5868f5c8 100644 --- a/vortex-btrblocks/src/schemes/integer/mod.rs +++ b/vortex-btrblocks/src/schemes/integer/mod.rs @@ -4,6 +4,8 @@ //! Integer compression schemes. 
mod bitpacking; +#[cfg(feature = "unstable_encodings")] +mod delta; mod for_; mod rle; mod runend; @@ -15,6 +17,8 @@ mod zigzag; mod pco; pub use bitpacking::BitPackingScheme; +#[cfg(feature = "unstable_encodings")] +pub use delta::DeltaScheme; pub use for_::FoRScheme; #[cfg(feature = "pco")] pub use pco::PcoScheme; diff --git a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs index 2e0fb269fda..993827d2057 100644 --- a/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs +++ b/vortex-btrblocks/src/schemes/integer/scheme_selection_tests.rs @@ -143,7 +143,11 @@ fn test_sequence_compressed() -> VortexResult<()> { fn test_rle_compressed() -> VortexResult<()> { let mut values: Vec = Vec::new(); for i in 0..1024 { - values.extend(iter::repeat_n(i, 10)); + // Scramble the per-run value so the data is run-length-dominant but not monotone: this + // keeps RunEnd the winner instead of Delta (whose residuals would be small on a smooth + // ramp). + let v = (i as u32).wrapping_mul(2_654_435_761) as i32; + values.extend(iter::repeat_n(v, 10)); } let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); let btr = BtrBlocksCompressor::default(); @@ -152,3 +156,58 @@ fn test_rle_compressed() -> VortexResult<()> { assert!(compressed.is::()); Ok(()) } + +/// A strictly-increasing column with small, irregular steps: not a perfect arithmetic sequence +/// (so Sequence skips), all-unique with no runs (so RunEnd/Dict skip), and a wide absolute range. +/// Delta's residuals are far smaller than the FoR span, so Delta should win and round-trip, and +/// it must appear at most once in the tree. 
+#[cfg(feature = "unstable_encodings")] +#[test] +fn test_delta_compressed() -> VortexResult<()> { + use vortex_array::assert_arrays_eq; + use vortex_fastlanes::Delta; + + let mut rng = StdRng::seed_from_u64(7u64); + let mut value = 500_000i32; + let values: Vec = (0..4096) + .map(|_| { + value += 1 + (rng.next_u32() % 6) as i32; + value + }) + .collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress( + &array.clone().into_array(), + &mut SESSION.create_execution_ctx(), + )?; + assert!( + compressed.is::(), + "expected Delta, got tree:\n{}", + compressed.display_tree() + ); + // Delta must appear at most once per tree: no Delta node may be nested under another. + assert!( + !has_nested_delta(&compressed, false), + "Delta was applied more than once in the tree:\n{}", + compressed.display_tree() + ); + assert_arrays_eq!(compressed, array.into_array()); + Ok(()) +} + +/// Returns true if any `Delta` array appears below an ancestor `Delta` in the tree. +#[cfg(feature = "unstable_encodings")] +fn has_nested_delta(array: &vortex_array::ArrayRef, under_delta: bool) -> bool { + use vortex_fastlanes::Delta; + + let is_delta = array.is::(); + if is_delta && under_delta { + return true; + } + array + .children() + .iter() + .any(|child| has_nested_delta(child, under_delta || is_delta)) +} From ffa701a3b20d2482435ac3b7c73ee92f8dabdeed Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 11:42:11 +0000 Subject: [PATCH 2/3] Fix CI for Delta scheme: large-file test data and GPU-incompatible exclusion Enabling Delta in the default scheme set under `--all-features` (which turns on `unstable_encodings`) changed two things that CI caught: - `vortex-file open::tests::test_initial_read_size` built a "large" file from an alternating `i / -i` pattern, which Delta compresses to ~400KB and trips the `> 1MB` assertion. 
Switch to high-entropy pseudo-random values so the file stays large under any encoding. This was the only failure in both the coverage and CUDA test jobs. - `only_cuda_compatible()` must not emit Delta: there is no GPU decode kernel for it and its prefix-sum decode is inherently sequential, so exclude it from the CUDA-compatible scheme set, like RLE/Sparse/FSST. Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/builder.rs | 4 ++++ vortex-file/src/open.rs | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 61a88b21f2c..eb2fc743bbd 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -197,6 +197,10 @@ impl BtrBlocksCompressorBuilder { ]; #[cfg(feature = "unstable_encodings")] excluded.push(string::OnPairScheme.id()); + // Delta has no GPU decode kernel and its prefix-sum decode is inherently sequential, so it + // is incompatible with pure-GPU decompression paths. + #[cfg(feature = "unstable_encodings")] + excluded.push(integer::DeltaScheme.id()); let builder = self.exclude_schemes(excluded); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] diff --git a/vortex-file/src/open.rs b/vortex-file/src/open.rs index 8fb0f22827e..fd71cc9a0de 100644 --- a/vortex-file/src/open.rs +++ b/vortex-file/src/open.rs @@ -428,10 +428,16 @@ mod tests { // Create a large file (> 1MB) let mut buf = ByteBufferMut::empty(); - // 1.5M integers -> ~6MB. We use a pattern to avoid Sequence encoding. + // 1.5M integers -> ~6MB. We use high-entropy (pseudo-random) values so the data does not + // compress well under any encoding (Sequence, RunEnd, Delta, ...), keeping the written + // file comfortably above 1MB. 
+ let mut state = 0x9E37_79B9u32; let array = Buffer::from( (0i32..1_500_000) - .map(|i| if i % 2 == 0 { i } else { -i }) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state as i32 + }) .collect::>(), ) .into_array(); From 5ecbcb7aa3a7518abfd97a6edee5527f2779ad9e Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 16:35:43 +0000 Subject: [PATCH 3/3] Reduce Delta selection penalty (delta tax 10% -> 5%) Lower DELTA_PENALTY from 0.9 to 0.95 so Delta is chosen more readily, requiring it to be only ~5% smaller than the best alternative rather than ~10%. This widens Delta's applicability for benchmarking its effect on query performance. Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/schemes/integer/delta.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-btrblocks/src/schemes/integer/delta.rs b/vortex-btrblocks/src/schemes/integer/delta.rs index d356f9fe1fe..064d39690a9 100644 --- a/vortex-btrblocks/src/schemes/integer/delta.rs +++ b/vortex-btrblocks/src/schemes/integer/delta.rs @@ -41,9 +41,9 @@ pub struct DeltaScheme; /// /// Unlike FoR/BitPacking, Delta breaks random access and adds a prefix-sum decode pass, and it /// carries a structural sign bit on its residuals. We therefore require Delta to be meaningfully -/// (~10%) smaller than the best alternative before it wins, rather than picking it for a +/// (~5%) smaller than the best alternative before it wins, rather than picking it for a /// single-bit gain. This factor encodes that "delta tax". -const DELTA_PENALTY: f64 = 0.9; +const DELTA_PENALTY: f64 = 0.95; /// Minimum length before Delta is worth considering (one FastLanes chunk). const MIN_DELTA_LEN: usize = 1024;