From 24ec51dfde1510e48852bf02f364d76bf4771dd5 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 09:43:04 +0000 Subject: [PATCH 01/23] Add FSSTView encoding: a ListView-style FSST array FSSTView addresses its FSST-compressed codes with separate `offsets` and `sizes` arrays (like ListView) instead of FSST's single monotonic offsets array (like List/VarBin). Decoupling start from length means offsets need not be monotonic or contiguous, so filter/take/slice become metadata-only: they rewrite only the small offsets/sizes/lengths/validity arrays and reuse the compressed byte heap and symbol table untouched. This avoids the heap rewrite that plain FSST incurs on filter/take (which delegate to VarBin), giving the same speed win ListView has over List. - New `vortex.fsstview` encoding in the fsst crate, reusing FSSTData for the symbol table + compressed byte heap. Children are declared with the `#[array_slots(FSSTView)]` proc macro (uncompressed_lengths, codes_offsets, codes_sizes, codes_validity). - Metadata-only FilterKernel, TakeExecute, and SliceReduce. - scalar_at decodes a single element via its offset+size slice. - Canonicalization gathers the live codes (possibly out-of-order) and bulk-decompresses into a VarBinView. - `fsstview_from_fsst` zero-copy conversion from an FSST array. - Registered in `register_default_encodings`. - Tests: canonical/filter/take/slice equivalence vs FSST, scalar_at, and filter/take/consistency conformance for nullable and non-nullable data. 
Signed-off-by: Joe Isaacs --- encodings/fsst/src/fsstview/array.rs | 459 +++++++++++++++++++++++ encodings/fsst/src/fsstview/canonical.rs | 103 +++++ encodings/fsst/src/fsstview/compute.rs | 104 +++++ encodings/fsst/src/fsstview/kernel.rs | 13 + encodings/fsst/src/fsstview/mod.rs | 29 ++ encodings/fsst/src/fsstview/ops.rs | 29 ++ encodings/fsst/src/fsstview/rules.rs | 10 + encodings/fsst/src/fsstview/slice.rs | 36 ++ encodings/fsst/src/fsstview/tests.rs | 178 +++++++++ encodings/fsst/src/lib.rs | 2 + vortex-file/src/lib.rs | 2 + 11 files changed, 965 insertions(+) create mode 100644 encodings/fsst/src/fsstview/array.rs create mode 100644 encodings/fsst/src/fsstview/canonical.rs create mode 100644 encodings/fsst/src/fsstview/compute.rs create mode 100644 encodings/fsst/src/fsstview/kernel.rs create mode 100644 encodings/fsst/src/fsstview/mod.rs create mode 100644 encodings/fsst/src/fsstview/ops.rs create mode 100644 encodings/fsst/src/fsstview/rules.rs create mode 100644 encodings/fsst/src/fsstview/slice.rs create mode 100644 encodings/fsst/src/fsstview/tests.rs diff --git a/encodings/fsst/src/fsstview/array.rs b/encodings/fsst/src/fsstview/array.rs new file mode 100644 index 00000000000..81b83506aa3 --- /dev/null +++ b/encodings/fsst/src/fsstview/array.rs @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use fsst::Symbol; +use prost::Message as _; +use vortex_array::Array; +use vortex_array::ArrayId; +use vortex_array::ArrayParts; +use vortex_array::ArrayRef; +use vortex_array::ArraySlots; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::ExecutionResult; +use vortex_array::IntoArray; +use vortex_array::TypedArrayRef; +use vortex_array::array_slots; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::varbin::VarBinArrayExt; +use vortex_array::buffer::BufferHandle; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use 
vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; +use vortex_array::serde::ArrayChildren; +use vortex_array::smallvec::smallvec; +use vortex_array::validity::Validity; +use vortex_array::vtable::VTable; +use vortex_array::vtable::ValidityVTable; +use vortex_array::vtable::child_to_validity; +use vortex_array::vtable::validity_to_child; +use vortex_buffer::Buffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_error::vortex_panic; +use vortex_session::VortexSession; +use vortex_session::registry::CachedId; + +use crate::FSSTArray; +use crate::FSSTArrayExt; +// `FSSTView` reuses the exact same inner data representation as `FSST`: the symbol table plus +// the raw compressed byte heap. Only the *addressing* of that heap differs (offsets + sizes +// instead of monotonic offsets), and that addressing lives entirely in the array's slots. +use crate::array::FSSTData; +use crate::fsstview::canonical::canonicalize_fsstview; +use crate::fsstview::kernel::PARENT_KERNELS; +use crate::fsstview::rules::RULES; + +/// An [`FSSTView`]-encoded Vortex array. +pub type FSSTViewArray = Array; + +/// The [`FSSTView`] encoding: a ListView-style FSST array. +#[derive(Clone, Debug)] +pub struct FSSTView; + +/// The child slots of an [`FSSTView`] array. +/// +/// Declared with the [`array_slots`] proc macro, which generates the slot-index constants +/// (`FSSTViewSlots::CODES_OFFSETS`, ...), the borrowed [`FSSTViewSlotsView`] struct, and the +/// typed accessor trait [`FSSTViewArraySlotsExt`] (`.uncompressed_lengths()`, +/// `.codes_offsets()`, `.codes_sizes()`, `.codes_validity()`). +#[array_slots(FSSTView)] +pub struct FSSTViewSlots { + /// Length of each original (uncompressed) value. Non-nullable integer. + pub uncompressed_lengths: ArrayRef, + /// Start offset of each element's compressed bytecodes within the code heap. Non-nullable + /// integer. 
Unlike `FSST`, these are **not** required to be monotonic or contiguous. + pub codes_offsets: ArrayRef, + /// Length in bytes of each element's compressed bytecodes within the code heap. Non-nullable + /// integer. + pub codes_sizes: ArrayRef, + /// Optional validity bitmap for the codes. Absent when the array is non-nullable. + pub codes_validity: Option, +} + +#[derive(Clone, prost::Message)] +pub struct FSSTViewMetadata { + #[prost(enumeration = "PType", tag = "1")] + uncompressed_lengths_ptype: i32, + #[prost(enumeration = "PType", tag = "2")] + codes_offsets_ptype: i32, + #[prost(enumeration = "PType", tag = "3")] + codes_sizes_ptype: i32, +} + +impl FSSTViewMetadata { + fn get_uncompressed_lengths_ptype(&self) -> VortexResult { + PType::try_from(self.uncompressed_lengths_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.uncompressed_lengths_ptype)) + } + + fn get_codes_offsets_ptype(&self) -> VortexResult { + PType::try_from(self.codes_offsets_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.codes_offsets_ptype)) + } + + fn get_codes_sizes_ptype(&self) -> VortexResult { + PType::try_from(self.codes_sizes_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.codes_sizes_ptype)) + } +} + +impl FSSTView { + /// Build an [`FSSTViewArray`] from its decomposed components. + /// + /// `codes_offsets[i]` and `codes_sizes[i]` address element `i`'s compressed bytecodes inside + /// `codes_bytes`. The offsets do not need to be sorted, contiguous, or non-overlapping. 
+ #[allow(clippy::too_many_arguments)] + pub fn try_new( + dtype: DType, + symbols: Buffer, + symbol_lengths: Buffer, + codes_bytes: BufferHandle, + codes_offsets: ArrayRef, + codes_sizes: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + ) -> VortexResult { + let len = codes_offsets.len(); + validate_fsstview( + &symbols, + &symbol_lengths, + &codes_offsets, + &codes_sizes, + &uncompressed_lengths, + &validity, + &dtype, + len, + )?; + let data = FSSTData::try_new(symbols, symbol_lengths, codes_bytes, len)?; + let slots = make_slots( + uncompressed_lengths, + codes_offsets, + codes_sizes, + &validity, + len, + ); + Ok(unsafe { + Array::from_parts_unchecked( + ArrayParts::new(FSSTView, dtype, len, data).with_slots(slots), + ) + }) + } + + /// Build an [`FSSTViewArray`] without validation. + /// + /// # Safety + /// + /// The caller must uphold the same invariants validated by [`FSSTView::try_new`]. + #[allow(clippy::too_many_arguments)] + pub(crate) unsafe fn new_unchecked( + dtype: DType, + symbols: Buffer, + symbol_lengths: Buffer, + codes_bytes: BufferHandle, + codes_offsets: ArrayRef, + codes_sizes: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + ) -> FSSTViewArray { + let len = codes_offsets.len(); + let data = unsafe { FSSTData::new_unchecked(symbols, symbol_lengths, codes_bytes, len) }; + let slots = make_slots( + uncompressed_lengths, + codes_offsets, + codes_sizes, + &validity, + len, + ); + unsafe { + Array::from_parts_unchecked( + ArrayParts::new(FSSTView, dtype, len, data).with_slots(slots), + ) + } + } +} + +/// Convert a plain [`FSSTArray`] into an [`FSSTViewArray`], sharing the symbol table and the +/// compressed byte heap (zero-copy) and deriving `sizes[i] = offsets[i + 1] - offsets[i]`. 
+pub fn fsstview_from_fsst(fsst: &FSSTArray, ctx: &mut ExecutionCtx) -> VortexResult { + let codes = fsst.codes(); + let validity = codes.validity()?; + let offsets = codes.offsets().clone().execute::(ctx)?; + + let (codes_offsets, codes_sizes) = match_each_integer_ptype!(offsets.ptype(), |O| { + let offsets = offsets.as_slice::(); + let len = offsets.len().saturating_sub(1); + let mut starts = Vec::with_capacity(len); + let mut sizes = Vec::with_capacity(len); + for i in 0..len { + starts.push(offsets[i]); + sizes.push(offsets[i + 1] - offsets[i]); + } + ( + PrimitiveArray::from_iter(starts).into_array(), + PrimitiveArray::from_iter(sizes).into_array(), + ) + }); + + FSSTView::try_new( + fsst.dtype().clone(), + fsst.symbols().clone(), + fsst.symbol_lengths().clone(), + fsst.codes_bytes_handle().clone(), + codes_offsets, + codes_sizes, + fsst.uncompressed_lengths().clone(), + validity, + ) +} + +fn make_slots( + uncompressed_lengths: ArrayRef, + codes_offsets: ArrayRef, + codes_sizes: ArrayRef, + validity: &Validity, + len: usize, +) -> ArraySlots { + smallvec![ + Some(uncompressed_lengths), + Some(codes_offsets), + Some(codes_sizes), + validity_to_child(validity, len), + ] +} + +#[allow(clippy::too_many_arguments)] +fn validate_fsstview( + symbols: &Buffer, + symbol_lengths: &Buffer, + codes_offsets: &ArrayRef, + codes_sizes: &ArrayRef, + uncompressed_lengths: &ArrayRef, + validity: &Validity, + dtype: &DType, + len: usize, +) -> VortexResult<()> { + vortex_ensure!( + matches!(dtype, DType::Binary(_) | DType::Utf8(_)), + "FSSTView arrays must be Binary or Utf8, found {dtype}" + ); + if symbols.len() > 255 { + vortex_bail!(InvalidArgument: "symbols array must have length <= 255"); + } + if symbols.len() != symbol_lengths.len() { + vortex_bail!(InvalidArgument: "symbols and symbol_lengths arrays must have same length"); + } + if codes_offsets.len() != len { + vortex_bail!(InvalidArgument: "codes_offsets must have same len as outer array"); + } + if codes_sizes.len() 
!= len { + vortex_bail!(InvalidArgument: "codes_sizes must have same len as outer array"); + } + if uncompressed_lengths.len() != len { + vortex_bail!(InvalidArgument: "uncompressed_lengths must have same len as outer array"); + } + if !codes_offsets.dtype().is_int() || codes_offsets.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes_offsets must be non-nullable integer, found {}", codes_offsets.dtype()); + } + if !codes_sizes.dtype().is_int() || codes_sizes.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes_sizes must be non-nullable integer, found {}", codes_sizes.dtype()); + } + if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "uncompressed_lengths must be non-nullable integer, found {}", uncompressed_lengths.dtype()); + } + if validity.nullability() != dtype.nullability() { + vortex_bail!(InvalidArgument: "validity nullability must match outer dtype nullability"); + } + Ok(()) +} + +/// Typed accessors for [`FSSTViewArray`] that aren't covered by the [`array_slots`] macro. +pub trait FSSTViewArrayExt: TypedArrayRef { + /// The validity of the array, derived from the `codes_validity` slot. 
+ fn fsstview_validity(&self) -> Validity { + child_to_validity( + self.as_ref().slots()[FSSTViewSlots::CODES_VALIDITY].as_ref(), + self.as_ref().dtype().nullability(), + ) + } +} + +impl> FSSTViewArrayExt for T {} + +impl VTable for FSSTView { + type TypedArrayData = FSSTData; + type OperationsVTable = Self; + type ValidityVTable = Self; + + fn id(&self) -> ArrayId { + static ID: CachedId = CachedId::new("vortex.fsstview"); + *ID + } + + fn validate( + &self, + data: &Self::TypedArrayData, + dtype: &DType, + len: usize, + slots: &[Option], + ) -> VortexResult<()> { + let view = FSSTViewSlotsView::from_slots(slots); + let validity = child_to_validity(view.codes_validity, dtype.nullability()); + validate_fsstview( + data.symbols(), + data.symbol_lengths(), + view.codes_offsets, + view.codes_sizes, + view.uncompressed_lengths, + &validity, + dtype, + len, + ) + } + + fn nbuffers(_array: ArrayView<'_, Self>) -> usize { + 3 + } + + fn buffer(array: ArrayView<'_, Self>, idx: usize) -> BufferHandle { + match idx { + 0 => BufferHandle::new_host(array.symbols().clone().into_byte_buffer()), + 1 => BufferHandle::new_host(array.symbol_lengths().clone().into_byte_buffer()), + 2 => array.codes_bytes_handle().clone(), + _ => vortex_panic!("FSSTViewArray buffer index {idx} out of bounds"), + } + } + + fn buffer_name(_array: ArrayView<'_, Self>, idx: usize) -> Option { + match idx { + 0 => Some("symbols".to_string()), + 1 => Some("symbol_lengths".to_string()), + 2 => Some("compressed_codes".to_string()), + _ => vortex_panic!("FSSTViewArray buffer_name index {idx} out of bounds"), + } + } + + fn serialize( + array: ArrayView<'_, Self>, + _session: &VortexSession, + ) -> VortexResult>> { + Ok(Some( + FSSTViewMetadata { + uncompressed_lengths_ptype: PType::try_from(array.uncompressed_lengths().dtype())? + as i32, + codes_offsets_ptype: PType::try_from(array.codes_offsets().dtype())? as i32, + codes_sizes_ptype: PType::try_from(array.codes_sizes().dtype())? 
as i32, + } + .encode_to_vec(), + )) + } + + fn deserialize( + &self, + dtype: &DType, + len: usize, + metadata: &[u8], + buffers: &[BufferHandle], + children: &dyn ArrayChildren, + _session: &VortexSession, + ) -> VortexResult> { + let metadata = FSSTViewMetadata::decode(metadata)?; + if buffers.len() != 3 { + vortex_bail!( + InvalidArgument: "Expected 3 buffers for fsstview, got {}", + buffers.len() + ); + } + let symbols = Buffer::::from_byte_buffer(buffers[0].clone().try_to_host_sync()?); + let symbol_lengths = Buffer::::from_byte_buffer(buffers[1].clone().try_to_host_sync()?); + let codes_bytes = buffers[2].clone(); + + let uncompressed_lengths = children.get( + 0, + &DType::Primitive( + metadata.get_uncompressed_lengths_ptype()?, + Nullability::NonNullable, + ), + len, + )?; + let codes_offsets = children.get( + 1, + &DType::Primitive( + metadata.get_codes_offsets_ptype()?, + Nullability::NonNullable, + ), + len, + )?; + let codes_sizes = children.get( + 2, + &DType::Primitive(metadata.get_codes_sizes_ptype()?, Nullability::NonNullable), + len, + )?; + + let validity = if children.len() == 3 { + Validity::from(dtype.nullability()) + } else if children.len() == 4 { + Validity::Array(children.get(3, &Validity::DTYPE, len)?) 
+ } else { + vortex_bail!("Expected 3 or 4 children, got {}", children.len()); + }; + + validate_fsstview( + &symbols, + &symbol_lengths, + &codes_offsets, + &codes_sizes, + &uncompressed_lengths, + &validity, + dtype, + len, + )?; + + let data = FSSTData::try_new(symbols, symbol_lengths, codes_bytes, len)?; + let slots = make_slots( + uncompressed_lengths, + codes_offsets, + codes_sizes, + &validity, + len, + ); + Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots)) + } + + fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String { + FSSTViewSlots::NAMES[idx].to_string() + } + + fn execute(array: Array, ctx: &mut ExecutionCtx) -> VortexResult { + canonicalize_fsstview(array.as_view(), ctx).map(ExecutionResult::done) + } + + fn execute_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + PARENT_KERNELS.execute(array, parent, child_idx, ctx) + } + + fn reduce_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult> { + RULES.evaluate(array, parent, child_idx) + } +} + +impl ValidityVTable for FSSTView { + fn validity(array: ArrayView<'_, FSSTView>) -> VortexResult { + Ok(child_to_validity( + array.slots()[FSSTViewSlots::CODES_VALIDITY].as_ref(), + array.dtype().nullability(), + )) + } +} diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs new file mode 100644 index 00000000000..5a1df694d2b --- /dev/null +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; +use 
vortex_array::arrays::varbinview::build_views::build_views; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::ByteBufferMut; +use vortex_error::VortexResult; + +use super::array::FSSTView; +use super::array::FSSTViewArrayExt; +use super::array::FSSTViewArraySlotsExt; + +/// Canonicalize an [`FSSTView`] array into a [`VarBinViewArray`]. +/// +/// Because `filter`/`take`/`slice` leave the compressed byte heap untouched, the live codes of +/// element `i` are the (possibly out-of-order, possibly overlapping) slice +/// `codes_bytes[offset_i .. offset_i + size_i]`. We first gather them into element order, then +/// bulk-decompress in a single pass and build the binary views from the uncompressed lengths. +pub(super) fn canonicalize_fsstview( + array: ArrayView<'_, FSSTView>, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let len = array.len(); + let bytes = array.codes_bytes(); + + let offsets = array + .codes_offsets() + .clone() + .execute::(ctx)?; + let sizes = array.codes_sizes().clone().execute::(ctx)?; + let uncompressed_lengths = array + .uncompressed_lengths() + .clone() + .execute::(ctx)?; + + #[expect(clippy::cast_possible_truncation)] + let offsets: Vec = match_each_integer_ptype!(offsets.ptype(), |O| { + offsets + .as_slice::() + .iter() + .map(|o| *o as usize) + .collect() + }); + #[expect(clippy::cast_possible_truncation)] + let sizes: Vec = match_each_integer_ptype!(sizes.ptype(), |S| { + sizes.as_slice::().iter().map(|s| *s as usize).collect() + }); + + // Gather the live compressed bytes into element order. + let total_compressed: usize = sizes.iter().sum(); + let mut compressed = ByteBufferMut::with_capacity(total_compressed); + for i in 0..len { + compressed.extend_from_slice(&bytes[offsets[i]..offsets[i] + sizes[i]]); + } + + #[expect(clippy::cast_possible_truncation)] + let total_size: usize = match_each_integer_ptype!(uncompressed_lengths.ptype(), |P| { + uncompressed_lengths + .as_slice::

() + .iter() + .map(|x| *x as usize) + .sum() + }); + + // Bulk-decompress the gathered heap. We reserve 7 extra bytes because the FSST decoder may + // overrun the output by up to a word. + let decompressor = array.decompressor(); + let mut uncompressed_bytes = ByteBufferMut::with_capacity(total_size + 7); + let written = decompressor.decompress_into( + compressed.as_slice(), + uncompressed_bytes.spare_capacity_mut(), + ); + unsafe { uncompressed_bytes.set_len(written) }; + + let (buffers, views) = match_each_integer_ptype!(uncompressed_lengths.ptype(), |P| { + build_views( + 0, + MAX_BUFFER_LEN, + uncompressed_bytes, + uncompressed_lengths.as_slice::

(), + ) + }); + + // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. + Ok(unsafe { + VarBinViewArray::new_unchecked( + views, + Arc::from(buffers), + array.dtype().clone(), + array.fsstview_validity(), + ) + .into_array() + }) +} diff --git a/encodings/fsst/src/fsstview/compute.rs b/encodings/fsst/src/fsstview/compute.rs new file mode 100644 index 00000000000..d964ba53df2 --- /dev/null +++ b/encodings/fsst/src/fsstview/compute.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Metadata-only `filter` and `take` for [`FSSTView`]. +//! +//! Both operations rewrite only the small `offsets`/`sizes`/`uncompressed_lengths`/`validity` +//! arrays and reuse the compressed byte heap (and symbol table) untouched. This is the core +//! "ListView speed" win over plain [`FSST`][crate::FSST], whose `filter`/`take` delegate to +//! `VarBin` and therefore rewrite the entire compressed heap. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::dict::TakeExecute; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_mask::Mask; + +use super::array::FSSTView; +use super::array::FSSTViewArrayExt; +use super::array::FSSTViewArraySlotsExt; + +impl FilterKernel for FSSTView { + fn filter( + array: ArrayView<'_, Self>, + mask: &Mask, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + // Filter only the addressing arrays; the byte heap and symbol table are reused as-is. 
+ let validity = array.fsstview_validity().filter(mask)?; + let codes_offsets = array.codes_offsets().filter(mask.clone())?; + let codes_sizes = array.codes_sizes().filter(mask.clone())?; + let uncompressed_lengths = array.uncompressed_lengths().filter(mask.clone())?; + + // SAFETY: filter preserves all `FSSTView` invariants — offsets/sizes/lengths stay + // non-nullable and equal-length, and validity tracks nullness separately. + Ok(Some( + unsafe { + FSSTView::new_unchecked( + array.dtype().clone(), + array.symbols().clone(), + array.symbol_lengths().clone(), + array.codes_bytes_handle().clone(), + codes_offsets, + codes_sizes, + uncompressed_lengths, + validity, + ) + } + .into_array(), + )) + } +} + +impl TakeExecute for FSSTView { + fn take( + array: ArrayView<'_, Self>, + indices: &ArrayRef, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let dtype = array + .dtype() + .clone() + .union_nullability(indices.dtype().nullability()); + + let validity = array.fsstview_validity().take(indices)?; + + // `take` may reorder, duplicate, or skip elements, which is fine for `FSSTView` since + // offsets need not be monotonic. `take` yields nullable arrays (null index -> null), + // so we fill nulls with zero and rely on `validity` to track nullness. + let codes_offsets = array + .codes_offsets() + .take(indices.clone())? + .fill_null(Scalar::zero_value(array.codes_offsets().dtype()))?; + let codes_sizes = array + .codes_sizes() + .take(indices.clone())? + .fill_null(Scalar::zero_value(array.codes_sizes().dtype()))?; + let uncompressed_lengths = array + .uncompressed_lengths() + .take(indices.clone())? + .fill_null(Scalar::zero_value(array.uncompressed_lengths().dtype()))?; + + // SAFETY: take preserves all `FSSTView` invariants (see `filter`). 
+ Ok(Some( + unsafe { + FSSTView::new_unchecked( + dtype, + array.symbols().clone(), + array.symbol_lengths().clone(), + array.codes_bytes_handle().clone(), + codes_offsets, + codes_sizes, + uncompressed_lengths, + validity, + ) + } + .into_array(), + )) + } +} diff --git a/encodings/fsst/src/fsstview/kernel.rs b/encodings/fsst/src/fsstview/kernel.rs new file mode 100644 index 00000000000..4cbd32bf565 --- /dev/null +++ b/encodings/fsst/src/fsstview/kernel.rs @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::dict::TakeExecuteAdaptor; +use vortex_array::arrays::filter::FilterExecuteAdaptor; +use vortex_array::kernel::ParentKernelSet; + +use super::array::FSSTView; + +pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&FilterExecuteAdaptor(FSSTView)), + ParentKernelSet::lift(&TakeExecuteAdaptor(FSSTView)), +]); diff --git a/encodings/fsst/src/fsstview/mod.rs b/encodings/fsst/src/fsstview/mod.rs new file mode 100644 index 00000000000..4afed4f4f9e --- /dev/null +++ b/encodings/fsst/src/fsstview/mod.rs @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! A "ListView"-style variant of the [`FSST`][crate::FSST] encoding. +//! +//! Where [`FSST`][crate::FSST] addresses its compressed codes with a single monotonic +//! offsets array (`len + 1` offsets, exactly like `VarBin`/`List`), [`FSSTView`] addresses +//! them with a pair of `offsets` **and** `sizes` arrays (exactly like +//! [`ListView`][vortex_array::arrays::ListView]). Element `i`'s compressed bytecodes live in +//! `codes_bytes[offsets[i] .. offsets[i] + sizes[i]]`. +//! +//! Decoupling the start (`offset`) from the length (`size`) means the offsets are no longer +//! required to be monotonic or contiguous, so `filter`, `take`, and `slice` become +//! 
metadata-only operations: they rewrite only the (small) `offsets`/`sizes`/lengths/validity +//! arrays and **reuse the compressed byte heap untouched**. The plain [`FSST`][crate::FSST] +//! encoding has to rewrite the entire compressed heap for `filter`/`take` because it delegates +//! to `VarBin`. This is the same trade-off `ListView` makes over `List`. + +mod array; +mod canonical; +mod compute; +mod kernel; +mod ops; +mod rules; +mod slice; +#[cfg(test)] +mod tests; + +pub use array::*; diff --git a/encodings/fsst/src/fsstview/ops.rs b/encodings/fsst/src/fsstview/ops.rs new file mode 100644 index 00000000000..46fef3554bb --- /dev/null +++ b/encodings/fsst/src/fsstview/ops.rs @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::varbin::varbin_scalar; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::OperationsVTable; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use super::array::FSSTView; +use super::array::FSSTViewArraySlotsExt; + +impl OperationsVTable for FSSTView { + fn scalar_at( + array: ArrayView<'_, FSSTView>, + index: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + // Preconditions (see `OperationsVTable`): `index` is in bounds and non-null. 
+ let offset: usize = (&array.codes_offsets().execute_scalar(index, ctx)?).try_into()?; + let size: usize = (&array.codes_sizes().execute_scalar(index, ctx)?).try_into()?; + + let compressed = &array.codes_bytes()[offset..offset + size]; + let decoded = ByteBuffer::from(array.decompressor().decompress(compressed)); + Ok(varbin_scalar(decoded, array.dtype())) + } +} diff --git a/encodings/fsst/src/fsstview/rules.rs b/encodings/fsst/src/fsstview/rules.rs new file mode 100644 index 00000000000..a3a5c891be9 --- /dev/null +++ b/encodings/fsst/src/fsstview/rules.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::slice::SliceReduceAdaptor; +use vortex_array::optimizer::rules::ParentRuleSet; + +use super::array::FSSTView; + +pub(crate) static RULES: ParentRuleSet = + ParentRuleSet::new(&[ParentRuleSet::lift(&SliceReduceAdaptor(FSSTView))]); diff --git a/encodings/fsst/src/fsstview/slice.rs b/encodings/fsst/src/fsstview/slice.rs new file mode 100644 index 00000000000..4b77ce16e9f --- /dev/null +++ b/encodings/fsst/src/fsstview/slice.rs @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::ops::Range; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::IntoArray; +use vortex_array::arrays::slice::SliceReduce; +use vortex_error::VortexResult; + +use super::array::FSSTView; +use super::array::FSSTViewArrayExt; +use super::array::FSSTViewArraySlotsExt; + +impl SliceReduce for FSSTView { + fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { + // Slicing leaves the symbol table and compressed byte heap intact; we only slice the + // addressing arrays. 
+ Ok(Some( + unsafe { + FSSTView::new_unchecked( + array.dtype().clone(), + array.symbols().clone(), + array.symbol_lengths().clone(), + array.codes_bytes_handle().clone(), + array.codes_offsets().slice(range.clone())?, + array.codes_sizes().slice(range.clone())?, + array.uncompressed_lengths().slice(range.clone())?, + array.fsstview_validity().slice(range)?, + ) + } + .into_array(), + )) + } +} diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs new file mode 100644 index 00000000000..33e503c5af2 --- /dev/null +++ b/encodings/fsst/src/fsstview/tests.rs @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use rstest::rstest; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::assert_arrays_eq; +use vortex_array::compute::conformance::consistency::test_array_consistency; +use vortex_array::compute::conformance::filter::test_filter_conformance; +use vortex_array::compute::conformance::take::test_take_conformance; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_error::VortexResult; +use vortex_mask::Mask; + +use crate::FSSTView; +use crate::FSSTViewArray; +use crate::fsst_compress; +use crate::fsst_train_compressor; +use crate::fsstview_from_fsst; + +fn make_fsstview( + strings: &[Option<&str>], + nullability: Nullability, + ctx: &mut ExecutionCtx, +) -> FSSTViewArray { + let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability)); + let compressor = fsst_train_compressor(&varbin); + let fsst = fsst_compress(&varbin, varbin.len(), varbin.dtype(), &compressor, ctx); + fsstview_from_fsst(&fsst, ctx).expect("fsstview_from_fsst") +} + +const SAMPLE: [Option<&str>; 6] = [ + Some("hello world"), + Some("testing fsst 
compression"), + Some("hello world"), + Some("another string here"), + Some("the quick brown fox"), + Some("hello world"), +]; + +const SAMPLE_NULLABLE: [Option<&str>; 6] = [ + Some("hello world"), + None, + Some("testing fsst compression"), + Some("another string here"), + None, + Some("the quick brown fox"), +]; + +#[test] +fn canonicalizes_to_same_values() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(&SAMPLE, Nullability::NonNullable, &mut ctx); + let array = view.into_array(); + assert!(array.is::()); + + let canonical = array.execute::(&mut ctx)?; + let expected = VarBinArray::from_iter( + SAMPLE.iter().copied(), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + .execute::(&mut ctx)?; + assert_arrays_eq!(canonical.into_array(), expected.into_array()); + Ok(()) +} + +#[test] +fn filter_matches_canonical() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(&SAMPLE_NULLABLE, Nullability::Nullable, &mut ctx); + + let mask = Mask::from_iter([true, false, true, false, true, true]); + + // The filtered FSSTView reuses the original byte heap untouched. + let filtered = view.into_array().filter(mask.clone())?; + let result = filtered.execute::(&mut ctx)?; + + let expected = VarBinArray::from_iter( + SAMPLE_NULLABLE.iter().copied(), + DType::Utf8(Nullability::Nullable), + ) + .into_array() + .filter(mask)? + .execute::(&mut ctx)?; + + assert_arrays_eq!(result.into_array(), expected.into_array()); + Ok(()) +} + +#[test] +fn take_matches_canonical() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(&SAMPLE, Nullability::NonNullable, &mut ctx); + + // Reorders and duplicates, which is fine for offsets+sizes addressing. 
+ let indices = vortex_array::arrays::PrimitiveArray::from_iter([5u64, 0, 0, 3, 1]).into_array(); + + let taken = view.into_array().take(indices.clone())?; + let result = taken.execute::(&mut ctx)?; + + let expected = VarBinArray::from_iter( + SAMPLE.iter().copied(), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + .take(indices)? + .execute::(&mut ctx)?; + + assert_arrays_eq!(result.into_array(), expected.into_array()); + Ok(()) +} + +#[test] +fn slice_matches_canonical() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(&SAMPLE, Nullability::NonNullable, &mut ctx); + + let sliced = view.into_array().slice(1..4)?; + let result = sliced.execute::(&mut ctx)?; + + let expected = VarBinArray::from_iter( + SAMPLE.iter().copied(), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + .slice(1..4)? + .execute::(&mut ctx)?; + + assert_arrays_eq!(result.into_array(), expected.into_array()); + Ok(()) +} + +#[test] +fn scalar_at_decodes_each_element() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(&SAMPLE, Nullability::NonNullable, &mut ctx); + let array = view.into_array(); + + for (i, expected) in SAMPLE.iter().enumerate() { + let scalar = array.execute_scalar(i, &mut ctx)?; + let value = scalar.as_utf8().value().expect("non-null"); + assert_eq!(value.as_str(), expected.unwrap()); + } + Ok(()) +} + +#[rstest] +#[case(&SAMPLE, Nullability::NonNullable)] +#[case(&SAMPLE_NULLABLE, Nullability::Nullable)] +fn filter_conformance(#[case] strings: &[Option<&str>], #[case] nullability: Nullability) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(strings, nullability, &mut ctx); + test_filter_conformance(&view.into_array()); +} + +#[rstest] +#[case(&SAMPLE, Nullability::NonNullable)] +#[case(&SAMPLE_NULLABLE, Nullability::Nullable)] +fn take_conformance(#[case] strings: &[Option<&str>], #[case] nullability: 
Nullability) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(strings, nullability, &mut ctx); + test_take_conformance(&view.into_array()); +} + +#[rstest] +#[case(&SAMPLE, Nullability::NonNullable)] +#[case(&SAMPLE_NULLABLE, Nullability::Nullable)] +fn consistency(#[case] strings: &[Option<&str>], #[case] nullability: Nullability) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let view = make_fsstview(strings, nullability, &mut ctx); + test_array_consistency(&view.into_array()); +} diff --git a/encodings/fsst/src/lib.rs b/encodings/fsst/src/lib.rs index 3305c0e66fc..47fbfae9547 100644 --- a/encodings/fsst/src/lib.rs +++ b/encodings/fsst/src/lib.rs @@ -16,6 +16,7 @@ mod canonical; mod compress; mod compute; mod dfa; +mod fsstview; mod kernel; mod ops; mod rules; @@ -27,3 +28,4 @@ mod tests; pub use array::*; pub use compress::*; +pub use fsstview::*; diff --git a/vortex-file/src/lib.rs b/vortex-file/src/lib.rs index e69b5848de2..58179486b76 100644 --- a/vortex-file/src/lib.rs +++ b/vortex-file/src/lib.rs @@ -115,6 +115,7 @@ use vortex_array::arrays::patched::use_experimental_patches; use vortex_array::session::ArraySessionExt; use vortex_bytebool::ByteBool; use vortex_fsst::FSST; +use vortex_fsst::FSSTView; use vortex_pco::Pco; use vortex_session::VortexSession; use vortex_zigzag::ZigZag; @@ -162,6 +163,7 @@ pub fn register_default_encodings(session: &VortexSession) { arrays.register(ByteBool); arrays.register(Dict); arrays.register(FSST); + arrays.register(FSSTView); arrays.register(Pco); arrays.register(ZigZag); #[cfg(feature = "zstd")] From 0b1923377b00c3b54ec0f57e8dd8a7d0e1cd1a72 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 14:01:58 +0000 Subject: [PATCH 02/23] FSSTView: fast FSST->view filter/take, smarter canonical compaction, bench Adds the second hop and the canonicalization decision for the FSSTView pipeline, plus a benchmark that measures the trade-off directly. 
- `fsst_filter_to_view` / `fsst_take_to_view`: reinterpret an FSSTArray as an FSSTView (sharing symbols + codes bytes) and apply the metadata-only kernel, so filtering/taking an FSSTArray never rewrites the compressed byte heap. - Canonicalization now chooses a compaction strategy (FsstViewCompaction): - Direct: live codes still contiguous/in-order (untouched or sliced view) -> one bulk decompress, no copy. - GatherBulk ("compact"): copy the scattered live codes contiguous, then one bulk decompress. Wins when strings are short/numerous (per-call overhead dominates otherwise; the gather is cheap and unlocks bulk SIMD). - PerElement ("no compact"): decompress each element's slice in place, no copy. Wins when strings are long/few (the gather copy dominates). Auto picks Direct when contiguous, else GatherBulk/PerElement by average compressed bytes/element. `canonicalize_fsstview_with` exposes each strategy for benchmarking. - benches/fsst_view_compute.rs: calls kernels directly (no dispatch) and measures each part. filter (selective/non-selective), take (shuffle / selective / dense), and a filter+take combo, over two ~2 MiB inputs (many short strings, fewer long strings). fsst pipeline compacts into a fresh FSSTArray each step then canonicalizes; fsstview pipeline stays metadata-only then canonicalizes under each compaction strategy. - Tests: from_fsst helpers vs canonical, and all compaction strategies agree on both contiguous and scattered views. 
Signed-off-by: Joe Isaacs --- encodings/fsst/Cargo.toml | 4 + encodings/fsst/benches/fsst_view_compute.rs | 501 ++++++++++++++++++++ encodings/fsst/src/fsstview/canonical.rs | 232 ++++++--- encodings/fsst/src/fsstview/from_fsst.rs | 51 ++ encodings/fsst/src/fsstview/mod.rs | 5 + encodings/fsst/src/fsstview/tests.rs | 97 +++- 6 files changed, 836 insertions(+), 54 deletions(-) create mode 100644 encodings/fsst/benches/fsst_view_compute.rs create mode 100644 encodings/fsst/src/fsstview/from_fsst.rs diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 0d722a131d3..b1010a53572 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -56,5 +56,9 @@ name = "chunked_dict_fsst_builder" harness = false required-features = ["_test-harness"] +[[bench]] +name = "fsst_view_compute" +harness = false + [package.metadata.cargo-machete] ignored = ["fsst-rs"] diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs new file mode 100644 index 00000000000..cecba407783 --- /dev/null +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -0,0 +1,501 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compares the two ways to run a `filter`/`take` pipeline that ends in a `VarBinViewArray`: +//! +//! 1. **fsst pipeline**: stay in [`FSSTArray`] at every step, compacting the codes into a fresh +//! [`FSSTArray`] each time (the kernels delegate to `VarBin`, rewriting the byte heap), then +//! canonicalize to a [`VarBinViewArray`] at the end. +//! 2. **fsstview pipeline**: convert to [`FSSTViewArray`] and apply the metadata-only kernels +//! (offsets/sizes only — the byte heap is never touched), then canonicalize to a +//! [`VarBinViewArray`] at the end. +//! +//! Kernels are invoked directly (no Vortex execution/dispatch) so each part is measured in +//! isolation: the `_step` benches measure just the filter/take hop; the `_pipeline` benches +//! 
measure the hop plus the final canonicalization. For the fsstview pipeline the final +//! canonicalization is measured under each [`FsstViewCompaction`] strategy so the compaction +//! trade-off is visible directly. +//! +//! Two ~2 MiB (uncompressed) inputs are used: one with **many short** strings and one with +//! **fewer long** strings. The expectation: the fsstview hop is far cheaper in both cases (no +//! heap rewrite); for the final canonicalization, `GatherBulk` (compact) wins on the short-string +//! input while `PerElement` (no compact) wins on the long-string input. + +#![expect(clippy::unwrap_used)] + +use divan::Bencher; +use divan::black_box; +use rand::RngExt; +use rand::SeedableRng; +use rand::rngs::StdRng; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::dict::TakeExecute; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_fsst::FSST; +use vortex_fsst::FSSTArray; +use vortex_fsst::FSSTView; +use vortex_fsst::FsstViewCompaction; +use vortex_fsst::canonicalize_fsstview_with; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; +use vortex_fsst::fsstview_from_fsst; +use vortex_mask::Mask; + +fn main() { + divan::main(); +} + +/// ~2 MiB of uncompressed string data, in two shapes. +const TARGET_UNCOMPRESSED: usize = 2 * 1024 * 1024; + +#[derive(Clone, Copy, Debug)] +enum Shape { + /// Many short strings (~12 bytes each) — small per-element work. + ManyShort, + /// Fewer long strings (~256 bytes each) — large per-element work. 
+ FewLong, +} + +impl Shape { + fn avg_len(self) -> usize { + match self { + Shape::ManyShort => 12, + Shape::FewLong => 256, + } + } + + fn count(self) -> usize { + TARGET_UNCOMPRESSED / self.avg_len() + } + + fn name(self) -> &'static str { + match self { + Shape::ManyShort => "many_short", + Shape::FewLong => "few_long", + } + } +} + +/// Build a ~2 MiB input. We use a small alphabet so FSST finds good symbols (realistic +/// compression), with some shared substrings to mimic real string columns. +fn generate(shape: Shape) -> VarBinArray { + let mut rng = StdRng::seed_from_u64(42); + let count = shape.count(); + let avg_len = shape.avg_len(); + let mut strings: Vec> = Vec::with_capacity(count); + + const WORDS: &[&str] = &[ + "https://", "example", "vortex", ".com/", "path", "query=", "value", "data", "alpha", + "bravo", "charlie", "delta", "_", "-", "/", "0123", + ]; + + for _ in 0..count { + let target = avg_len * rng.random_range(70..=130) / 100; + let mut s = String::with_capacity(target + 8); + while s.len() < target { + s.push_str(WORDS[rng.random_range(0..WORDS.len())]); + } + s.truncate(target.max(1)); + strings.push(s.into_bytes().into_boxed_slice()); + } + + VarBinArray::from_iter( + strings.into_iter().map(Some), + DType::Utf8(Nullability::NonNullable), + ) +} + +fn compress(varbin: &VarBinArray, ctx: &mut ExecutionCtx) -> FSSTArray { + let compressor = fsst_train_compressor(varbin); + fsst_compress(varbin, varbin.len(), varbin.dtype(), &compressor, ctx) +} + +/// A selective mask keeps ~10% of rows; a non-selective mask keeps ~90%. +fn make_mask(len: usize, keep_fraction: f64) -> Mask { + let mut rng = StdRng::seed_from_u64(7); + Mask::from_iter((0..len).map(|_| rng.random_bool(keep_fraction))) +} + +#[derive(Clone, Copy, Debug)] +enum TakeKind { + /// A full shuffle (permutation of all rows) — same length, reordered. + Shuffle, + /// Very selective — pick ~5% of rows at random (with possible repeats). 
+ Selective, + /// Not selective — pick ~150% of rows at random (duplicates, output grows). + Dense, +} + +impl TakeKind { + fn name(self) -> &'static str { + match self { + TakeKind::Shuffle => "shuffle", + TakeKind::Selective => "selective", + TakeKind::Dense => "dense", + } + } +} + +fn compaction_name(strategy: FsstViewCompaction) -> &'static str { + match strategy { + FsstViewCompaction::Auto => "auto", + FsstViewCompaction::Direct => "direct", + FsstViewCompaction::GatherBulk => "gather_bulk", + FsstViewCompaction::PerElement => "per_element", + } +} + +fn make_indices(len: usize, kind: TakeKind) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(11); + let indices: Vec = match kind { + TakeKind::Shuffle => { + let mut v: Vec = (0..len as u64).collect(); + // Fisher-Yates. + for i in (1..v.len()).rev() { + v.swap(i, rng.random_range(0..=i)); + } + v + } + TakeKind::Selective => (0..(len / 20).max(1)) + .map(|_| rng.random_range(0..len as u64)) + .collect(), + TakeKind::Dense => (0..(len * 3 / 2)) + .map(|_| rng.random_range(0..len as u64)) + .collect(), + }; + PrimitiveArray::from_iter(indices).into_array() +} + +// ----- direct kernel wrappers (no Vortex dispatch) --------------------------------------------- + +fn fsst_filter(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> FSSTArray { + ::filter(array.as_view(), mask, ctx) + .unwrap() + .unwrap() + .try_downcast::() + .ok() + .unwrap() +} + +fn fsst_take(array: &FSSTArray, indices: &ArrayRef, ctx: &mut ExecutionCtx) -> FSSTArray { + ::take(array.as_view(), indices, ctx) + .unwrap() + .unwrap() + .try_downcast::() + .ok() + .unwrap() +} + +fn view_filter(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> ArrayRef { + let view = fsstview_from_fsst(array, ctx).unwrap(); + ::filter(view.as_view(), mask, ctx) + .unwrap() + .unwrap() +} + +fn view_take(array: &FSSTArray, indices: &ArrayRef, ctx: &mut ExecutionCtx) -> ArrayRef { + let view = fsstview_from_fsst(array, ctx).unwrap(); + 
::take(view.as_view(), indices, ctx) + .unwrap() + .unwrap() +} + +fn fsst_to_canonical(array: &FSSTArray, ctx: &mut ExecutionCtx) -> ArrayRef { + // Decompress straight to a VarBinView via the VarBin codes (the FSST canonical path). + array + .clone() + .into_array() + .execute::(ctx) + .unwrap() + .into_array() +} + +const SHAPES: &[Shape] = &[Shape::ManyShort, Shape::FewLong]; + +// =============================== FILTER ======================================================== + +/// Filter masks to exercise: selective (~10% kept) and non-selective (~90% kept). +const FILTER_KEEP: &[(&str, f64)] = &[("selective_10pct", 0.10), ("nonselective_90pct", 0.90)]; + +#[divan::bench(args = filter_args())] +fn filter_step_fsst(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| black_box(fsst_filter(fsst, mask, ctx))); +} + +#[divan::bench(args = filter_args())] +fn filter_step_view(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| black_box(view_filter(fsst, mask, ctx))); +} + +/// Full pipeline: filter (compacting into another FSSTArray) then canonicalize to VarBinView. 
+#[divan::bench(args = filter_args())] +fn filter_pipeline_fsst(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let filtered = fsst_filter(fsst, mask, ctx); + black_box(fsst_to_canonical(&filtered, ctx)) + }); +} + +/// Full pipeline: filter to FSSTView then canonicalize, once per compaction strategy. +#[divan::bench(args = filter_view_pipeline_args())] +fn filter_pipeline_view(bencher: Bencher, args: FilterViewArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let view = view_filter(fsst, mask, ctx) + .try_downcast::() + .ok() + .unwrap(); + black_box(canonicalize_fsstview_with(view.as_view(), args.strategy, ctx).unwrap()) + }); +} + +// =============================== TAKE ========================================================== + +const TAKE_KINDS: &[TakeKind] = &[TakeKind::Shuffle, TakeKind::Selective, TakeKind::Dense]; + +#[divan::bench(args = take_args())] +fn take_step_fsst(bencher: Bencher, args: TakeArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let indices = make_indices(fsst.len(), args.kind); + bencher + .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, indices, ctx)| black_box(fsst_take(fsst, indices, ctx))); +} + +#[divan::bench(args = take_args())] +fn take_step_view(bencher: Bencher, args: TakeArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let indices = 
make_indices(fsst.len(), args.kind); + bencher + .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, indices, ctx)| black_box(view_take(fsst, indices, ctx))); +} + +#[divan::bench(args = take_args())] +fn take_pipeline_fsst(bencher: Bencher, args: TakeArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let indices = make_indices(fsst.len(), args.kind); + bencher + .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, indices, ctx)| { + let taken = fsst_take(fsst, indices, ctx); + black_box(fsst_to_canonical(&taken, ctx)) + }); +} + +#[divan::bench(args = take_view_pipeline_args())] +fn take_pipeline_view(bencher: Bencher, args: TakeViewArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let indices = make_indices(fsst.len(), args.kind); + bencher + .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, indices, ctx)| { + let view = view_take(fsst, indices, ctx) + .try_downcast::() + .ok() + .unwrap(); + black_box(canonicalize_fsstview_with(view.as_view(), args.strategy, ctx).unwrap()) + }); +} + +// =============================== COMBINATION =================================================== + +/// A filter (selective) followed by a take (shuffle) — the realistic "scan then reorder" shape. +/// fsst path compacts twice; fsstview path stays metadata-only until the final canonicalize. 
+#[divan::bench(args = SHAPES)] +fn combo_pipeline_fsst(bencher: Bencher, shape: Shape) { + let varbin = generate(shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), 0.10); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let filtered = fsst_filter(fsst, mask, ctx); + let indices = make_indices(filtered.len(), TakeKind::Shuffle); + let taken = fsst_take(&filtered, &indices, ctx); + black_box(fsst_to_canonical(&taken, ctx)) + }); +} + +#[divan::bench(args = SHAPES)] +fn combo_pipeline_view(bencher: Bencher, shape: Shape) { + let varbin = generate(shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), 0.10); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + // filter -> view, then take on the view (both metadata-only), then canonicalize. 
+ let filtered = view_filter(fsst, mask, ctx) + .try_downcast::() + .ok() + .unwrap(); + let indices = make_indices(filtered.len(), TakeKind::Shuffle); + let taken = ::take(filtered.as_view(), &indices, ctx) + .unwrap() + .unwrap() + .try_downcast::() + .ok() + .unwrap(); + black_box( + canonicalize_fsstview_with(taken.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), + ) + }); +} + +// =============================== arg plumbing ================================================== + +#[derive(Clone, Copy)] +struct FilterArg { + shape: Shape, + keep: f64, + label: &'static str, +} + +impl std::fmt::Display for FilterArg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.shape.name(), self.label) + } +} + +fn filter_args() -> Vec { + let mut v = Vec::new(); + for &shape in SHAPES { + for &(label, keep) in FILTER_KEEP { + v.push(FilterArg { shape, keep, label }); + } + } + v +} + +#[derive(Clone, Copy)] +struct FilterViewArg { + shape: Shape, + keep: f64, + label: &'static str, + strategy: FsstViewCompaction, +} + +impl std::fmt::Display for FilterViewArg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}/{}/{}", + self.shape.name(), + self.label, + compaction_name(self.strategy) + ) + } +} + +fn filter_view_pipeline_args() -> Vec { + let mut v = Vec::new(); + for &shape in SHAPES { + for &(label, keep) in FILTER_KEEP { + for &strategy in COMPACTIONS { + v.push(FilterViewArg { + shape, + keep, + label, + strategy, + }); + } + } + } + v +} + +#[derive(Clone, Copy)] +struct TakeArg { + shape: Shape, + kind: TakeKind, +} + +impl std::fmt::Display for TakeArg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.shape.name(), self.kind.name()) + } +} + +fn take_args() -> Vec { + let mut v = Vec::new(); + for &shape in SHAPES { + for &kind in TAKE_KINDS { + v.push(TakeArg { shape, kind }); + } + } + v +} + +#[derive(Clone, Copy)] +struct 
TakeViewArg { + shape: Shape, + kind: TakeKind, + strategy: FsstViewCompaction, +} + +impl std::fmt::Display for TakeViewArg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}/{}/{}", + self.shape.name(), + self.kind.name(), + compaction_name(self.strategy) + ) + } +} + +const COMPACTIONS: &[FsstViewCompaction] = &[ + FsstViewCompaction::Auto, + FsstViewCompaction::GatherBulk, + FsstViewCompaction::PerElement, +]; + +fn take_view_pipeline_args() -> Vec { + let mut v = Vec::new(); + for &shape in SHAPES { + for &kind in TAKE_KINDS { + for &strategy in COMPACTIONS { + v.push(TakeViewArg { + shape, + kind, + strategy, + }); + } + } + } + v +} diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 5a1df694d2b..9368db52e17 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -1,8 +1,32 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! Canonicalization of [`FSSTView`] into a [`VarBinViewArray`]. +//! +//! After metadata-only `filter`/`take`, an [`FSSTView`]'s byte heap is the *original* heap and +//! the live codes are scattered (gaps after a filter, reordering/duplication after a take). To +//! canonicalize we must produce one contiguous decompressed buffer in element order. There are +//! three ways to get there, with different cost profiles — see [`FsstViewCompaction`]: +//! +//! - [`Direct`][FsstViewCompaction::Direct]: the live codes are still contiguous and in order +//! (e.g. an untouched view or one that was only sliced). We bulk-decompress that single +//! contiguous range with no copy. Fastest, but only valid when contiguous. +//! - [`GatherBulk`][FsstViewCompaction::GatherBulk] ("compact"): copy the scattered live codes +//! into a contiguous buffer, then a *single* bulk decompress. Pays a copy of the live +//! 
compressed bytes but the one bulk call amortizes the FSST 8-wide fast path across all +//! element boundaries. +//! - [`PerElement`][FsstViewCompaction::PerElement] ("no compact"): decompress each element's +//! slice directly into its place in the output. No copy, but one decompress call per element. +//! +//! The compaction question, concretely: **compacting (`GatherBulk`) beats `PerElement` when the +//! strings are short and numerous** — per-call overhead then dominates `PerElement` while the +//! gather copy is cheap and unlocks bulk SIMD. **`PerElement` wins when the strings are long and +//! few** — the gather copy dominates and per-call overhead is negligible. Density decides whether +//! `Direct` is even available; average element size decides `GatherBulk` vs `PerElement`. + use std::sync::Arc; +use fsst::Decompressor; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; @@ -19,75 +43,95 @@ use super::array::FSSTView; use super::array::FSSTViewArrayExt; use super::array::FSSTViewArraySlotsExt; -/// Canonicalize an [`FSSTView`] array into a [`VarBinViewArray`]. +/// Strategy for materializing the decompressed bytes when canonicalizing an [`FSSTView`]. /// -/// Because `filter`/`take`/`slice` leave the compressed byte heap untouched, the live codes of -/// element `i` are the (possibly out-of-order, possibly overlapping) slice -/// `codes_bytes[offset_i .. offset_i + size_i]`. We first gather them into element order, then -/// bulk-decompress in a single pass and build the binary views from the uncompressed lengths. +/// See the [module docs][self] for the full trade-off analysis. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum FsstViewCompaction { + /// Pick a strategy automatically based on contiguity and average element size. + Auto, + /// Bulk-decompress the contiguous live range with no copy. Falls back to `GatherBulk` if the + /// view's codes are not contiguous and in order. 
+ Direct, + /// Compact the scattered live codes into a contiguous buffer, then a single bulk decompress. + GatherBulk, + /// Decompress each element's code slice directly into place, without compacting. + PerElement, +} + +/// Average compressed bytes/element below which compaction (`GatherBulk`) is preferred over +/// `PerElement`. Heuristic; see module docs. Validated by the `fsst_view_compute` benchmark. +const SHORT_STRING_THRESHOLD: usize = 32; + pub(super) fn canonicalize_fsstview( array: ArrayView<'_, FSSTView>, ctx: &mut ExecutionCtx, ) -> VortexResult { - let len = array.len(); - let bytes = array.codes_bytes(); + canonicalize_fsstview_with(array, FsstViewCompaction::Auto, ctx) +} - let offsets = array - .codes_offsets() - .clone() - .execute::(ctx)?; - let sizes = array.codes_sizes().clone().execute::(ctx)?; - let uncompressed_lengths = array +/// Canonicalize an [`FSSTView`] to a [`VarBinViewArray`] using an explicit compaction strategy. +/// +/// Exposed (rather than only the dispatch-driven [`canonicalize_fsstview`]) so benchmarks can +/// measure each strategy directly. Production code goes through [`FsstViewCompaction::Auto`]. +pub fn canonicalize_fsstview_with( + array: ArrayView<'_, FSSTView>, + strategy: FsstViewCompaction, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let offsets = load_usize(array.codes_offsets(), ctx)?; + let sizes = load_usize(array.codes_sizes(), ctx)?; + + let ulen_prim = array .uncompressed_lengths() .clone() .execute::(ctx)?; - - #[expect(clippy::cast_possible_truncation)] - let offsets: Vec = match_each_integer_ptype!(offsets.ptype(), |O| { - offsets - .as_slice::() - .iter() - .map(|o| *o as usize) - .collect() - }); - #[expect(clippy::cast_possible_truncation)] - let sizes: Vec = match_each_integer_ptype!(sizes.ptype(), |S| { - sizes.as_slice::().iter().map(|s| *s as usize).collect() - }); - - // Gather the live compressed bytes into element order. 
- let total_compressed: usize = sizes.iter().sum(); - let mut compressed = ByteBufferMut::with_capacity(total_compressed); - for i in 0..len { - compressed.extend_from_slice(&bytes[offsets[i]..offsets[i] + sizes[i]]); - } - #[expect(clippy::cast_possible_truncation)] - let total_size: usize = match_each_integer_ptype!(uncompressed_lengths.ptype(), |P| { - uncompressed_lengths + let ulens: Vec = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + ulen_prim .as_slice::

() .iter() .map(|x| *x as usize) - .sum() + .collect() }); + let total_size: usize = ulens.iter().sum(); + let live: usize = sizes.iter().sum(); - // Bulk-decompress the gathered heap. We reserve 7 extra bytes because the FSST decoder may - // overrun the output by up to a word. + let heap_buffer = array.codes_bytes(); + let heap = heap_buffer.as_slice(); let decompressor = array.decompressor(); - let mut uncompressed_bytes = ByteBufferMut::with_capacity(total_size + 7); - let written = decompressor.decompress_into( - compressed.as_slice(), - uncompressed_bytes.spare_capacity_mut(), - ); - unsafe { uncompressed_bytes.set_len(written) }; - - let (buffers, views) = match_each_integer_ptype!(uncompressed_lengths.ptype(), |P| { - build_views( - 0, - MAX_BUFFER_LEN, - uncompressed_bytes, - uncompressed_lengths.as_slice::

(), - ) + + let contiguous = is_contiguous(&offsets, &sizes); + let chosen = match strategy { + FsstViewCompaction::Auto => { + if contiguous { + FsstViewCompaction::Direct + } else if !offsets.is_empty() && live / offsets.len() < SHORT_STRING_THRESHOLD { + FsstViewCompaction::GatherBulk + } else { + FsstViewCompaction::PerElement + } + } + // `Direct` is only valid for a contiguous layout; fall back to a compacting decode. + FsstViewCompaction::Direct if !contiguous => FsstViewCompaction::GatherBulk, + other => other, + }; + + let uncompressed = match chosen { + FsstViewCompaction::Direct => { + let start = offsets.first().copied().unwrap_or(0); + decompress_direct(&decompressor, heap, start, live, total_size) + } + FsstViewCompaction::GatherBulk => { + decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size) + } + FsstViewCompaction::PerElement | FsstViewCompaction::Auto => { + decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) + } + }; + + let (buffers, views) = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + build_views(0, MAX_BUFFER_LEN, uncompressed, ulen_prim.as_slice::

()) }); // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. @@ -101,3 +145,85 @@ pub(super) fn canonicalize_fsstview( .into_array() }) } + +fn load_usize(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult> { + let prim = array.clone().execute::(ctx)?; + #[expect(clippy::cast_possible_truncation)] + let out: Vec = match_each_integer_ptype!(prim.ptype(), |P| { + prim.as_slice::

().iter().map(|x| *x as usize).collect() + }); + Ok(out) +} + +/// Returns true if the live codes occupy a single contiguous, in-order run of the heap. +fn is_contiguous(offsets: &[usize], sizes: &[usize]) -> bool { + let Some(&first) = offsets.first() else { + return true; + }; + let mut pos = first; + for (&offset, &size) in offsets.iter().zip(sizes) { + if offset != pos { + return false; + } + pos += size; + } + true +} + +/// Decompress a single contiguous run of the heap in one bulk call (no copy). +fn decompress_direct( + decompressor: &Decompressor<'_>, + heap: &[u8], + start: usize, + live: usize, + total_size: usize, +) -> ByteBufferMut { + let mut out = ByteBufferMut::with_capacity(total_size + 7); + let written = + decompressor.decompress_into(&heap[start..start + live], out.spare_capacity_mut()); + unsafe { out.set_len(written) }; + out +} + +/// Compact the scattered live codes into a contiguous buffer, then a single bulk decompress. +fn decompress_gather( + decompressor: &Decompressor<'_>, + heap: &[u8], + offsets: &[usize], + sizes: &[usize], + live: usize, + total_size: usize, +) -> ByteBufferMut { + let mut compressed = ByteBufferMut::with_capacity(live); + for (&offset, &size) in offsets.iter().zip(sizes) { + compressed.extend_from_slice(&heap[offset..offset + size]); + } + let mut out = ByteBufferMut::with_capacity(total_size + 7); + let written = decompressor.decompress_into(compressed.as_slice(), out.spare_capacity_mut()); + unsafe { out.set_len(written) }; + out +} + +/// Decompress each element's code slice directly into its place in the output (no compaction). 
+fn decompress_per_element( + decompressor: &Decompressor<'_>, + heap: &[u8], + offsets: &[usize], + sizes: &[usize], + ulens: &[usize], + total_size: usize, +) -> ByteBufferMut { + let mut out = ByteBufferMut::with_capacity(total_size + 7); + { + let spare = out.spare_capacity_mut(); + let mut uoff = 0; + for ((&offset, &size), &ulen) in offsets.iter().zip(sizes).zip(ulens) { + if size > 0 { + decompressor.decompress_into(&heap[offset..offset + size], &mut spare[uoff..]); + } + uoff += ulen; + } + } + unsafe { out.set_len(total_size) }; + out +} diff --git a/encodings/fsst/src/fsstview/from_fsst.rs b/encodings/fsst/src/fsstview/from_fsst.rs new file mode 100644 index 00000000000..f83785d5cdf --- /dev/null +++ b/encodings/fsst/src/fsstview/from_fsst.rs @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Metadata-only `filter`/`take` that go straight from an [`FSSTArray`] to an [`FSSTViewArray`]. +//! +//! These are the "first hop" of the view pipeline. They never touch the compressed byte heap: +//! the [`FSSTArray`] is reinterpreted as an [`FSSTViewArray`] (sharing symbols + codes bytes, +//! deriving `sizes` from the consecutive offsets) and then the selection is applied to the small +//! `offsets`/`sizes`/`lengths`/`validity` arrays only. + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::dict::TakeExecute; +use vortex_array::arrays::filter::FilterKernel; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_mask::Mask; + +use super::array::FSSTView; +use super::array::FSSTViewArray; +use super::array::fsstview_from_fsst; +use crate::FSSTArray; + +/// Filter an [`FSSTArray`], producing an [`FSSTViewArray`] without touching the codes. 
+pub fn fsst_filter_to_view( + array: &FSSTArray, + mask: &Mask, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let view = fsstview_from_fsst(array, ctx)?; + let filtered: ArrayRef = ::filter(view.as_view(), mask, ctx)? + .vortex_expect("FSSTView filter always returns Some"); + filtered + .try_downcast::() + .map_err(|_| vortex_err!("FSSTView filter must return an FSSTView")) +} + +/// Take from an [`FSSTArray`], producing an [`FSSTViewArray`] without touching the codes. +pub fn fsst_take_to_view( + array: &FSSTArray, + indices: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let view = fsstview_from_fsst(array, ctx)?; + let taken: ArrayRef = ::take(view.as_view(), indices, ctx)? + .vortex_expect("FSSTView take always returns Some"); + taken + .try_downcast::() + .map_err(|_| vortex_err!("FSSTView take must return an FSSTView")) +} diff --git a/encodings/fsst/src/fsstview/mod.rs b/encodings/fsst/src/fsstview/mod.rs index 4afed4f4f9e..8efbb50b7fa 100644 --- a/encodings/fsst/src/fsstview/mod.rs +++ b/encodings/fsst/src/fsstview/mod.rs @@ -19,6 +19,7 @@ mod array; mod canonical; mod compute; +mod from_fsst; mod kernel; mod ops; mod rules; @@ -27,3 +28,7 @@ mod slice; mod tests; pub use array::*; +pub use canonical::FsstViewCompaction; +pub use canonical::canonicalize_fsstview_with; +pub use from_fsst::fsst_filter_to_view; +pub use from_fsst::fsst_take_to_view; diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index 33e503c5af2..31fe057bcf8 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -6,6 +6,7 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::assert_arrays_eq; @@ -17,9 +18,14 @@ use vortex_array::dtype::Nullability; use 
vortex_error::VortexResult; use vortex_mask::Mask; +use crate::FSSTArray; use crate::FSSTView; use crate::FSSTViewArray; +use crate::FsstViewCompaction; +use crate::canonicalize_fsstview_with; use crate::fsst_compress; +use crate::fsst_filter_to_view; +use crate::fsst_take_to_view; use crate::fsst_train_compressor; use crate::fsstview_from_fsst; @@ -99,7 +105,7 @@ fn take_matches_canonical() -> VortexResult<()> { let view = make_fsstview(&SAMPLE, Nullability::NonNullable, &mut ctx); // Reorders and duplicates, which is fine for offsets+sizes addressing. - let indices = vortex_array::arrays::PrimitiveArray::from_iter([5u64, 0, 0, 3, 1]).into_array(); + let indices = PrimitiveArray::from_iter([5u64, 0, 0, 3, 1]).into_array(); let taken = view.into_array().take(indices.clone())?; let result = taken.execute::(&mut ctx)?; @@ -176,3 +182,92 @@ fn consistency(#[case] strings: &[Option<&str>], #[case] nullability: Nullabilit let view = make_fsstview(strings, nullability, &mut ctx); test_array_consistency(&view.into_array()); } + +fn make_fsst( + strings: &[Option<&str>], + nullability: Nullability, + ctx: &mut ExecutionCtx, +) -> FSSTArray { + let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability)); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(&varbin, varbin.len(), varbin.dtype(), &compressor, ctx) +} + +/// `fsst_filter_to_view` must agree with filtering the canonical VarBin, and must not touch the +/// codes bytes (the produced view shares the original heap). 
+#[test] +fn fsst_filter_to_view_matches_canonical() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let fsst = make_fsst(&SAMPLE_NULLABLE, Nullability::Nullable, &mut ctx); + let mask = Mask::from_iter([true, false, true, false, true, true]); + + let view = fsst_filter_to_view(&fsst, &mask, &mut ctx)?; + let result = view.into_array().execute::(&mut ctx)?; + + let expected = VarBinArray::from_iter( + SAMPLE_NULLABLE.iter().copied(), + DType::Utf8(Nullability::Nullable), + ) + .into_array() + .filter(mask)? + .execute::(&mut ctx)?; + assert_arrays_eq!(result.into_array(), expected.into_array()); + Ok(()) +} + +#[test] +fn fsst_take_to_view_matches_canonical() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let fsst = make_fsst(&SAMPLE, Nullability::NonNullable, &mut ctx); + let indices = PrimitiveArray::from_iter([5u64, 0, 0, 3, 1]).into_array(); + + let view = fsst_take_to_view(&fsst, &indices, &mut ctx)?; + let result = view.into_array().execute::(&mut ctx)?; + + let expected = VarBinArray::from_iter( + SAMPLE.iter().copied(), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + .take(indices)? + .execute::(&mut ctx)?; + assert_arrays_eq!(result.into_array(), expected.into_array()); + Ok(()) +} + +/// All three explicit compaction strategies must produce identical canonical output, both for a +/// contiguous (sliced) view and a scattered (taken) one. +#[rstest] +#[case(FsstViewCompaction::Auto)] +#[case(FsstViewCompaction::Direct)] +#[case(FsstViewCompaction::GatherBulk)] +#[case(FsstViewCompaction::PerElement)] +fn compaction_strategies_agree(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let fsst = make_fsst(&SAMPLE, Nullability::NonNullable, &mut ctx); + + // Scattered view via a take (reorders + duplicates -> non-contiguous codes). 
+ let indices = PrimitiveArray::from_iter([5u64, 0, 0, 3, 1, 2]).into_array(); + let scattered = fsst_take_to_view(&fsst, &indices, &mut ctx)?; + let got = canonicalize_fsstview_with(scattered.as_view(), strategy, &mut ctx)?; + let expected = VarBinArray::from_iter( + SAMPLE.iter().copied(), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + .take(indices)? + .execute::(&mut ctx)?; + assert_arrays_eq!(got, expected.into_array()); + + // Contiguous view (untouched) — exercises the Direct fast path. + let contiguous = fsstview_from_fsst(&fsst, &mut ctx)?; + let got = canonicalize_fsstview_with(contiguous.as_view(), strategy, &mut ctx)?; + let expected = VarBinArray::from_iter( + SAMPLE.iter().copied(), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + .execute::(&mut ctx)?; + assert_arrays_eq!(got, expected.into_array()); + Ok(()) +} From df7aa0d256f6c8e6f2969cd22c1de27410c20c27 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 14:06:46 +0000 Subject: [PATCH 03/23] FSSTView: simplify Auto compaction to match benchmark results The fsst_view_compute benchmark (two ~2 MiB inputs, ~12-byte and ~256-byte strings) shows GatherBulk beats PerElement across the entire tested range, not just for short strings as originally guessed. FSST's decoder has a fast 8-wide body and a slow byte-by-byte tail; PerElement pays that tail once per element while GatherBulk pays it once for the whole heap, which dominates the gather memcpy even at 256-byte strings. 
Selected medians (canonicalize after the metadata-only hop): take few_long/shuffle: gather 459us vs per_element 623us take few_long/dense: gather 838us vs per_element 981us filter many_short/nonsel: gather 5.38ms vs per_element 5.92ms And the metadata-only hop itself is far cheaper than compacting FSST: take_step many_short/shuffle: view 650us vs fsst 2.84ms (~4x) take_step many_short/dense: view 604us vs fsst 4.15ms (~7x) So Auto now picks Direct when the live codes are contiguous and GatherBulk otherwise; it never selects PerElement (kept selectable for measurement, wins only in the few-very-long-strings extreme outside real columns). Drops the SHORT_STRING_THRESHOLD heuristic and updates the docs to the measured behavior. Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 10 ++++-- encodings/fsst/src/fsstview/canonical.rs | 34 +++++++++++++-------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index cecba407783..707340a6121 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -17,9 +17,13 @@ //! trade-off is visible directly. //! //! Two ~2 MiB (uncompressed) inputs are used: one with **many short** strings and one with -//! **fewer long** strings. The expectation: the fsstview hop is far cheaper in both cases (no -//! heap rewrite); for the final canonicalization, `GatherBulk` (compact) wins on the short-string -//! input while `PerElement` (no compact) wins on the long-string input. +//! **fewer long** strings. +//! +//! Observed (medians): the fsstview hop is far cheaper in both cases (no heap rewrite) — e.g. +//! `take many_short/shuffle` is ~650 µs vs ~2.84 ms for fsst. For the final canonicalization, +//! `GatherBulk` (compact) beats `PerElement` (no compact) across the whole range, short *and* +//! 
long strings, because it pays FSST's slow decode-tail once instead of once per element; that's +//! why `Auto` compacts whenever the codes aren't contiguous. #![expect(clippy::unwrap_used)] diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 9368db52e17..c480b55829a 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -18,11 +18,20 @@ //! - [`PerElement`][FsstViewCompaction::PerElement] ("no compact"): decompress each element's //! slice directly into its place in the output. No copy, but one decompress call per element. //! -//! The compaction question, concretely: **compacting (`GatherBulk`) beats `PerElement` when the -//! strings are short and numerous** — per-call overhead then dominates `PerElement` while the -//! gather copy is cheap and unlocks bulk SIMD. **`PerElement` wins when the strings are long and -//! few** — the gather copy dominates and per-call overhead is negligible. Density decides whether -//! `Direct` is even available; average element size decides `GatherBulk` vs `PerElement`. +//! The compaction question, concretely. The `fsst_view_compute` benchmark (two ~2 MiB inputs, +//! ~12-byte and ~256-byte strings) shows **`GatherBulk` beats `PerElement` across the whole +//! tested range, for both short and long strings**. The reason: FSST's decoder has a fast 8-wide +//! body and a slow byte-by-byte tail. `PerElement` pays that tail *once per element* (N tails), +//! while `GatherBulk` decodes the whole heap in one call and pays the tail *once*. That saving +//! dominates the cost of the gather memcpy even at 256-byte strings (with ~8 K elements). For +//! example `take few_long/shuffle` canonicalizes in ~459 µs with `GatherBulk` vs ~623 µs with +//! `PerElement`. +//! +//! `PerElement` only wins in the opposite extreme — *very few, very long* strings — where N is +//! tiny (few tails saved) but the gather memcpy of the entire live heap is large. 
That regime is +//! outside what real string columns hit, so [`FsstViewCompaction::Auto`] never picks it: it uses +//! `Direct` when the live codes are still contiguous (untouched/sliced view) and `GatherBulk` +//! otherwise. `PerElement` is kept selectable so the trade-off stays measurable. use std::sync::Arc; @@ -48,7 +57,8 @@ use super::array::FSSTViewArraySlotsExt; /// See the [module docs][self] for the full trade-off analysis. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum FsstViewCompaction { - /// Pick a strategy automatically based on contiguity and average element size. + /// Pick a strategy automatically: `Direct` when the live codes are contiguous, else + /// `GatherBulk`. Never picks `PerElement` (see module docs). Auto, /// Bulk-decompress the contiguous live range with no copy. Falls back to `GatherBulk` if the /// view's codes are not contiguous and in order. @@ -59,10 +69,6 @@ pub enum FsstViewCompaction { PerElement, } -/// Average compressed bytes/element below which compaction (`GatherBulk`) is preferred over -/// `PerElement`. Heuristic; see module docs. Validated by the `fsst_view_compute` benchmark. -const SHORT_STRING_THRESHOLD: usize = 32; - pub(super) fn canonicalize_fsstview( array: ArrayView<'_, FSSTView>, ctx: &mut ExecutionCtx, @@ -103,13 +109,14 @@ pub fn canonicalize_fsstview_with( let contiguous = is_contiguous(&offsets, &sizes); let chosen = match strategy { + // Direct when the live codes are still one contiguous run, else compact-and-bulk. + // `GatherBulk` beats `PerElement` across the whole practical range (see module docs), so + // `Auto` never selects `PerElement`. FsstViewCompaction::Auto => { if contiguous { FsstViewCompaction::Direct - } else if !offsets.is_empty() && live / offsets.len() < SHORT_STRING_THRESHOLD { - FsstViewCompaction::GatherBulk } else { - FsstViewCompaction::PerElement + FsstViewCompaction::GatherBulk } } // `Direct` is only valid for a contiguous layout; fall back to a compacting decode. 
@@ -125,6 +132,7 @@ pub fn canonicalize_fsstview_with( FsstViewCompaction::GatherBulk => { decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size) } + // `Auto` is always resolved to a concrete strategy above. FsstViewCompaction::PerElement | FsstViewCompaction::Auto => { decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) } From 60c4e03fa00a3c754ca462b336b1a86a623d4a06 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 14:32:27 +0000 Subject: [PATCH 04/23] FSSTView: zero-copy from_fsst conversion + chain-ops benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fsstview_from_fsst now reuses the FSST offsets buffer for codes_offsets via a zero-copy slice of its first `len` elements, instead of re-copying into a new Vec. Only the derived sizes array is freshly allocated. - Add chain_pipeline_{fsst,view} benches: a 5-op alternating filter/take chain ending in a canonicalize. This is where the view model is meant to win — each fsst op re-compacts the byte heap (cost compounds with chain length), while the view converts once and chains metadata-only ops, deferring the single gather+decode to the end. 
Measured medians (100 samples): FewLong: fsst 765us -> view 481us (1.6x) ManyShort: fsst 14.49ms -> view 9.64ms (1.5x) Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 63 +++++++++++++++++++++ encodings/fsst/src/fsstview/array.rs | 19 ++++--- 2 files changed, 73 insertions(+), 9 deletions(-) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index 707340a6121..18b4a0894a2 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -379,6 +379,69 @@ fn combo_pipeline_view(bencher: Bencher, shape: Shape) { }); } +// =============================== CHAIN ========================================================= + +/// Number of ops in the chain benchmark. +const CHAIN_LEN: usize = 5; + +/// A chain of `CHAIN_LEN` alternating filter/take ops ending in a canonicalization. +/// +/// This is where the view model is meant to dominate: each fsst op re-compacts the byte heap, +/// so the cost compounds with chain length, whereas the view converts to offsets+sizes *once* +/// and every subsequent op is metadata-only, deferring the single gather+decode to the final +/// canonicalize. We keep every op only mildly selective (filter keeps 80%, take is a shuffle) +/// so there's still substantial data at the end — i.e. the heap rewrites the fsst path pays are +/// real work, not optimized away to nothing. 
+#[divan::bench(args = SHAPES)] +fn chain_pipeline_fsst(bencher: Bencher, shape: Shape) { + let varbin = generate(shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + bencher + .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, ctx)| { + let mut cur = (*fsst).clone(); + for op in 0..CHAIN_LEN { + if op % 2 == 0 { + let mask = make_mask(cur.len(), 0.80); + cur = fsst_filter(&cur, &mask, ctx); + } else { + let indices = make_indices(cur.len(), TakeKind::Shuffle); + cur = fsst_take(&cur, &indices, ctx); + } + } + black_box(fsst_to_canonical(&cur, ctx)) + }); +} + +#[divan::bench(args = SHAPES)] +fn chain_pipeline_view(bencher: Bencher, shape: Shape) { + let varbin = generate(shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + bencher + .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, ctx)| { + // Convert to the view once, then chain metadata-only ops, canonicalize once at the end. 
+ let mut cur = fsstview_from_fsst(fsst, ctx).unwrap(); + for op in 0..CHAIN_LEN { + let next = if op % 2 == 0 { + let mask = make_mask(cur.len(), 0.80); + ::filter(cur.as_view(), &mask, ctx) + .unwrap() + .unwrap() + } else { + let indices = make_indices(cur.len(), TakeKind::Shuffle); + ::take(cur.as_view(), &indices, ctx) + .unwrap() + .unwrap() + }; + cur = next.try_downcast::().ok().unwrap(); + } + black_box( + canonicalize_fsstview_with(cur.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), + ) + }); +} + // =============================== arg plumbing ================================================== #[derive(Clone, Copy)] diff --git a/encodings/fsst/src/fsstview/array.rs b/encodings/fsst/src/fsstview/array.rs index 81b83506aa3..594c371d3f3 100644 --- a/encodings/fsst/src/fsstview/array.rs +++ b/encodings/fsst/src/fsstview/array.rs @@ -29,6 +29,7 @@ use vortex_array::vtable::ValidityVTable; use vortex_array::vtable::child_to_validity; use vortex_array::vtable::validity_to_child; use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; @@ -178,25 +179,25 @@ impl FSSTView { /// Convert a plain [`FSSTArray`] into an [`FSSTViewArray`], sharing the symbol table and the /// compressed byte heap (zero-copy) and deriving `sizes[i] = offsets[i + 1] - offsets[i]`. +/// +/// The `offsets` (length `len + 1`) are reused for the view's `codes_offsets` by a zero-copy +/// slice of their first `len` elements; only the `sizes` array is freshly allocated. 
pub fn fsstview_from_fsst(fsst: &FSSTArray, ctx: &mut ExecutionCtx) -> VortexResult { let codes = fsst.codes(); let validity = codes.validity()?; let offsets = codes.offsets().clone().execute::(ctx)?; + let len = offsets.len().saturating_sub(1); - let (codes_offsets, codes_sizes) = match_each_integer_ptype!(offsets.ptype(), |O| { + let codes_sizes = match_each_integer_ptype!(offsets.ptype(), |O| { let offsets = offsets.as_slice::(); - let len = offsets.len().saturating_sub(1); - let mut starts = Vec::with_capacity(len); - let mut sizes = Vec::with_capacity(len); + let mut sizes = BufferMut::::with_capacity(len); for i in 0..len { - starts.push(offsets[i]); sizes.push(offsets[i + 1] - offsets[i]); } - ( - PrimitiveArray::from_iter(starts).into_array(), - PrimitiveArray::from_iter(sizes).into_array(), - ) + sizes.into_array() }); + // `codes_offsets` is the first `len` offsets — a zero-copy slice of the existing buffer. + let codes_offsets = offsets.into_array().slice(0..len)?; FSSTView::try_new( fsst.dtype().clone(), From 43e99faf148c3fdc1067f0172582bee211e7f307 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 14:59:42 +0000 Subject: [PATCH 05/23] FSSTView: add RunCoalesce ("export paired slices") canonicalization + finding Implements the "compact like a list / export paired slices into a VarBinView" idea: decode contiguous heap runs straight into a heap-ordered buffer and point VarBinView views back into it out of order, with no gather copy and duplicate dedup. Wired as FsstViewCompaction::RunCoalesce, hash-free (sort-based), handles nulls/empties/duplicates; covered by an adversarial gaps+shuffle+nullable test and the all-strategies-agree test. Benchmark verdict: it loses to GatherBulk everywhere, badly for short strings (take many_short/shuffle ~18ms vs ~5.6ms). 
The random access you avoid at decode time reappears at view-build time: views are built in element order over a heap-ordered output, so make_view does N cache-missing random reads (and random inlining copies for <=12-byte strings), plus an O(N log N) sort. GatherBulk's output is element-ordered, so its view-build is sequential; the cheap sequential gather memcpy beats the scattered view construction. So Auto keeps using Direct (contiguous) / GatherBulk (otherwise) and never picks RunCoalesce; it's retained as a selectable, measurable baseline. Docs updated with the full reasoning. Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 2 + encodings/fsst/src/fsstview/canonical.rs | 161 +++++++++++++++++--- encodings/fsst/src/fsstview/tests.rs | 58 ++++++- 3 files changed, 201 insertions(+), 20 deletions(-) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index 18b4a0894a2..fab3693eb22 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -155,6 +155,7 @@ fn compaction_name(strategy: FsstViewCompaction) -> &'static str { FsstViewCompaction::Direct => "direct", FsstViewCompaction::GatherBulk => "gather_bulk", FsstViewCompaction::PerElement => "per_element", + FsstViewCompaction::RunCoalesce => "run_coalesce", } } @@ -549,6 +550,7 @@ const COMPACTIONS: &[FsstViewCompaction] = &[ FsstViewCompaction::Auto, FsstViewCompaction::GatherBulk, FsstViewCompaction::PerElement, + FsstViewCompaction::RunCoalesce, ]; fn take_view_pipeline_args() -> Vec { diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index c480b55829a..91a2115936d 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -17,21 +17,29 @@ //! element boundaries. //! - [`PerElement`][FsstViewCompaction::PerElement] ("no compact"): decompress each element's //! 
slice directly into its place in the output. No copy, but one decompress call per element. +//! - [`RunCoalesce`][FsstViewCompaction::RunCoalesce] ("export paired slices"): decode contiguous +//! heap runs straight into a *heap-ordered* output and point `VarBinView`s back into it, out of +//! order — no gather copy, dedups duplicates. //! //! The compaction question, concretely. The `fsst_view_compute` benchmark (two ~2 MiB inputs, -//! ~12-byte and ~256-byte strings) shows **`GatherBulk` beats `PerElement` across the whole -//! tested range, for both short and long strings**. The reason: FSST's decoder has a fast 8-wide -//! body and a slow byte-by-byte tail. `PerElement` pays that tail *once per element* (N tails), -//! while `GatherBulk` decodes the whole heap in one call and pays the tail *once*. That saving -//! dominates the cost of the gather memcpy even at 256-byte strings (with ~8 K elements). For -//! example `take few_long/shuffle` canonicalizes in ~459 µs with `GatherBulk` vs ~623 µs with -//! `PerElement`. +//! ~12-byte and ~256-byte strings) shows **`GatherBulk` is the best non-contiguous strategy across +//! the whole tested range**, for both short and long strings. The reason is how FSST decode is +//! shaped: a fast 8-wide body and a slow byte-by-byte tail. `PerElement` pays that tail *once +//! per element* (N tails); `GatherBulk` decodes the whole heap in one call and pays it *once*, +//! which dominates the gather memcpy even at 256-byte strings. //! -//! `PerElement` only wins in the opposite extreme — *very few, very long* strings — where N is -//! tiny (few tails saved) but the gather memcpy of the entire live heap is large. That regime is -//! outside what real string columns hit, so [`FsstViewCompaction::Auto`] never picks it: it uses -//! `Direct` when the live codes are still contiguous (untouched/sliced view) and `GatherBulk` -//! otherwise. `PerElement` is kept selectable so the trade-off stays measurable. +//!
`RunCoalesce` was the appealing idea of skipping the gather entirely — decode runs in place and +//! let the `VarBinView` reference them out of order. It loses anyway, badly for short strings +//! (`take many_short/shuffle`: ~18 ms vs ~5.6 ms for `GatherBulk`). The reason is subtle: the +//! random access you avoid at *decode* time reappears at *view-build* time. Views are built in +//! element order, so over a heap-ordered output the per-element `make_view` does N cache-missing +//! random reads (and, for ≤12-byte strings, random-access *inlining* copies), plus an +//! O(N log N) sort. `GatherBulk`'s output is element-ordered, so its view-build is sequential. The +//! cheap sequential gather memcpy beats the expensive scattered view construction. +//! +//! So [`FsstViewCompaction::Auto`] uses `Direct` when the live codes are contiguous +//! (untouched/sliced view) and `GatherBulk` otherwise. `PerElement` and `RunCoalesce` are kept +//! selectable so the trade-off stays measurable, but `Auto` never picks them. use std::sync::Arc; @@ -42,9 +50,13 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::varbinview::BinaryView; use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; use vortex_array::arrays::varbinview::build_views::build_views; use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use vortex_error::VortexResult; @@ -58,7 +70,7 @@ use super::array::FSSTViewArraySlotsExt; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum FsstViewCompaction { /// Pick a strategy automatically: `Direct` when the live codes are contiguous, else - /// `GatherBulk`. Never picks `PerElement` (see module docs). + /// `GatherBulk`. Never picks `PerElement` or `RunCoalesce` (both lose; see module docs). 
Auto, /// Bulk-decompress the contiguous live range with no copy. Falls back to `GatherBulk` if the /// view's codes are not contiguous and in order. @@ -67,6 +79,16 @@ pub enum FsstViewCompaction { GatherBulk, /// Decompress each element's code slice directly into place, without compacting. PerElement, + /// Coalesce survivors into contiguous heap runs and decompress each run with a *single* call + /// directly into a heap-ordered output (no gather copy), emitting `VarBinView` views — possibly + /// out of order — that point back into it. Decodes distinct codes once (duplicates share a + /// view). + /// + /// This is the "export paired slices into a `VarBinView`" approach. In theory it skips the + /// gather copy entirely; in practice it loses to `GatherBulk` (see module docs) because the + /// random access just moves to view-build time, where it's more expensive. Retained for + /// measurement only — `Auto` never selects it. + RunCoalesce, } pub(super) fn canonicalize_fsstview( @@ -110,8 +132,8 @@ pub fn canonicalize_fsstview_with( let contiguous = is_contiguous(&offsets, &sizes); let chosen = match strategy { // Direct when the live codes are still one contiguous run, else compact-and-bulk. - // `GatherBulk` beats `PerElement` across the whole practical range (see module docs), so - // `Auto` never selects `PerElement`. + // `GatherBulk` beats both `PerElement` and `RunCoalesce` across the whole practical range + // (see module docs), so `Auto` picks neither. FsstViewCompaction::Auto => { if contiguous { FsstViewCompaction::Direct @@ -124,6 +146,23 @@ pub fn canonicalize_fsstview_with( other => other, }; + // RunCoalesce builds its own (buffers, views) — decompression order is decoupled from element + // order, so it can't go through `build_views` (which assumes element-order contiguous output). 
+ if chosen == FsstViewCompaction::RunCoalesce { + let (buffers, views) = + decompress_run_coalesce(&decompressor, heap, &offsets, &sizes, &ulens, total_size); + // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. + return Ok(unsafe { + VarBinViewArray::new_unchecked( + views, + Arc::from(buffers), + array.dtype().clone(), + array.fsstview_validity(), + ) + .into_array() + }); + } + let uncompressed = match chosen { FsstViewCompaction::Direct => { let start = offsets.first().copied().unwrap_or(0); @@ -132,10 +171,8 @@ pub fn canonicalize_fsstview_with( FsstViewCompaction::GatherBulk => { decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size) } - // `Auto` is always resolved to a concrete strategy above. - FsstViewCompaction::PerElement | FsstViewCompaction::Auto => { - decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) - } + // `Auto`/`RunCoalesce` are resolved above. + _ => decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size), }; let (buffers, views) = match_each_integer_ptype!(ulen_prim.ptype(), |P| { @@ -212,6 +249,92 @@ fn decompress_gather( out } +/// Coalesce survivors into contiguous heap runs, decompress each run once directly into the +/// output, and build `VarBinView`s (in element order) pointing back into that output. +/// +/// Distinct elements are keyed by their `(offset, size)` heap span: duplicates (from a `take` +/// with repeats) are decoded once and share a view. Adjacent distinct spans (`offset == prev end`) +/// are decompressed in a single FSST call, so a shuffle take of the whole array is one decode. +fn decompress_run_coalesce( + decompressor: &Decompressor<'_>, + heap: &[u8], + offsets: &[usize], + sizes: &[usize], + ulens: &[usize], + total_size: usize, +) -> (Vec, Buffer) { + let count = offsets.len(); + + // Visit elements in heap order. 
Sorting by `(offset, size)` groups duplicates (same span) + // together and, at a shared offset, orders the zero-size span (null/empty) before the + // non-zero one — keeping the run extension below well-defined. `size` is part of the key + // because a zero-size element shares an offset with its heap neighbour. + let mut order: Vec = (0..count).collect(); + order.sort_unstable_by_key(|&i| (offsets[i], sizes[i])); + + // Output position of each element's decoded bytes, filled below. + let mut out_pos = vec![0usize; count]; + let mut out = ByteBufferMut::with_capacity(total_size + 7); + let spare = out.spare_capacity_mut(); + + let mut written = 0usize; + let mut cursor = 0usize; + while cursor < count { + let head = order[cursor]; + // Zero-size spans (empty/null) decode to nothing; share the current position. + if sizes[head] == 0 { + out_pos[head] = written; + cursor += 1; + continue; + } + // Start a run at this span and extend it while the next *distinct* span is heap-adjacent. + // Duplicate spans (identical offset+size) reuse the position already assigned for the run. + let run_out_base = written; + let run_heap_start = offsets[head]; + let mut run_heap_end = run_heap_start; + let mut elem_out = written; + while cursor < count { + let elem = order[cursor]; + if sizes[elem] == 0 { + break; + } + if offsets[elem] == run_heap_end { + // A new distinct span that continues the run. + out_pos[elem] = elem_out; + elem_out += ulens[elem]; + run_heap_end += sizes[elem]; + cursor += 1; + } else if offsets[elem] < run_heap_end { + // A duplicate of a span already decoded in this run: reuse its position. Duplicates + // are contiguous in the sorted order, so the previous entry shares this span. + out_pos[elem] = out_pos[order[cursor - 1]]; + cursor += 1; + } else { + break; + } + } + // One decode for the whole run, straight into the output at `run_out_base`. 
+ decompressor.decompress_into( + &heap[run_heap_start..run_heap_end], + &mut spare[run_out_base..], + ); + written = elem_out; + } + unsafe { out.set_len(written) }; + let bytes = out.freeze(); + + // Build views in element order, each pointing at its decoded output position. + let mut views = BufferMut::::with_capacity(count); + for (i, &ulen) in ulens.iter().enumerate() { + let pos = out_pos[i]; + #[expect(clippy::cast_possible_truncation)] + let view = BinaryView::make_view(&bytes[pos..pos + ulen], 0, pos as u32); + views.push(view); + } + + (vec![bytes], views.freeze()) +} + /// Decompress each element's code slice directly into its place in the output (no compaction). fn decompress_per_element( decompressor: &Decompressor<'_>, diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index 31fe057bcf8..4418f90a249 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -9,6 +9,7 @@ use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::dict::TakeExecute; use vortex_array::assert_arrays_eq; use vortex_array::compute::conformance::consistency::test_array_consistency; use vortex_array::compute::conformance::filter::test_filter_conformance; @@ -235,13 +236,14 @@ fn fsst_take_to_view_matches_canonical() -> VortexResult<()> { Ok(()) } -/// All three explicit compaction strategies must produce identical canonical output, both for a +/// All explicit compaction strategies must produce identical canonical output, both for a /// contiguous (sliced) view and a scattered (taken) one. 
#[rstest] #[case(FsstViewCompaction::Auto)] #[case(FsstViewCompaction::Direct)] #[case(FsstViewCompaction::GatherBulk)] #[case(FsstViewCompaction::PerElement)] +#[case(FsstViewCompaction::RunCoalesce)] fn compaction_strategies_agree(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let fsst = make_fsst(&SAMPLE, Nullability::NonNullable, &mut ctx); @@ -271,3 +273,57 @@ fn compaction_strategies_agree(#[case] strategy: FsstViewCompaction) -> VortexRe assert_arrays_eq!(got, expected.into_array()); Ok(()) } + +/// Adversarial coverage for `RunCoalesce`: a filter that punches gaps into the heap (so survivors +/// form multiple runs), then a shuffle take (reorders runs), over nullable data. Every strategy +/// must still agree with the canonical VarBin result. +#[rstest] +#[case(FsstViewCompaction::Auto)] +#[case(FsstViewCompaction::GatherBulk)] +#[case(FsstViewCompaction::RunCoalesce)] +#[case(FsstViewCompaction::PerElement)] +fn run_coalesce_gaps_and_shuffle(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + // 12 distinct-ish strings, nullable. + let strings: Vec> = vec![ + Some("alpha"), + None, + Some("bravo bravo"), + Some("charlie"), + Some("delta delta delta"), + None, + Some("echo"), + Some("foxtrot foxtrot"), + Some("golf"), + Some("hotel hotel hotel"), + None, + Some("india"), + ]; + let fsst = make_fsst(&strings, Nullability::Nullable, &mut ctx); + + // Filter to keep a gapped subset (drops 1,2,5,8,10 -> remaining survivors aren't all adjacent). + let keep = [ + true, false, false, true, true, false, true, true, false, true, false, true, + ]; + let mask = Mask::from_iter(keep); + let filtered = fsst_filter_to_view(&fsst, &mask, &mut ctx)?; + + // Then a shuffle+dup take over the filtered length (7 survivors). 
+ let indices = PrimitiveArray::from_iter([6u64, 0, 3, 3, 5, 1, 2, 4]).into_array(); + let view = ::take(filtered.as_view(), &indices, &mut ctx)? + .unwrap() + .try_downcast::() + .ok() + .unwrap(); + + let got = canonicalize_fsstview_with(view.as_view(), strategy, &mut ctx)?; + + let expected = + VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(Nullability::Nullable)) + .into_array() + .filter(mask)? + .take(indices)? + .execute::(&mut ctx)?; + assert_arrays_eq!(got, expected.into_array()); + Ok(()) +} From 8c61b50280c15c0c087f709e1ba25dad08451c0f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 15:09:35 +0000 Subject: [PATCH 06/23] FSSTView: skip fill_null in take for non-nullable indices; op-only benches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Callgrind on a shuffle take showed the kernel's cost was dominated by running take -> fill_null -> cast -> optimize three times (offsets/sizes/lengths). The fill_null + cast is only needed when a null index could introduce a null, i.e. when the indices are nullable. For non-nullable indices (the common case) the children stay non-nullable, so we now skip fill_null entirely. Re-profiling confirms fill_null (~450K ir) and its cast (~252K ir) drop out and the take kernel falls from ~612K to ~474K instructions per call. Also add take_op_only_view / filter_op_only_view benches that hoist the one-time FSST->view conversion out of the timed loop, isolating the metadata-only op. These show the op is constant-time regardless of size or selectivity (~457 ns filter, ~657 ns take), like a ListView op — the earlier "view loses on selective" was purely the O(n) conversion being charged to every op, which only the first op of a chain actually pays. 
Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 39 +++++++++++++++++++++ encodings/fsst/src/fsstview/compute.rs | 34 ++++++++++-------- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index fab3693eb22..09eef83ae27 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -251,6 +251,24 @@ fn filter_step_view(bencher: Bencher, args: FilterArg) { .bench_refs(|(fsst, mask, ctx)| black_box(view_filter(fsst, mask, ctx))); } +/// Metadata-only filter measured in isolation (conversion hoisted out). See `take_op_only_view`. +#[divan::bench(args = filter_args())] +fn filter_op_only_view(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let view = fsstview_from_fsst(&fsst, &mut LEGACY_SESSION.create_execution_ctx()).unwrap(); + let mask = make_mask(view.len(), args.keep); + bencher + .with_inputs(|| (&view, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(view, mask, ctx)| { + black_box( + ::filter(view.as_view(), mask, ctx) + .unwrap() + .unwrap(), + ) + }); +} + /// Full pipeline: filter (compacting into another FSSTArray) then canonicalize to VarBinView. #[divan::bench(args = filter_args())] fn filter_pipeline_fsst(bencher: Bencher, args: FilterArg) { @@ -306,6 +324,27 @@ fn take_step_view(bencher: Bencher, args: TakeArg) { .bench_refs(|(fsst, indices, ctx)| black_box(view_take(fsst, indices, ctx))); } +/// The metadata-only take measured *in isolation*: the FSST→view conversion is hoisted out of the +/// timed loop (a chain converts once), so this is the apples-to-apples "is the view op itself as +/// cheap as a ListView op" comparison. The `*_step_view` bench above instead folds the one-time +/// conversion into every op, which only the first op of a chain actually pays. 
+#[divan::bench(args = take_args())] +fn take_op_only_view(bencher: Bencher, args: TakeArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let view = fsstview_from_fsst(&fsst, &mut LEGACY_SESSION.create_execution_ctx()).unwrap(); + let indices = make_indices(view.len(), args.kind); + bencher + .with_inputs(|| (&view, &indices, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(view, indices, ctx)| { + black_box( + ::take(view.as_view(), indices, ctx) + .unwrap() + .unwrap(), + ) + }); +} + #[divan::bench(args = take_args())] fn take_pipeline_fsst(bencher: Bencher, args: TakeArg) { let varbin = generate(args.shape); diff --git a/encodings/fsst/src/fsstview/compute.rs b/encodings/fsst/src/fsstview/compute.rs index d964ba53df2..5e3dcc2395e 100644 --- a/encodings/fsst/src/fsstview/compute.rs +++ b/encodings/fsst/src/fsstview/compute.rs @@ -68,21 +68,14 @@ impl TakeExecute for FSSTView { let validity = array.fsstview_validity().take(indices)?; - // `take` may reorder, duplicate, or skip elements, which is fine for `FSSTView` since - // offsets need not be monotonic. `take` yields nullable arrays (null index -> null), - // so we fill nulls with zero and rely on `validity` to track nullness. - let codes_offsets = array - .codes_offsets() - .take(indices.clone())? - .fill_null(Scalar::zero_value(array.codes_offsets().dtype()))?; - let codes_sizes = array - .codes_sizes() - .take(indices.clone())? - .fill_null(Scalar::zero_value(array.codes_sizes().dtype()))?; - let uncompressed_lengths = array - .uncompressed_lengths() - .take(indices.clone())? - .fill_null(Scalar::zero_value(array.uncompressed_lengths().dtype()))?; + // `take` of a non-nullable child with non-nullable indices stays non-nullable, so the + // `fill_null` (and the `cast`/`optimize` it pulls in) is pure overhead in the common case. 
+ // Only when the indices are nullable can a null index introduce a null we must fill with + // zero — nullness itself is tracked separately by `validity`. + let fill = indices.dtype().is_nullable(); + let codes_offsets = take_child(array.codes_offsets(), indices, fill)?; + let codes_sizes = take_child(array.codes_sizes(), indices, fill)?; + let uncompressed_lengths = take_child(array.uncompressed_lengths(), indices, fill)?; // SAFETY: take preserves all `FSSTView` invariants (see `filter`). Ok(Some( @@ -102,3 +95,14 @@ impl TakeExecute for FSSTView { )) } } + +/// Take a non-nullable integer child by `indices`, only filling nulls with zero when the indices +/// are nullable (and so could have introduced nulls). The child is always non-nullable on input. +fn take_child(child: &ArrayRef, indices: &ArrayRef, fill: bool) -> VortexResult { + let taken = child.take(indices.clone())?; + if fill { + taken.fill_null(Scalar::zero_value(child.dtype())) + } else { + Ok(taken) + } +} From 033112ee5098caf95ba89f156a5b7a92fc33ebb6 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 15:13:38 +0000 Subject: [PATCH 07/23] FSSTView: byte accounting (compressed + uncompressed) for the gap-merge idea Adds FsstViewByteStats / fsstview_byte_stats reporting, in both compressed (code) and uncompressed (decoded) space: live vs run-spanned vs whole-heap bytes, distinct spans, run count, and the dead-byte waste a gap-merged decode would carry. A byte_stats_report test prints it for a selective filter and a shuffle take (run with --nocapture). 
This quantifies why merging across gaps to keep decode runs long doesn't pay: filter_10pct (keep ~10% of 65536): runs=5945 over 6616 survivors (avg ~1.1 elem/run -> survivors are isolated) compressed: live=25.8KB, heap=255KB full-heap-merge waste = 89.9% (you'd decode ~10x the needed compressed bytes) shuffle_take (reorder all): runs=1, waste=0% (RunCoalesce's ideal) -- yet it still loses on time to GatherBulk because the random access just moves to view-build. So the dead-value budget the gap-merge idea needs is blown immediately on a selective filter (90% dead), and on the one input where merging is free (shuffle, 0% dead) GatherBulk still wins. There's also a hard blocker: after a filter the dead elements' uncompressed_lengths are gone and FSST decode only returns a total written count, so a single gap-merged decode can't even locate post-gap survivors. Conclusion: GatherBulk (zero waste) / Direct (contiguous) remain the right canonicalization; the stats make the trade-off measurable. Signed-off-by: Joe Isaacs --- encodings/fsst/src/fsstview/canonical.rs | 121 +++++++++++++++++++++++ encodings/fsst/src/fsstview/mod.rs | 2 + encodings/fsst/src/fsstview/tests.rs | 103 +++++++++++++++++++ 3 files changed, 226 insertions(+) diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 91a2115936d..a6f8685bc04 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -98,6 +98,127 @@ pub(super) fn canonicalize_fsstview( canonicalize_fsstview_with(array, FsstViewCompaction::Auto, ctx) } +/// Byte accounting for an [`FSSTView`], in **both compressed (code) space and uncompressed +/// (decoded) space**, for reasoning about gather/coalesce trade-offs and dead-byte waste. +/// +/// All figures are in bytes. 
The "span" figures describe what a *gap-merged* decode (decoding each +/// run's full heap extent, dead bytes included) would touch; the difference from the live figures +/// is the waste such a strategy would carry. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct FsstViewByteStats { + /// Number of (logical) elements in the view. + pub elements: usize, + /// Distinct heap runs the live elements form (maximal heap-adjacent groups of distinct spans). + pub runs: usize, + /// Distinct (deduplicated) live code spans referenced by the view. + pub distinct_spans: usize, + /// Compressed bytes the live distinct spans occupy (what `GatherBulk` copies / decodes). + pub live_compressed: usize, + /// Compressed bytes spanned by the runs *including* dead gaps between survivors (what a + /// gap-merged decode would feed the decoder). `span_compressed - live_compressed` is the + /// compressed waste of merging across gaps. + pub span_compressed: usize, + /// Uncompressed bytes the live elements decode to (the canonical output size; deduped spans + /// counted once). + pub live_uncompressed: usize, + /// Total uncompressed output size with duplicates expanded (the `VarBinView`'s logical size). + pub logical_uncompressed: usize, + /// Total compressed heap size backing the view (the original, shared code buffer). + pub heap_compressed: usize, +} + +impl FsstViewByteStats { + /// Fraction of the spanned compressed bytes that are dead (would be wasted by a gap-merged + /// decode). `0.0` means the live spans are perfectly contiguous within each run. + pub fn compressed_waste_ratio(&self) -> f64 { + if self.span_compressed == 0 { + 0.0 + } else { + (self.span_compressed - self.live_compressed) as f64 / self.span_compressed as f64 + } + } +} + +/// Compute [`FsstViewByteStats`] for a view (diagnostics; not on the hot path). 
+pub fn fsstview_byte_stats( + array: ArrayView<'_, FSSTView>, + ctx: &mut ExecutionCtx, +) -> VortexResult<FsstViewByteStats> { + let offsets = load_usize(array.codes_offsets(), ctx)?; + let sizes = load_usize(array.codes_sizes(), ctx)?; + let ulen_prim = array + .uncompressed_lengths() + .clone() + .execute::<PrimitiveArray>(ctx)?; + #[expect(clippy::cast_possible_truncation)] + let ulens: Vec<usize> = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + ulen_prim + .as_slice::<P>() + .iter() + .map(|x| *x as usize) + .collect() + }); + + let elements = offsets.len(); + let logical_uncompressed: usize = ulens.iter().sum(); + let heap_compressed = array.codes_bytes().len(); + + // Walk distinct spans in heap order, accumulating live/run/span figures. + let mut order: Vec<usize> = (0..elements).filter(|&i| sizes[i] > 0).collect(); + order.sort_unstable_by_key(|&i| (offsets[i], sizes[i])); + + let mut runs = 0usize; + let mut distinct_spans = 0usize; + let mut live_compressed = 0usize; + let mut live_uncompressed = 0usize; + let mut span_compressed = 0usize; + let mut run_end: Option<usize> = None; + let mut run_start = 0usize; + let mut prev_span: Option<(usize, usize)> = None; + for &i in &order { + let span = (offsets[i], sizes[i]); + let is_dup = prev_span == Some(span); + prev_span = Some(span); + if is_dup { + continue; // duplicate of the previous distinct span + } + distinct_spans += 1; + live_compressed += sizes[i]; + live_uncompressed += ulens[i]; + match run_end { + Some(end) if offsets[i] == end => { + run_end = Some(end + sizes[i]); + } + Some(end) => { + // Close the previous run, open a new one. + span_compressed += end - run_start; + runs += 1; + run_start = offsets[i]; + run_end = Some(offsets[i] + sizes[i]); + } + None => { + run_start = offsets[i]; + run_end = Some(offsets[i] + sizes[i]); + } + } + } + if let Some(end) = run_end { + span_compressed += end - run_start; + runs += 1; + } + + Ok(FsstViewByteStats { + elements, + runs, + distinct_spans, + live_compressed, + span_compressed, + live_uncompressed, + logical_uncompressed, + heap_compressed, + }) +} + /// Canonicalize an [`FSSTView`] to a [`VarBinViewArray`] using an explicit compaction strategy.
/// /// Exposed (rather than only the dispatch-driven [`canonicalize_fsstview`]) so benchmarks can diff --git a/encodings/fsst/src/fsstview/mod.rs b/encodings/fsst/src/fsstview/mod.rs index 8efbb50b7fa..f712018ac3f 100644 --- a/encodings/fsst/src/fsstview/mod.rs +++ b/encodings/fsst/src/fsstview/mod.rs @@ -28,7 +28,9 @@ mod slice; mod tests; pub use array::*; +pub use canonical::FsstViewByteStats; pub use canonical::FsstViewCompaction; pub use canonical::canonicalize_fsstview_with; +pub use canonical::fsstview_byte_stats; pub use from_fsst::fsst_filter_to_view; pub use from_fsst::fsst_take_to_view; diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index 4418f90a249..804d44b8640 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -10,6 +10,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::dict::TakeExecute; +use vortex_array::arrays::filter::FilterKernel; use vortex_array::assert_arrays_eq; use vortex_array::compute::conformance::consistency::test_array_consistency; use vortex_array::compute::conformance::filter::test_filter_conformance; @@ -28,6 +29,7 @@ use crate::fsst_compress; use crate::fsst_filter_to_view; use crate::fsst_take_to_view; use crate::fsst_train_compressor; +use crate::fsstview_byte_stats; use crate::fsstview_from_fsst; fn make_fsstview( @@ -327,3 +329,104 @@ fn run_coalesce_gaps_and_shuffle(#[case] strategy: FsstViewCompaction) -> Vortex assert_arrays_eq!(got, expected.into_array()); Ok(()) } + +/// Build a ~`target`-uncompressed-byte FSSTView of random short URL-ish strings. 
+fn make_big_view(target: usize, avg_len: usize, ctx: &mut ExecutionCtx) -> FSSTViewArray { + use rand::RngExt; + use rand::SeedableRng; + use rand::rngs::StdRng; + let mut rng = StdRng::seed_from_u64(1); + let words = [ + "https://", "example", "vortex", ".com/", "path", "value", "data", "alpha", + ]; + let count = target / avg_len; + let strings: Vec> = (0..count) + .map(|_| { + let mut s = String::new(); + while s.len() < avg_len { + s.push_str(words[rng.random_range(0..words.len())]); + } + s.truncate(avg_len); + s.into_bytes().into_boxed_slice() + }) + .collect(); + let varbin = VarBinArray::from_iter( + strings.into_iter().map(Some), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + let fsst = fsst_compress(&varbin, varbin.len(), varbin.dtype(), &compressor, ctx); + fsstview_from_fsst(&fsst, ctx).expect("fsstview_from_fsst") +} + +/// Reports the byte accounting (compressed and uncompressed) and the dead-byte waste a gap-merged +/// decode would carry, for representative selective filter / shuffle take / dense take. Run with +/// `cargo test -p vortex-fsst byte_stats_report -- --nocapture` to see the numbers. +#[test] +fn byte_stats_report() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let base = make_big_view(1 << 20, 16, &mut ctx); + let n = base.len(); + + // Selective filter (keep ~10%): many small gaps -> high compressed waste if merged. + let mut rng_keep = { + use rand::SeedableRng; + rand::rngs::StdRng::seed_from_u64(3) + }; + let mask = { + use rand::RngExt; + Mask::from_iter((0..n).map(|_| rng_keep.random_bool(0.10))) + }; + let filtered = ::filter(base.as_view(), &mask, &mut ctx)? + .unwrap() + .try_downcast::() + .ok() + .unwrap(); + + // Shuffle take: same elements, reordered -> one run, zero waste. 
+ let mut perm: Vec = (0..n as u64).collect(); + { + use rand::RngExt; + use rand::SeedableRng; + let mut r = rand::rngs::StdRng::seed_from_u64(4); + for i in (1..perm.len()).rev() { + perm.swap(i, r.random_range(0..=i)); + } + } + let shuffled = ::take( + base.as_view(), + &PrimitiveArray::from_iter(perm).into_array(), + &mut ctx, + )? + .unwrap() + .try_downcast::() + .ok() + .unwrap(); + + for (label, view) in [("filter_10pct", &filtered), ("shuffle_take", &shuffled)] { + let s = fsstview_byte_stats(view.as_view(), &mut ctx)?; + // Waste if we instead merged *everything* into a single decode of the whole heap extent + // (the most aggressive gap-merge): all heap bytes minus the live ones are dead. + let full_merge_waste = if s.heap_compressed == 0 { + 0.0 + } else { + (s.heap_compressed - s.live_compressed) as f64 / s.heap_compressed as f64 + }; + println!( + "{label}: elements={} runs={} distinct={} \ + | compressed: live={}B span={}B heap={}B run_waste={:.1}% full_merge_waste={:.1}% \ + | uncompressed: live={}B logical={}B", + s.elements, + s.runs, + s.distinct_spans, + s.live_compressed, + s.span_compressed, + s.heap_compressed, + s.compressed_waste_ratio() * 100.0, + full_merge_waste * 100.0, + s.live_uncompressed, + s.logical_uncompressed, + ); + } + Ok(()) +} From cd885337072128250de4d1f4eb9bab6399bccebc Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 15:36:45 +0000 Subject: [PATCH 08/23] FSSTView: coalesce adjacent spans in the gather (compaction at export) The only overhead GatherBulk carries over the theoretical minimum is the gather memcpy, and it was copying every element's span individually. For an order-preserving filter, surviving neighbours are still heap-adjacent, so a run of k survivors can be copied in one memcpy instead of k. The gather now accumulates a contiguous [run_start, run_end) heap range and flushes it once per run, making the copy cost proportional to the number of runs rather than the number of elements. 
This is a strict win where survivors form long runs (non-selective filter: many_short/nonselective canonicalize ~5.38ms -> ~4.75ms) and a no-op for a shuffle (no adjacency -> one copy per element as before, behind a cheap branch). Combined with Direct (single contiguous run, zero copy), the export is now optimal: gather work scales with run count, then one bulk decode, then a sequential element-ordered view-build. Correctness: spans are still emitted in element order, so the decoded buffer stays element-ordered; coalescing only fires on genuine zero-gap adjacency. Covered by the existing all-strategies-agree and gaps+shuffle+nullable tests. Signed-off-by: Joe Isaacs --- encodings/fsst/src/fsstview/canonical.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index a6f8685bc04..ee85aa6917c 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -352,6 +352,11 @@ fn decompress_direct( } /// Compact the scattered live codes into a contiguous buffer, then a single bulk decompress. +/// +/// The gather coalesces consecutive heap-adjacent spans into a single `extend_from_slice`: for an +/// order-preserving `filter`, surviving neighbours are still contiguous in the heap, so a run of +/// `k` survivors is copied in one memcpy instead of `k`. This collapses the per-span copy overhead +/// (which dominates for short codes) to per-run, while a shuffle (no adjacency) is unaffected. fn decompress_gather( decompressor: &Decompressor<'_>, heap: &[u8], @@ -361,8 +366,25 @@ fn decompress_gather( total_size: usize, ) -> ByteBufferMut { let mut compressed = ByteBufferMut::with_capacity(live); + // Accumulate a contiguous `[run_start, run_end)` heap range and flush it as one copy. 
+ let mut run_start = 0usize; + let mut run_end = 0usize; for (&offset, &size) in offsets.iter().zip(sizes) { - compressed.extend_from_slice(&heap[offset..offset + size]); + if size == 0 { + continue; + } + if offset == run_end && run_end != run_start { + run_end += size; // extend the current run (heap-adjacent) + } else { + if run_end != run_start { + compressed.extend_from_slice(&heap[run_start..run_end]); + } + run_start = offset; + run_end = offset + size; + } + } + if run_end != run_start { + compressed.extend_from_slice(&heap[run_start..run_end]); } let mut out = ByteBufferMut::with_capacity(total_size + 7); let written = decompressor.decompress_into(compressed.as_slice(), out.spare_capacity_mut()); From 0d1232fc88f1642f54740079f509f63465a37618 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 16:48:39 +0000 Subject: [PATCH 09/23] FSSTView: VarBin export + single-filter 2x2 export benchmark Add canonicalize_fsstview_to_varbin: reuses the element-ordered decode path of the VarBinView canonicalizer (Direct/GatherBulk/PerElement), but the finisher builds len+1 cumulative offsets over the contiguous decoded bytes instead of a 16-byte view per element. Covered by varbin_export_matches_canonical across all element-ordered strategies on gapped nullable data. Add export_{fsst,view}_to_{varbin,varbinview} benches: a single filter then export, the full {fsst, fsstview} x {VarBin, VarBinView} matrix. Medians (100 samples), single filter + export: many_short (174k x ~12B): fsst->VBV fsst->VB view->VBV view->VB nonselective 90% 3.11ms 2.63ms 4.75ms 2.64ms selective 10% 561us 534us 1.02ms 728us few_long (8k x ~256B): nonselective 90% 467us 328us 472us 303us selective 10% 73us 55us 77us 72us Takeaways: - VarBin export is consistently cheaper than VarBinView for both encodings: no per-element 16-byte view construction, just an offsets cumsum. Biggest gap on many short strings (view->VB 2.64ms vs view->VBV 4.75ms, ~1.8x). 
- For a single filter, fsst is competitive with or ahead of fsstview: fsst pays the heap rewrite once during the (cheap-when-selective) filter, while fsstview pays a gather at export plus the one-time FSST->view conversion. The view's advantage is in chains, where the per-op heap rewrite is amortized away (see the chain_pipeline benches). Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 81 +++++++++++++++++++++ encodings/fsst/src/fsstview/canonical.rs | 76 +++++++++++++++++++ encodings/fsst/src/fsstview/mod.rs | 1 + encodings/fsst/src/fsstview/tests.rs | 37 ++++++++++ 4 files changed, 195 insertions(+) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index 09eef83ae27..545daea37f9 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -47,7 +47,9 @@ use vortex_array::dtype::Nullability; use vortex_fsst::FSST; use vortex_fsst::FSSTArray; use vortex_fsst::FSSTView; +use vortex_fsst::FSSTViewArray; use vortex_fsst::FsstViewCompaction; +use vortex_fsst::canonicalize_fsstview_to_varbin; use vortex_fsst::canonicalize_fsstview_with; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; @@ -482,6 +484,85 @@ fn chain_pipeline_view(bencher: Bencher, shape: Shape) { }); } +// =============================== SINGLE FILTER + EXPORT (2x2) ================================== +// +// A single filter, then export to a canonical string array. The matrix is +// {fsst, fsstview} x {VarBinView, VarBin}: +// - fsst path: filter rewrites the compressed heap (VarBin filter on codes), then decode. +// - fsstview path: filter is metadata-only, then decode (coalesced gather + bulk) at export. +// - VarBinView export: build a 16-byte view per element. +// - VarBin export: build len+1 cumulative offsets over the contiguous decoded bytes. 
+ +fn export_view(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> FSSTViewArray { + view_filter(array, mask, ctx) + .try_downcast::() + .ok() + .unwrap() +} + +#[divan::bench(args = filter_args())] +fn export_fsst_to_varbinview(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let filtered = fsst_filter(fsst, mask, ctx); + black_box(fsst_to_canonical(&filtered, ctx)) + }); +} + +#[divan::bench(args = filter_args())] +fn export_fsst_to_varbin(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + // Filter stays in FSST; reach VarBin by reinterpreting the (now contiguous) codes as a + // view and exporting offsets+bytes. 
+ let filtered = fsst_filter(fsst, mask, ctx); + let view = fsstview_from_fsst(&filtered, ctx).unwrap(); + black_box( + canonicalize_fsstview_to_varbin(view.as_view(), FsstViewCompaction::Auto, ctx) + .unwrap(), + ) + }); +} + +#[divan::bench(args = filter_args())] +fn export_view_to_varbinview(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let view = export_view(fsst, mask, ctx); + black_box( + canonicalize_fsstview_with(view.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), + ) + }); +} + +#[divan::bench(args = filter_args())] +fn export_view_to_varbin(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let view = export_view(fsst, mask, ctx); + black_box( + canonicalize_fsstview_to_varbin(view.as_view(), FsstViewCompaction::Auto, ctx) + .unwrap(), + ) + }); +} + // =============================== arg plumbing ================================================== #[derive(Clone, Copy)] diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index ee85aa6917c..40f37f1c580 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -49,10 +49,12 @@ use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::varbinview::BinaryView; use 
vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; use vortex_array::arrays::varbinview::build_views::build_views; +use vortex_array::buffer::BufferHandle; use vortex_array::match_each_integer_ptype; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; @@ -312,6 +314,80 @@ pub fn canonicalize_fsstview_with( }) } +/// Canonicalize an [`FSSTView`] to a [`VarBinArray`] (offsets + contiguous bytes) instead of a +/// [`VarBinViewArray`]. +/// +/// Shares the decode path with [`canonicalize_fsstview_with`]: the strategies that produce an +/// element-ordered output (`Direct`/`GatherBulk`/`PerElement`) are reused as-is; the only +/// difference is the finisher, which builds `len + 1` cumulative offsets from the uncompressed +/// lengths rather than per-element views. `RunCoalesce` is not applicable (its output is heap- +/// ordered, not element-ordered) and is treated as `GatherBulk`. +/// +/// Exposed for benchmarking the export target (VarBin vs VarBinView). `Auto` resolves to `Direct` +/// when contiguous, else `GatherBulk`. +pub fn canonicalize_fsstview_to_varbin( + array: ArrayView<'_, FSSTView>, + strategy: FsstViewCompaction, + ctx: &mut ExecutionCtx, +) -> VortexResult<ArrayRef> { + let offsets = load_usize(array.codes_offsets(), ctx)?; + let sizes = load_usize(array.codes_sizes(), ctx)?; + + let ulen_prim = array + .uncompressed_lengths() + .clone() + .execute::<PrimitiveArray>(ctx)?; + #[expect(clippy::cast_possible_truncation)] + let ulens: Vec<usize> = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + ulen_prim + .as_slice::<P>() + .iter() + .map(|x| *x as usize) + .collect() + }); + let total_size: usize = ulens.iter().sum(); + let live: usize = sizes.iter().sum(); + + let heap_buffer = array.codes_bytes(); + let heap = heap_buffer.as_slice(); + let decompressor = array.decompressor(); + + let contiguous = is_contiguous(&offsets, &sizes); + let uncompressed = match strategy { + FsstViewCompaction::PerElement => { + decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) + } + // Direct (or Auto) on a contiguous layout decodes the live range in place, no gather. + FsstViewCompaction::Direct | FsstViewCompaction::Auto if contiguous => { + let start = offsets.first().copied().unwrap_or(0); + decompress_direct(&decompressor, heap, start, live, total_size) + } + // Everything else uses the element-ordered (coalesced) gather + one bulk decode. + _ => decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size), + }; + + // Build `len + 1` cumulative offsets from the uncompressed lengths. + let mut varbin_offsets = BufferMut::<i64>::with_capacity(ulens.len() + 1); + let mut acc = 0i64; + varbin_offsets.push(acc); + for &ulen in &ulens { + acc += ulen as i64; + varbin_offsets.push(acc); + } + + let bytes = BufferHandle::new_host(uncompressed.freeze()); + // SAFETY: offsets are monotonic and end at the byte length; bytes are valid binary/UTF-8.
+ Ok(unsafe { + VarBinArray::new_unchecked_from_handle( + varbin_offsets.into_array(), + bytes, + array.dtype().clone(), + array.fsstview_validity(), + ) + .into_array() + }) +} + fn load_usize(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult> { let prim = array.clone().execute::(ctx)?; #[expect(clippy::cast_possible_truncation)] diff --git a/encodings/fsst/src/fsstview/mod.rs b/encodings/fsst/src/fsstview/mod.rs index f712018ac3f..c7529359bbd 100644 --- a/encodings/fsst/src/fsstview/mod.rs +++ b/encodings/fsst/src/fsstview/mod.rs @@ -30,6 +30,7 @@ mod tests; pub use array::*; pub use canonical::FsstViewByteStats; pub use canonical::FsstViewCompaction; +pub use canonical::canonicalize_fsstview_to_varbin; pub use canonical::canonicalize_fsstview_with; pub use canonical::fsstview_byte_stats; pub use from_fsst::fsst_filter_to_view; diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index 804d44b8640..2846909451a 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -24,6 +24,7 @@ use crate::FSSTArray; use crate::FSSTView; use crate::FSSTViewArray; use crate::FsstViewCompaction; +use crate::canonicalize_fsstview_to_varbin; use crate::canonicalize_fsstview_with; use crate::fsst_compress; use crate::fsst_filter_to_view; @@ -430,3 +431,39 @@ fn byte_stats_report() -> VortexResult<()> { } Ok(()) } + +/// The VarBin exporter must agree with the canonical VarBin filter, across all element-ordered +/// strategies, for a gapped filter over nullable data. 
+#[rstest] +#[case(FsstViewCompaction::Auto)] +#[case(FsstViewCompaction::GatherBulk)] +#[case(FsstViewCompaction::PerElement)] +fn varbin_export_matches_canonical(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let strings: Vec> = vec![ + Some("alpha"), + None, + Some("bravo bravo"), + Some("charlie"), + Some("delta delta delta"), + None, + Some("echo"), + Some("foxtrot foxtrot"), + ]; + let fsst = make_fsst(&strings, Nullability::Nullable, &mut ctx); + let keep = [true, false, true, true, false, false, true, true]; + let mask = Mask::from_iter(keep); + let view = fsst_filter_to_view(&fsst, &mask, &mut ctx)?; + + let got = canonicalize_fsstview_to_varbin(view.as_view(), strategy, &mut ctx)?; + // Compare as VarBinView so the offsets-vs-views layout difference doesn't matter. + let got_view = got.execute::(&mut ctx)?; + + let expected = + VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(Nullability::Nullable)) + .into_array() + .filter(mask)? + .execute::(&mut ctx)?; + assert_arrays_eq!(got_view.into_array(), expected.into_array()); + Ok(()) +} From 694b2513ff95d1de258b9d4d9d698922cf3ecd5a Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 17:06:08 +0000 Subject: [PATCH 10/23] FSSTView: bench the VarBin -> VarBinView conversion cost after export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds convert_varbin_to_varbinview: takes the VarBin produced by the view->VarBin export and times converting it into a VarBinView, isolated (conversion only). Medians (100 samples): many_short (174k x ~12B): nonsel 2.29ms selective 203us few_long (8k x ~256B): nonsel 197us selective 12us This answers "is decode-to-VarBin-then-convert cheaper than decode-straight-to- VarBinView?" 
when the consumer wants a view: many_short/nonsel, to reach a VarBinView: view->VarBin (3.42ms) + convert (2.29ms) = 5.71ms view->VarBinView (direct) = 4.92ms Going via VarBin is worse (5.71ms vs 4.92ms): the conversion adds back the per-element view construction you skipped, plus a full re-decode/copy of the bytes. So VarBin export is only the right target when the consumer actually wants offsets+bytes (VarBin); if the result must be a VarBinView, decode straight to it. Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 26 +++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index 545daea37f9..4a1ddb33ca1 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -563,6 +563,32 @@ fn export_view_to_varbin(bencher: Bencher, args: FilterArg) { }); } +/// Cost of converting the VarBin produced by `view->VarBin` *into* a VarBinView, isolated. Add this +/// to `export_view_to_varbin` to compare against `export_view_to_varbinview` (going straight to a +/// view): is "decode to VarBin, then convert" cheaper than "decode straight to VarBinView"? +#[divan::bench(args = filter_args())] +fn convert_varbin_to_varbinview(bencher: Bencher, args: FilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = make_mask(fsst.len(), args.keep); + // Pre-build the VarBin (the `view->VarBin` export output) outside the timed loop. 
+ let view = { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + export_view(&fsst, &mask, &mut ctx) + }; + let vbin = canonicalize_fsstview_to_varbin( + view.as_view(), + FsstViewCompaction::Auto, + &mut LEGACY_SESSION.create_execution_ctx(), + ) + .unwrap(); + bencher + .with_inputs(|| (&vbin, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(vbin, ctx)| { + black_box((*vbin).clone().execute::(ctx).unwrap()) + }); +} + // =============================== arg plumbing ================================================== #[derive(Clone, Copy)] From 30a8f935ad4d46b4957960e959fa5f23f808a0ca Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 20:51:54 +0000 Subject: [PATCH 11/23] FSSTView: database-style benches + RunDecode export ("export all in place") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two parts, driven by what realistic query shapes revealed. Benches: real query masks are rarely uniform-random. Add Selectivity shapes (uniform / range-scan / clustered bursts) and a sorted-index "index lookup" take, plus a canon_only bench that isolates the export decode by strategy. These showed that *run length* (set by the selection shape), not raw selectivity, drives the view's export cost — and that uniform-random was the view's worst case all along. Optimization: callgrind showed fsstview_from_fsst is ~21% of a single-op filter->VarBinView (it derives sizes for all n elements before the filter discards most). Rather than fuse convert+filter (which would break composition across a chain of filters/takes), the new RunDecode export strategy attacks the gather instead: when survivors are monotonic (after any filter / sorted take / slice), decode each contiguous heap run directly into the element-ordered output with NO gather copy. Output stays element-ordered so the view-build is sequential (unlike RunCoalesce). 
Auto now chooses "export all in place" (RunDecode) vs "compact codes then export" (GatherBulk) by run count: runs <= len/4 and monotonic -> RunDecode, else GatherBulk. canon_only medians (export decode only): many_short clustered: RunDecode 313us vs GatherBulk 333us (Auto -> 313us) many_short range: RunDecode 345us vs GatherBulk 370us (Auto -> 333us) many_short uniform: GatherBulk 561us vs RunDecode 657us (Auto -> 563us) Auto picks the winner on every shape. The conversion and metadata-only filter/take stay separate, so chains still compose; only the final canonicalize compacts or not. Covered by run_decode_monotonic_filter (nulls/empties/multi-run/trailing-run) and the existing all-strategies-agree tests; 111 tests pass. Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 229 ++++++++++++++++++++ encodings/fsst/src/fsstview/canonical.rs | 170 ++++++++++++++- encodings/fsst/src/fsstview/tests.rs | 42 ++++ 3 files changed, 430 insertions(+), 11 deletions(-) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index 4a1ddb33ca1..5d53b3d90e1 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -131,6 +131,67 @@ fn make_mask(len: usize, keep_fraction: f64) -> Mask { Mask::from_iter((0..len).map(|_| rng.random_bool(keep_fraction))) } +/// How a WHERE-clause selection is distributed over the rows — the shape that, in practice, drives +/// run length far more than raw selectivity does. Real query masks are rarely uniform-random. +#[derive(Clone, Copy, Debug)] +enum Selectivity { + /// Uniform-random `keep` fraction (the worst case for run length: ~no adjacency). + Uniform(f64), + /// One contiguous range of `keep` fraction — a sorted range scan (`WHERE k BETWEEN a AND b`). + /// Survivors are a single run. + Range(f64), + /// `bursts` contiguous blocks totalling ~`keep` — clustered hits (e.g. 
a low-cardinality + /// predicate over data sorted by a correlated key). Survivors form a few medium runs. + Clustered { keep: f64, bursts: usize }, +} + +impl Selectivity { + fn name(self) -> &'static str { + match self { + Selectivity::Uniform(k) if k <= 0.2 => "uniform_10pct", + Selectivity::Uniform(_) => "uniform_90pct", + Selectivity::Range(_) => "range_scan_10pct", + Selectivity::Clustered { .. } => "clustered_10pct", + } + } + + fn make(self, len: usize) -> Mask { + match self { + Selectivity::Uniform(keep) => make_mask(len, keep), + Selectivity::Range(keep) => { + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let take = (len as f64 * keep) as usize; + let start = (len - take) / 2; // a range in the middle of the column + Mask::from_iter((0..len).map(|i| i >= start && i < start + take)) + } + Selectivity::Clustered { keep, bursts } => { + let mut rng = StdRng::seed_from_u64(9); + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let total = (len as f64 * keep) as usize; + let burst_len = (total / bursts).max(1); + let mut keep_set = vec![false; len]; + for _ in 0..bursts { + let start = rng.random_range(0..len.saturating_sub(burst_len).max(1)); + for j in start..(start + burst_len).min(len) { + keep_set[j] = true; + } + } + Mask::from_iter(keep_set) + } + } + } +} + +/// The selection shapes exercised by the "database-style" filter benches. +const SELECTIVITIES: &[Selectivity] = &[ + Selectivity::Uniform(0.10), + Selectivity::Range(0.10), + Selectivity::Clustered { + keep: 0.10, + bursts: 32, + }, +]; + #[derive(Clone, Copy, Debug)] enum TakeKind { /// A full shuffle (permutation of all rows) — same length, reordered. 
@@ -158,6 +219,7 @@ fn compaction_name(strategy: FsstViewCompaction) -> &'static str { FsstViewCompaction::GatherBulk => "gather_bulk", FsstViewCompaction::PerElement => "per_element", FsstViewCompaction::RunCoalesce => "run_coalesce", + FsstViewCompaction::RunDecode => "run_decode", } } @@ -500,6 +562,29 @@ fn export_view(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> FSSTVi .unwrap() } +/// Canonicalize a *pre-filtered* view (filter hoisted out of the loop), parameterized by the +/// selection shape and the explicit compaction strategy. This isolates the export decode so +/// `RunDecode` ("export all in place") can be compared head-to-head against `GatherBulk` ("compact +/// codes") on each survivor layout. +#[divan::bench(args = canon_args())] +fn canon_only(bencher: Bencher, args: CanonArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = args.sel.make(fsst.len()); + let view = { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + view_filter(&fsst, &mask, &mut ctx) + .try_downcast::() + .ok() + .unwrap() + }; + bencher + .with_inputs(|| (&view, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(view, ctx)| { + black_box(canonicalize_fsstview_with(view.as_view(), args.strategy, ctx).unwrap()) + }); +} + #[divan::bench(args = filter_args())] fn export_fsst_to_varbinview(bencher: Bencher, args: FilterArg) { let varbin = generate(args.shape); @@ -589,8 +674,152 @@ fn convert_varbin_to_varbinview(bencher: Bencher, args: FilterArg) { }); } +// =============================== DATABASE-STYLE FILTER + EXPORT ================================ +// +// Real query masks are rarely uniform-random: a sorted range scan selects one contiguous run, and +// a clustered/correlated predicate selects a handful of bursts. 
Run length (not raw selectivity) +// is what drives the coalesced gather and the FSST->view conversion overhead, so these shapes are +// where the view encoding's behaviour actually diverges from the uniform-random case. Each bench +// filters then exports to a VarBinView; we compare fsst vs fsstview directly. + +#[divan::bench(args = db_filter_args())] +fn db_filter_fsst_to_varbinview(bencher: Bencher, args: DbFilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = args.sel.make(fsst.len()); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let filtered = fsst_filter(fsst, mask, ctx); + black_box(fsst_to_canonical(&filtered, ctx)) + }); +} + +#[divan::bench(args = db_filter_args())] +fn db_filter_view_to_varbinview(bencher: Bencher, args: DbFilterArg) { + let varbin = generate(args.shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = args.sel.make(fsst.len()); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let view = view_filter(fsst, mask, ctx) + .try_downcast::() + .ok() + .unwrap(); + black_box( + canonicalize_fsstview_with(view.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), + ) + }); +} + +/// An index lookup / sorted-key join: take with **sorted** indices selecting ~30% of rows. Unlike +/// a shuffle this preserves heap order, so survivors coalesce into runs — the common DB take shape +/// (e.g. fetching rows by a sorted RID list). 
+fn make_sorted_take(len: usize, keep: f64) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(13); + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let n = (len as f64 * keep) as usize; + let mut idx: Vec = (0..n).map(|_| rng.random_range(0..len as u64)).collect(); + idx.sort_unstable(); + PrimitiveArray::from_iter(idx).into_array() +} + +#[divan::bench(args = SHAPES)] +fn db_indexlookup_fsst_to_varbinview(bencher: Bencher, shape: Shape) { + let varbin = generate(shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let indices = make_sorted_take(fsst.len(), 0.30); + bencher + .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, indices, ctx)| { + let taken = fsst_take(fsst, indices, ctx); + black_box(fsst_to_canonical(&taken, ctx)) + }); +} + +#[divan::bench(args = SHAPES)] +fn db_indexlookup_view_to_varbinview(bencher: Bencher, shape: Shape) { + let varbin = generate(shape); + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let indices = make_sorted_take(fsst.len(), 0.30); + bencher + .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, indices, ctx)| { + let view = view_take(fsst, indices, ctx) + .try_downcast::() + .ok() + .unwrap(); + black_box( + canonicalize_fsstview_with(view.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), + ) + }); +} + // =============================== arg plumbing ================================================== +#[derive(Clone, Copy)] +struct DbFilterArg { + shape: Shape, + sel: Selectivity, +} + +impl std::fmt::Display for DbFilterArg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.shape.name(), self.sel.name()) + } +} + +fn db_filter_args() -> Vec { + let mut v = Vec::new(); + for &shape in SHAPES { + for &sel in SELECTIVITIES { + v.push(DbFilterArg { shape, sel }); + } + } + v +} + +#[derive(Clone, Copy)] 
+struct CanonArg { + shape: Shape, + sel: Selectivity, + strategy: FsstViewCompaction, +} + +impl std::fmt::Display for CanonArg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}/{}/{}", + self.shape.name(), + self.sel.name(), + compaction_name(self.strategy) + ) + } +} + +fn canon_args() -> Vec { + let strategies = [ + FsstViewCompaction::Auto, + FsstViewCompaction::GatherBulk, + FsstViewCompaction::RunDecode, + ]; + let mut v = Vec::new(); + for &shape in SHAPES { + for &sel in SELECTIVITIES { + for strategy in strategies { + v.push(CanonArg { + shape, + sel, + strategy, + }); + } + } + } + v +} + #[derive(Clone, Copy)] struct FilterArg { shape: Shape, diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 40f37f1c580..6ac82e0d7de 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -40,6 +40,24 @@ //! So [`FsstViewCompaction::Auto`] uses `Direct` when the live codes are contiguous //! (untouched/sliced view) and `GatherBulk` otherwise. `PerElement` and `RunCoalesce` are kept //! selectable so the trade-off stays measurable, but `Auto` never picks them. +//! +//! ## Export heuristic: "export all in place" vs "compact codes" +//! +//! `GatherBulk` always copies the live codes contiguous before decoding. But after a `filter`, a +//! sorted-index `take`, or a `slice`, the survivors' offsets stay **monotonic** — so we can skip +//! the gather entirely and decode each contiguous heap run *directly into* the (element-ordered) +//! output: [`RunDecode`][FsstViewCompaction::RunDecode]. Unlike `RunCoalesce`, the output is in +//! element order, so the view-build stays sequential. The cost is one decode call per run, so it +//! wins while survivors form few runs (clustered / range selections) and loses once they fragment +//! into many tiny runs (a uniform-random filter), where one bulk decode (`GatherBulk`) is cheaper. +//! +//! 
`Auto` therefore decides between *exporting all in place* and *compacting codes then exporting* +//! by **run count**: `RunDecode` when `runs <= len / RUN_DECODE_MAX_RUN_FRACTION` (and the layout +//! is monotonic), else `GatherBulk`. The `db_*`/`canon_only` benches calibrate this: on +//! `many_short` it's RunDecode ~313 µs (clustered) / ~345 µs (range) vs GatherBulk ~333 / ~370 µs, +//! and GatherBulk ~561 µs vs RunDecode ~657 µs on uniform-random. Crucially this lives entirely in +//! the export — the conversion and the metadata-only `filter`/`take` stay separate so a *chain* of +//! them still composes; only the final canonicalize compacts (or not). use std::sync::Arc; @@ -91,6 +109,16 @@ pub enum FsstViewCompaction { /// random access just moves to view-build time, where it's more expensive. Retained for /// measurement only — `Auto` never selects it. RunCoalesce, + /// "Export all in place": when survivors are in heap order (offsets monotonically increasing, + /// as after any `filter`, a sorted-index `take`, or a `slice`), decode each maximal contiguous + /// heap run *directly* into the element-ordered output, with **no gather copy**. The output is + /// element-ordered, so the view-build stays sequential (unlike `RunCoalesce`). Cost is one + /// decode call per run; it beats `GatherBulk` when survivors form few runs (clustered/range + /// selections), and degrades toward per-element decode when survivors are scattered (a + /// uniform-random filter), which is when `GatherBulk`'s single bulk decode wins instead. + /// + /// Requires monotonic offsets; falls back to `GatherBulk` otherwise (e.g. a shuffle take). 
+ RunDecode, } pub(super) fn canonicalize_fsstview( @@ -252,23 +280,50 @@ pub fn canonicalize_fsstview_with( let heap = heap_buffer.as_slice(); let decompressor = array.decompressor(); - let contiguous = is_contiguous(&offsets, &sizes); + // Analyse the survivor layout once: a single contiguous run (Direct), monotonic-but-gapped + // (RunDecode candidate), or out of heap order (must gather). + let layout = analyze_layout(&offsets, &sizes); let chosen = match strategy { - // Direct when the live codes are still one contiguous run, else compact-and-bulk. - // `GatherBulk` beats both `PerElement` and `RunCoalesce` across the whole practical range - // (see module docs), so `Auto` picks neither. - FsstViewCompaction::Auto => { - if contiguous { - FsstViewCompaction::Direct - } else { - FsstViewCompaction::GatherBulk + // The export heuristic. With monotonic offsets we can "export all in place" by decoding + // each contiguous run with no gather copy; this wins while the runs are few. Once survivors + // fragment into many tiny runs (a uniform-random filter), the per-run decode-tail overhead + // dominates and compacting the codes into one bulk decode (`GatherBulk`) wins instead. + // Non-monotonic layouts (a shuffle take) can't run-decode, so they always gather. + FsstViewCompaction::Auto => match layout { + Layout::Contiguous => FsstViewCompaction::Direct, + Layout::Monotonic { runs } if runs <= offsets.len() / RUN_DECODE_MAX_RUN_FRACTION => { + FsstViewCompaction::RunDecode } + _ => FsstViewCompaction::GatherBulk, + }, + // `Direct`/`RunDecode` require a (contiguous / monotonic) layout; fall back to gather. + FsstViewCompaction::Direct if !matches!(layout, Layout::Contiguous) => { + FsstViewCompaction::GatherBulk + } + FsstViewCompaction::RunDecode if matches!(layout, Layout::Scattered) => { + FsstViewCompaction::GatherBulk } - // `Direct` is only valid for a contiguous layout; fall back to a compacting decode. 
- FsstViewCompaction::Direct if !contiguous => FsstViewCompaction::GatherBulk, other => other, }; + if chosen == FsstViewCompaction::RunDecode { + let uncompressed = + decompress_run_decode(&decompressor, heap, &offsets, &sizes, &ulens, total_size); + let (buffers, views) = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + build_views(0, MAX_BUFFER_LEN, uncompressed, ulen_prim.as_slice::

()) + }); + // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. + return Ok(unsafe { + VarBinViewArray::new_unchecked( + views, + Arc::from(buffers), + array.dtype().clone(), + array.fsstview_validity(), + ) + .into_array() + }); + } + // RunCoalesce builds its own (buffers, views) — decompression order is decoupled from element // order, so it can't go through `build_views` (which assumes element-order contiguous output). if chosen == FsstViewCompaction::RunCoalesce { @@ -412,6 +467,99 @@ fn is_contiguous(offsets: &[usize], sizes: &[usize]) -> bool { true } +/// `Auto` prefers `RunDecode` (export all in place) over `GatherBulk` (compact codes) while the +/// number of contiguous runs is at most `len / RUN_DECODE_MAX_RUN_FRACTION` — i.e. while survivors +/// average more than this many elements per run. Calibrated by the `db_*` benchmarks: clustered +/// and range selections sit well under this, uniform-random filters well over it. +const RUN_DECODE_MAX_RUN_FRACTION: usize = 4; + +/// The survivor layout in the heap, used to pick an export strategy. +enum Layout { + /// Survivors are one contiguous in-order run (untouched / sliced view) — `Direct`. + Contiguous, + /// Offsets are strictly increasing but gapped: survivors form `runs` contiguous blocks. + /// Eligible for `RunDecode` (decode each run in place, no gather). + Monotonic { runs: usize }, + /// Offsets are out of heap order (e.g. a shuffle take) — must gather. + Scattered, +} + +/// Classify the survivor layout in a single O(n) pass: are offsets monotonic, and how many +/// maximal contiguous runs do the (non-empty) survivors form? 
+fn analyze_layout(offsets: &[usize], sizes: &[usize]) -> Layout { + let mut runs = 0usize; + let mut gapped = false; + let mut prev_end: Option = None; + for (&offset, &size) in offsets.iter().zip(sizes) { + if size == 0 { + continue; // empty/null elements don't affect run structure + } + match prev_end { + None => runs = 1, + Some(end) if offset == end => {} // continues the current run + Some(end) if offset > end => { + runs += 1; + gapped = true; + } + Some(_) => return Layout::Scattered, // offset < end: out of order + } + prev_end = Some(offset + size); + } + if !gapped { + Layout::Contiguous + } else { + Layout::Monotonic { runs } + } +} + +/// "Export all in place": decode each maximal contiguous heap run directly into the element-ordered +/// output, with no gather copy. Requires monotonic offsets (the caller guarantees this). +fn decompress_run_decode( + decompressor: &Decompressor<'_>, + heap: &[u8], + offsets: &[usize], + sizes: &[usize], + ulens: &[usize], + total_size: usize, +) -> ByteBufferMut { + let mut out = ByteBufferMut::with_capacity(total_size + 7); + { + let spare = out.spare_capacity_mut(); + // Walk elements in order, batching heap-adjacent survivors into one decode call. `out_pos` + // tracks where the current run's decoded bytes begin in the (element-ordered) output. 
+ let mut out_pos = 0usize; + let mut i = 0usize; + while i < offsets.len() { + if sizes[i] == 0 { + i += 1; + continue; + } + let run_heap_start = offsets[i]; + let mut run_heap_end = run_heap_start; + let mut run_uncompressed = 0usize; + let mut j = i; + while j < offsets.len() { + if sizes[j] == 0 { + j += 1; + continue; + } + if offsets[j] != run_heap_end { + break; + } + run_heap_end += sizes[j]; + run_uncompressed += ulens[j]; + j += 1; + } + decompressor + .decompress_into(&heap[run_heap_start..run_heap_end], &mut spare[out_pos..]); + out_pos += run_uncompressed; + i = j; + } + } + unsafe { out.set_len(total_size) }; + out +} + /// Decompress a single contiguous run of the heap in one bulk call (no copy). fn decompress_direct( decompressor: &Decompressor<'_>, diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index 2846909451a..b6a52901f20 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -331,6 +331,48 @@ fn run_coalesce_gaps_and_shuffle(#[case] strategy: FsstViewCompaction) -> Vortex Ok(()) } +/// `RunDecode` ("export all in place") must agree with the canonical result on a *monotonic* +/// gapped view (a filter, which keeps offsets increasing). Covers nulls, empty strings, and a +/// trailing run, across the strategies that accept monotonic input. 
+#[rstest] +#[case(FsstViewCompaction::Auto)] +#[case(FsstViewCompaction::RunDecode)] +#[case(FsstViewCompaction::GatherBulk)] +#[case(FsstViewCompaction::Direct)] +fn run_decode_monotonic_filter(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let strings: Vec> = vec![ + Some("alpha"), + Some(""), + None, + Some("bravo bravo"), + Some("charlie"), + None, + Some("delta delta delta"), + Some("echo"), + Some(""), + Some("foxtrot foxtrot"), + Some("golf golf"), + ]; + let fsst = make_fsst(&strings, Nullability::Nullable, &mut ctx); + // Keep a gapped-but-ordered subset (multiple runs, including an adjacent pair and a trailing + // run) so RunDecode exercises >1 run and the GatherBulk fallback is also valid. + let keep = [ + true, true, false, true, false, false, true, true, true, false, true, + ]; + let mask = Mask::from_iter(keep); + let view = fsst_filter_to_view(&fsst, &mask, &mut ctx)?; + + let got = canonicalize_fsstview_with(view.as_view(), strategy, &mut ctx)?; + let expected = + VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(Nullability::Nullable)) + .into_array() + .filter(mask)? + .execute::(&mut ctx)?; + assert_arrays_eq!(got, expected.into_array()); + Ok(()) +} + /// Build a ~`target`-uncompressed-byte FSSTView of random short URL-ish strings. fn make_big_view(target: usize, avg_len: usize, ctx: &mut ExecutionCtx) -> FSSTViewArray { use rand::RngExt; From 6144875121965cfc019ff2acd614c844b80a6c1e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 20:58:44 +0000 Subject: [PATCH 12/23] FSSTView: 12x faster from_fsst conversion (push_unchecked over windows) Callgrind on fsstview_from_fsst in isolation showed BufferMut::push was 68% of the conversion: deriving the sizes array pushed element-by-element, and push does a reserve(1) capacity recheck every iteration even though capacity is reserved up front. 
Switching the size-derivation loop to push_unchecked over offsets.windows(2) (capacity guaranteed) lets it vectorize. Conversion instruction count: 20.7B -> 1.73B over 3000 iters (~12x). End-to-end single filter -> VarBinView (many_short, where conversion was ~21%): clustered 868us -> 406us (2.1x) range 885us -> 417us (2.1x) uniform 1108us -> 689us (1.6x) This flips the single-op verdict: the view now beats fsst on clustered and range filters (the realistic DB selection shapes), losing only on the adversarial uniform-random case. Combined with the RunDecode export heuristic, the view is now competitive even for a single op, not just chains. Signed-off-by: Joe Isaacs --- encodings/fsst/src/fsstview/array.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/encodings/fsst/src/fsstview/array.rs b/encodings/fsst/src/fsstview/array.rs index 594c371d3f3..eb989692ab4 100644 --- a/encodings/fsst/src/fsstview/array.rs +++ b/encodings/fsst/src/fsstview/array.rs @@ -188,11 +188,15 @@ pub fn fsstview_from_fsst(fsst: &FSSTArray, ctx: &mut ExecutionCtx) -> VortexRes let offsets = codes.offsets().clone().execute::(ctx)?; let len = offsets.len().saturating_sub(1); + // `sizes[i] = offsets[i + 1] - offsets[i]`, built from adjacent windows. `push_unchecked` (with + // the capacity reserved up front) avoids the per-element capacity recheck that `push` does and + // lets the loop vectorize — this conversion is otherwise dominated by the size derivation. let codes_sizes = match_each_integer_ptype!(offsets.ptype(), |O| { let offsets = offsets.as_slice::(); let mut sizes = BufferMut::::with_capacity(len); - for i in 0..len { - sizes.push(offsets[i + 1] - offsets[i]); + for w in offsets.windows(2) { + // SAFETY: `len` slots were reserved above and we push exactly `len` of them. 
+ unsafe { sizes.push_unchecked(w[1] - w[0]) }; } sizes.into_array() }); From 9eb6fd16947b0722be422dc7769eabdcad10d3fd Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 21:05:11 +0000 Subject: [PATCH 13/23] FSSTView: trim canonicalize allocations (defer ulens widen, cheap cumsum) Profiling the full single-op pipeline (post conversion-fix) showed canonicalize_fsstview_with self-cost at ~19%, dominated by materializing three Vec (offsets, sizes, ulens) and the VarBin exporter's per-element offset push. - total_size is now summed straight from the typed uncompressed-lengths slice; the widened ulens: Vec is built lazily (widen_ulens) only by the run/per-element decoders. Direct and GatherBulk no longer allocate it. - The VarBin exporter builds its len+1 cumulative offsets directly from the typed slice with push_unchecked (capacity reserved), instead of widening to Vec and pushing element-by-element. Net: VarBin export many_short/selective ~726us -> ~467us; the canon decode itself is unchanged (RunDecode already avoided the ulens Vec). All strategies still agree; 111 tests pass. Signed-off-by: Joe Isaacs --- encodings/fsst/src/fsstview/canonical.rs | 67 +++++++++++++++--------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 6ac82e0d7de..66ba15e7edb 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -265,15 +265,13 @@ pub fn canonicalize_fsstview_with( .uncompressed_lengths() .clone() .execute::(ctx)?; + // `total_size` is needed by every path; compute it directly from the typed slice. The widened + // `ulens: Vec` is only needed by the run/per-element decoders, so defer it until the + // strategy is chosen (Direct/GatherBulk don't need it at all). #[expect(clippy::cast_possible_truncation)] - let ulens: Vec = match_each_integer_ptype!(ulen_prim.ptype(), |P| { - ulen_prim - .as_slice::

() - .iter() - .map(|x| *x as usize) - .collect() + let total_size: usize = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + ulen_prim.as_slice::<P>

().iter().map(|x| *x as usize).sum() }); - let total_size: usize = ulens.iter().sum(); let live: usize = sizes.iter().sum(); let heap_buffer = array.codes_bytes(); @@ -307,6 +305,7 @@ pub fn canonicalize_fsstview_with( }; if chosen == FsstViewCompaction::RunDecode { + let ulens = widen_ulens(&ulen_prim); let uncompressed = decompress_run_decode(&decompressor, heap, &offsets, &sizes, &ulens, total_size); let (buffers, views) = match_each_integer_ptype!(ulen_prim.ptype(), |P| { @@ -327,6 +326,7 @@ pub fn canonicalize_fsstview_with( // RunCoalesce builds its own (buffers, views) — decompression order is decoupled from element // order, so it can't go through `build_views` (which assumes element-order contiguous output). if chosen == FsstViewCompaction::RunCoalesce { + let ulens = widen_ulens(&ulen_prim); let (buffers, views) = decompress_run_coalesce(&decompressor, heap, &offsets, &sizes, &ulens, total_size); // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. @@ -350,7 +350,10 @@ pub fn canonicalize_fsstview_with( decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size) } // `Auto`/`RunCoalesce` are resolved above. - _ => decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size), + _ => { + let ulens = widen_ulens(&ulen_prim); + decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) + } }; let (buffers, views) = match_each_integer_ptype!(ulen_prim.ptype(), |P| { @@ -392,15 +395,23 @@ pub fn canonicalize_fsstview_to_varbin( .uncompressed_lengths() .clone() .execute::<PrimitiveArray>(ctx)?; - #[expect(clippy::cast_possible_truncation)] - let ulens: Vec<usize> = match_each_integer_ptype!(ulen_prim.ptype(), |P| { - ulen_prim - .as_slice::<P>

() - .iter() - .map(|x| *x as usize) - .collect() + let len = ulen_prim.len(); + + // Build `len + 1` cumulative offsets directly from the typed lengths slice (no widened Vec), + // and pick up `total_size` as the final running sum. `push_unchecked` (capacity reserved) keeps + // this vectorized. + let mut varbin_offsets = BufferMut::<i64>::with_capacity(len + 1); + #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] + let total_size: usize = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + let mut acc: usize = 0; + // SAFETY: `len + 1` slots reserved; we push exactly that many. + unsafe { varbin_offsets.push_unchecked(0) }; + for &ulen in ulen_prim.as_slice::<P>

() { + acc += ulen as usize; + unsafe { varbin_offsets.push_unchecked(acc as i64) }; + } + acc }); - let total_size: usize = ulens.iter().sum(); let live: usize = sizes.iter().sum(); let heap_buffer = array.codes_bytes(); @@ -410,6 +421,7 @@ pub fn canonicalize_fsstview_to_varbin( let contiguous = is_contiguous(&offsets, &sizes); let uncompressed = match strategy { FsstViewCompaction::PerElement => { + let ulens = widen_ulens(&ulen_prim); decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) } // Direct (or Auto) on a contiguous layout decodes the live range in place, no gather. @@ -421,15 +433,6 @@ pub fn canonicalize_fsstview_to_varbin( _ => decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size), }; - // Build `len + 1` cumulative offsets from the uncompressed lengths. - let mut varbin_offsets = BufferMut::<i64>::with_capacity(ulens.len() + 1); - let mut acc = 0i64; - varbin_offsets.push(acc); - for &ulen in &ulens { - acc += ulen as i64; - varbin_offsets.push(acc); - } - let bytes = BufferHandle::new_host(uncompressed.freeze()); // SAFETY: offsets are monotonic and end at the byte length; bytes are valid binary/UTF-8. Ok(unsafe { @@ -452,6 +455,20 @@ fn load_usize(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult`. Only the +/// run/per-element decoders need this; `Direct`/`GatherBulk` work without it. +fn widen_ulens(ulen_prim: &PrimitiveArray) -> Vec<usize> { + #[expect(clippy::cast_possible_truncation)] + let out: Vec<usize> = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + ulen_prim + .as_slice::<P>

() + .iter() + .map(|x| *x as usize) + .collect() + }); + out +} + /// Returns true if the live codes occupy a single contiguous, in-order run of the heap. fn is_contiguous(offsets: &[usize], sizes: &[usize]) -> bool { let Some(&first) = offsets.first() else { From 3c38645890f0bfc1a0609728aae06f9b54e37b3b Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 00:07:09 +0000 Subject: [PATCH 14/23] FSSTView: benchmark on real FineWeb columns (url + text) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds fsst_view_fineweb, which benchmarks FSST vs FSSTView on two real columns from the HuggingFace FineWeb 10BT sample, instead of synthetic data. The sample is ~2GB so it isn't downloaded by the bench; it reads length-prefixed dumps of the `url` (~72B avg, 200k rows) and `text` (~3KB avg, 40k rows) columns produced once with DuckDB (recipe in the module docs). The bench no-ops if the FINEWEB_URL / FINEWEB_TEXT env vars are unset, so CI stays green. Two workloads, fsst (rewrite heap per op) vs fsstview (metadata-only per op): single filter -> VarBinView, and a 5-op filter/take chain -> VarBinView. Real-data medians: fsst fsstview speedup single_filter url 1.02ms 0.84ms 1.2x single_filter text 5.81ms 4.38ms 1.3x chain url 6.23ms 3.95ms 1.6x chain text 44.2ms 5.16ms 8.6x The view wins every real case, decisively on chained ops over long strings (text chain 8.6x): fsst rewrites the large code heap on every op while the view stays metadata-only and decodes once at the end. On real (longer) strings the FSST->view conversion is no longer a notable cost — the earlier synthetic "conversion dominates" finding was an artifact of 12-byte strings plus per-op-in-a-loop measurement; the design win (metadata-only chaining) is what actually pays off. 
Signed-off-by: Joe Isaacs --- encodings/fsst/Cargo.toml | 4 + encodings/fsst/benches/fsst_view_fineweb.rs | 295 ++++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 encodings/fsst/benches/fsst_view_fineweb.rs diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index b1010a53572..40c96bbb78f 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -60,5 +60,9 @@ required-features = ["_test-harness"] name = "fsst_view_compute" harness = false +[[bench]] +name = "fsst_view_fineweb" +harness = false + [package.metadata.cargo-machete] ignored = ["fsst-rs"] diff --git a/encodings/fsst/benches/fsst_view_fineweb.rs b/encodings/fsst/benches/fsst_view_fineweb.rs new file mode 100644 index 00000000000..0249e424101 --- /dev/null +++ b/encodings/fsst/benches/fsst_view_fineweb.rs @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! FSST vs FSSTView on **real FineWeb columns** (not synthetic data). +//! +//! The HuggingFace FineWeb `10BT` sample is ~2 GB, so this bench does not download it. Instead it +//! reads two length-prefixed binary dumps of real columns, produced once with DuckDB: +//! +//! ```text +//! python3 - <<'PY' +//! import duckdb, struct +//! con = duckdb.connect(); con.execute("INSTALL httpfs; LOAD httpfs;") +//! url = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/v1.4.0/sample/10BT/001_00000.parquet" +//! con.execute(f"COPY (SELECT url, text FROM read_parquet('{url}') LIMIT 200000) TO '/tmp/s.parquet' (FORMAT PARQUET)") +//! def dump(col, path, limit): +//! rows = con.execute(f"SELECT {col} FROM read_parquet('/tmp/s.parquet') WHERE {col} IS NOT NULL LIMIT {limit}").fetchall() +//! with open(path, "wb") as f: +//! 
f.write(struct.pack(") -> std::fmt::Result { + f.write_str(match self { + Column::Url => "url", + Column::Text => "text", + }) + } +} + +impl Column { + fn env_var(self) -> &'static str { + match self { + Column::Url => "FINEWEB_URL", + Column::Text => "FINEWEB_TEXT", + } + } + + fn path(self) -> Option { + std::env::var_os(self.env_var()) + .map(PathBuf::from) + .filter(|p| p.exists()) + } +} + +const COLUMNS: &[Column] = &[Column::Url, Column::Text]; + +/// Read a length-prefixed dump into a `VarBinArray`. Returns `None` if the column isn't configured +/// (so the bench no-ops cleanly when the data isn't present). +fn load_column(col: Column) -> Option { + let bytes = std::fs::read(col.path()?).unwrap(); + let mut pos = 0usize; + #[expect(clippy::cast_possible_truncation)] + let row_count = u64::from_le_bytes(bytes[0..8].try_into().unwrap()) as usize; + pos += 8; + let mut values: Vec>> = Vec::with_capacity(row_count); + for _ in 0..row_count { + let len = u32::from_le_bytes(bytes[pos..pos + 4].try_into().unwrap()) as usize; + pos += 4; + values.push(Some(bytes[pos..pos + len].to_vec())); + pos += len; + } + Some(VarBinArray::from_iter( + values.into_iter().map(|v| v.map(Vec::into_boxed_slice)), + DType::Utf8(Nullability::NonNullable), + )) +} + +fn compress(varbin: &VarBinArray, ctx: &mut ExecutionCtx) -> FSSTArray { + let compressor = fsst_train_compressor(varbin); + fsst_compress(varbin, varbin.len(), varbin.dtype(), &compressor, ctx) +} + +/// Clustered selection (32 bursts, ~`keep` fraction) — a realistic correlated predicate, the shape +/// where survivors form runs rather than scattering uniformly. 
+fn clustered_mask(len: usize, keep: f64) -> Mask { + let mut rng = StdRng::seed_from_u64(9); + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let total = (len as f64 * keep) as usize; + let bursts = 32usize; + let burst_len = (total / bursts).max(1); + let mut keep_set = vec![false; len]; + for _ in 0..bursts { + let start = rng.random_range(0..len.saturating_sub(burst_len).max(1)); + for j in start..(start + burst_len).min(len) { + keep_set[j] = true; + } + } + Mask::from_iter(keep_set) +} + +/// Sorted-index take (~`keep` fraction) — an index lookup / RID-list join; preserves heap order. +fn sorted_take(len: usize, keep: f64) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(13); + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let n = (len as f64 * keep) as usize; + let mut idx: Vec = (0..n).map(|_| rng.random_range(0..len as u64)).collect(); + idx.sort_unstable(); + PrimitiveArray::from_iter(idx).into_array() +} + +fn fsst_filter(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> FSSTArray { + ::filter(array.as_view(), mask, ctx) + .unwrap() + .unwrap() + .try_downcast::() + .ok() + .unwrap() +} + +fn fsst_take(array: &FSSTArray, indices: &ArrayRef, ctx: &mut ExecutionCtx) -> FSSTArray { + ::take(array.as_view(), indices, ctx) + .unwrap() + .unwrap() + .try_downcast::() + .ok() + .unwrap() +} + +fn fsst_to_vbv(array: &FSSTArray, ctx: &mut ExecutionCtx) -> ArrayRef { + array + .clone() + .into_array() + .execute::(ctx) + .unwrap() + .into_array() +} + +// =============================== SINGLE FILTER -> VarBinView =================================== + +#[divan::bench(args = COLUMNS)] +fn single_filter_fsst(bencher: Bencher, col: Column) { + let Some(varbin) = load_column(col) else { + return; + }; + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = clustered_mask(fsst.len(), 0.10); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + 
.bench_refs(|(fsst, mask, ctx)| { + let filtered = fsst_filter(fsst, mask, ctx); + black_box(fsst_to_vbv(&filtered, ctx)) + }); +} + +#[divan::bench(args = COLUMNS)] +fn single_filter_view(bencher: Bencher, col: Column) { + let Some(varbin) = load_column(col) else { + return; + }; + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + let mask = clustered_mask(fsst.len(), 0.10); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let view = fsstview_from_fsst(fsst, ctx).unwrap(); + let filtered = ::filter(view.as_view(), mask, ctx) + .unwrap() + .unwrap() + .try_downcast::() + .ok() + .unwrap(); + black_box( + canonicalize_fsstview_with(filtered.as_view(), FsstViewCompaction::Auto, ctx) + .unwrap(), + ) + }); +} + +// =============================== CHAIN (convert once, N ops, export once) ====================== + +const CHAIN_LEN: usize = 5; + +#[divan::bench(args = COLUMNS)] +fn chain_fsst(bencher: Bencher, col: Column) { + let Some(varbin) = load_column(col) else { + return; + }; + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + bencher + .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, ctx)| { + let mut cur = (*fsst).clone(); + for op in 0..CHAIN_LEN { + if op % 2 == 0 { + let mask = clustered_mask(cur.len(), 0.80); + cur = fsst_filter(&cur, &mask, ctx); + } else { + let indices = sorted_take(cur.len(), 0.80); + cur = fsst_take(&cur, &indices, ctx); + } + } + black_box(fsst_to_vbv(&cur, ctx)) + }); +} + +#[divan::bench(args = COLUMNS)] +fn chain_view(bencher: Bencher, col: Column) { + let Some(varbin) = load_column(col) else { + return; + }; + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + bencher + .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, ctx)| { + // Convert once, then chain metadata-only ops, canonicalize once. 
+ let mut cur = fsstview_from_fsst(fsst, ctx).unwrap(); + for op in 0..CHAIN_LEN { + let next = if op % 2 == 0 { + let mask = clustered_mask(cur.len(), 0.80); + ::filter(cur.as_view(), &mask, ctx) + .unwrap() + .unwrap() + } else { + let indices = sorted_take(cur.len(), 0.80); + ::take(cur.as_view(), &indices, ctx) + .unwrap() + .unwrap() + }; + cur = next.try_downcast::().ok().unwrap(); + } + black_box( + canonicalize_fsstview_with(cur.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), + ) + }); +} From dea8281183f2e8c0b0291807990eba76a7c1552e Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 00:38:13 +0000 Subject: [PATCH 15/23] FSSTView: trim to the production path for merge Removes exploratory scaffolding now that the design has settled, keeping only what production uses: - Compaction strategies: drop PerElement and RunCoalesce (both proven to lose; Auto never picked them) and the FsstViewByteStats diagnostics. FsstViewCompaction is now Auto / Direct / GatherBulk / RunDecode. - canonical.rs: factor the shared element-ordered decode into decode_element_ordered, reused by the VarBinView and VarBin finishers; ~600 -> ~430 lines. - Synthetic bench (fsst_view_compute): replace the 945-line exploration matrix with a focused single-filter + 5-op-chain comparison over two shapes, mirroring the FineWeb bench. Real-data benchmarking lives in fsst_view_fineweb. - Tests: drop the removed-strategy cases and the byte-stats report; keep the all-strategies-agree, gapped-filter, RunDecode-monotonic, VarBin-export, and conformance coverage. Net -1361/+195 lines. 107 tests pass, clippy clean, fmt clean, vortex-file builds. 
Signed-off-by: Joe Isaacs --- encodings/fsst/benches/fsst_view_compute.rs | 826 ++------------------ encodings/fsst/src/fsstview/canonical.rs | 606 +++----------- encodings/fsst/src/fsstview/mod.rs | 2 - encodings/fsst/src/fsstview/tests.rs | 122 +-- 4 files changed, 195 insertions(+), 1361 deletions(-) diff --git a/encodings/fsst/benches/fsst_view_compute.rs b/encodings/fsst/benches/fsst_view_compute.rs index 5d53b3d90e1..59ced582c63 100644 --- a/encodings/fsst/benches/fsst_view_compute.rs +++ b/encodings/fsst/benches/fsst_view_compute.rs @@ -1,29 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Compares the two ways to run a `filter`/`take` pipeline that ends in a `VarBinViewArray`: +//! FSST vs FSSTView on synthetic string data, ending in a `VarBinViewArray`. //! -//! 1. **fsst pipeline**: stay in [`FSSTArray`] at every step, compacting the codes into a fresh -//! [`FSSTArray`] each time (the kernels delegate to `VarBin`, rewriting the byte heap), then -//! canonicalize to a [`VarBinViewArray`] at the end. -//! 2. **fsstview pipeline**: convert to [`FSSTViewArray`] and apply the metadata-only kernels -//! (offsets/sizes only — the byte heap is never touched), then canonicalize to a -//! [`VarBinViewArray`] at the end. -//! -//! Kernels are invoked directly (no Vortex execution/dispatch) so each part is measured in -//! isolation: the `_step` benches measure just the filter/take hop; the `_pipeline` benches -//! measure the hop plus the final canonicalization. For the fsstview pipeline the final -//! canonicalization is measured under each [`FsstViewCompaction`] strategy so the compaction -//! trade-off is visible directly. -//! -//! Two ~2 MiB (uncompressed) inputs are used: one with **many short** strings and one with -//! **fewer long** strings. -//! -//! Observed (medians): the fsstview hop is far cheaper in both cases (no heap rewrite) — e.g. -//! 
`take many_short/shuffle` is ~650 µs vs ~2.84 ms for fsst. For the final canonicalization, -//! `GatherBulk` (compact) beats `PerElement` (no compact) across the whole range, short *and* -//! long strings, because it pays FSST's slow decode-tail once instead of once per element; that's -//! why `Auto` compacts whenever the codes aren't contiguous. +//! `fsst` rewrites the compressed code heap on every `filter`/`take` (it delegates to `VarBin`); +//! `fsstview` keeps those ops metadata-only and decodes once at canonicalize. This bench measures +//! both a single filter and a 5-op filter/take chain, over two shapes — many short strings and +//! fewer long strings — with a clustered selection (the realistic shape, where survivors form runs +//! the view's `RunDecode` export exploits). For a benchmark on real FineWeb columns, see +//! `fsst_view_fineweb`. #![expect(clippy::unwrap_used)] @@ -47,9 +32,7 @@ use vortex_array::dtype::Nullability; use vortex_fsst::FSST; use vortex_fsst::FSSTArray; use vortex_fsst::FSSTView; -use vortex_fsst::FSSTViewArray; use vortex_fsst::FsstViewCompaction; -use vortex_fsst::canonicalize_fsstview_to_varbin; use vortex_fsst::canonicalize_fsstview_with; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; @@ -65,9 +48,9 @@ const TARGET_UNCOMPRESSED: usize = 2 * 1024 * 1024; #[derive(Clone, Copy, Debug)] enum Shape { - /// Many short strings (~12 bytes each) — small per-element work. + /// Many short strings (~12 bytes each). ManyShort, - /// Fewer long strings (~256 bytes each) — large per-element work. + /// Fewer long strings (~256 bytes each). FewLong, } @@ -78,32 +61,21 @@ impl Shape { Shape::FewLong => 256, } } - - fn count(self) -> usize { - TARGET_UNCOMPRESSED / self.avg_len() - } - - fn name(self) -> &'static str { - match self { - Shape::ManyShort => "many_short", - Shape::FewLong => "few_long", - } - } } -/// Build a ~2 MiB input. 
We use a small alphabet so FSST finds good symbols (realistic -/// compression), with some shared substrings to mimic real string columns. +const SHAPES: &[Shape] = &[Shape::ManyShort, Shape::FewLong]; + +/// Build a ~2 MiB input from a small alphabet so FSST finds good symbols, with shared substrings +/// to mimic real string columns. fn generate(shape: Shape) -> VarBinArray { let mut rng = StdRng::seed_from_u64(42); - let count = shape.count(); let avg_len = shape.avg_len(); - let mut strings: Vec> = Vec::with_capacity(count); - + let count = TARGET_UNCOMPRESSED / avg_len; const WORDS: &[&str] = &[ "https://", "example", "vortex", ".com/", "path", "query=", "value", "data", "alpha", "bravo", "charlie", "delta", "_", "-", "/", "0123", ]; - + let mut strings: Vec> = Vec::with_capacity(count); for _ in 0..count { let target = avg_len * rng.random_range(70..=130) / 100; let mut s = String::with_capacity(target + 8); @@ -113,7 +85,6 @@ fn generate(shape: Shape) -> VarBinArray { s.truncate(target.max(1)); strings.push(s.into_bytes().into_boxed_slice()); } - VarBinArray::from_iter( strings.into_iter().map(Some), DType::Utf8(Nullability::NonNullable), @@ -125,127 +96,32 @@ fn compress(varbin: &VarBinArray, ctx: &mut ExecutionCtx) -> FSSTArray { fsst_compress(varbin, varbin.len(), varbin.dtype(), &compressor, ctx) } -/// A selective mask keeps ~10% of rows; a non-selective mask keeps ~90%. -fn make_mask(len: usize, keep_fraction: f64) -> Mask { - let mut rng = StdRng::seed_from_u64(7); - Mask::from_iter((0..len).map(|_| rng.random_bool(keep_fraction))) -} - -/// How a WHERE-clause selection is distributed over the rows — the shape that, in practice, drives -/// run length far more than raw selectivity does. Real query masks are rarely uniform-random. -#[derive(Clone, Copy, Debug)] -enum Selectivity { - /// Uniform-random `keep` fraction (the worst case for run length: ~no adjacency). 
- Uniform(f64), - /// One contiguous range of `keep` fraction — a sorted range scan (`WHERE k BETWEEN a AND b`). - /// Survivors are a single run. - Range(f64), - /// `bursts` contiguous blocks totalling ~`keep` — clustered hits (e.g. a low-cardinality - /// predicate over data sorted by a correlated key). Survivors form a few medium runs. - Clustered { keep: f64, bursts: usize }, -} - -impl Selectivity { - fn name(self) -> &'static str { - match self { - Selectivity::Uniform(k) if k <= 0.2 => "uniform_10pct", - Selectivity::Uniform(_) => "uniform_90pct", - Selectivity::Range(_) => "range_scan_10pct", - Selectivity::Clustered { .. } => "clustered_10pct", - } - } - - fn make(self, len: usize) -> Mask { - match self { - Selectivity::Uniform(keep) => make_mask(len, keep), - Selectivity::Range(keep) => { - #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] - let take = (len as f64 * keep) as usize; - let start = (len - take) / 2; // a range in the middle of the column - Mask::from_iter((0..len).map(|i| i >= start && i < start + take)) - } - Selectivity::Clustered { keep, bursts } => { - let mut rng = StdRng::seed_from_u64(9); - #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] - let total = (len as f64 * keep) as usize; - let burst_len = (total / bursts).max(1); - let mut keep_set = vec![false; len]; - for _ in 0..bursts { - let start = rng.random_range(0..len.saturating_sub(burst_len).max(1)); - for j in start..(start + burst_len).min(len) { - keep_set[j] = true; - } - } - Mask::from_iter(keep_set) - } - } - } -} - -/// The selection shapes exercised by the "database-style" filter benches. -const SELECTIVITIES: &[Selectivity] = &[ - Selectivity::Uniform(0.10), - Selectivity::Range(0.10), - Selectivity::Clustered { - keep: 0.10, - bursts: 32, - }, -]; - -#[derive(Clone, Copy, Debug)] -enum TakeKind { - /// A full shuffle (permutation of all rows) — same length, reordered. 
- Shuffle, - /// Very selective — pick ~5% of rows at random (with possible repeats). - Selective, - /// Not selective — pick ~150% of rows at random (duplicates, output grows). - Dense, -} - -impl TakeKind { - fn name(self) -> &'static str { - match self { - TakeKind::Shuffle => "shuffle", - TakeKind::Selective => "selective", - TakeKind::Dense => "dense", +/// Clustered selection (32 bursts, ~`keep` fraction) — survivors form runs, the realistic shape. +fn clustered_mask(len: usize, keep: f64) -> Mask { + let mut rng = StdRng::seed_from_u64(9); + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let total = (len as f64 * keep) as usize; + let burst_len = (total / 32).max(1); + let mut keep_set = vec![false; len]; + for _ in 0..32 { + let start = rng.random_range(0..len.saturating_sub(burst_len).max(1)); + for j in start..(start + burst_len).min(len) { + keep_set[j] = true; } } + Mask::from_iter(keep_set) } -fn compaction_name(strategy: FsstViewCompaction) -> &'static str { - match strategy { - FsstViewCompaction::Auto => "auto", - FsstViewCompaction::Direct => "direct", - FsstViewCompaction::GatherBulk => "gather_bulk", - FsstViewCompaction::PerElement => "per_element", - FsstViewCompaction::RunCoalesce => "run_coalesce", - FsstViewCompaction::RunDecode => "run_decode", - } -} - -fn make_indices(len: usize, kind: TakeKind) -> ArrayRef { - let mut rng = StdRng::seed_from_u64(11); - let indices: Vec = match kind { - TakeKind::Shuffle => { - let mut v: Vec = (0..len as u64).collect(); - // Fisher-Yates. - for i in (1..v.len()).rev() { - v.swap(i, rng.random_range(0..=i)); - } - v - } - TakeKind::Selective => (0..(len / 20).max(1)) - .map(|_| rng.random_range(0..len as u64)) - .collect(), - TakeKind::Dense => (0..(len * 3 / 2)) - .map(|_| rng.random_range(0..len as u64)) - .collect(), - }; - PrimitiveArray::from_iter(indices).into_array() +/// Sorted-index take (~`keep` fraction) — an index lookup; preserves heap order. 
+fn sorted_take(len: usize, keep: f64) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(13); + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let n = (len as f64 * keep) as usize; + let mut idx: Vec = (0..n).map(|_| rng.random_range(0..len as u64)).collect(); + idx.sort_unstable(); + PrimitiveArray::from_iter(idx).into_array() } -// ----- direct kernel wrappers (no Vortex dispatch) --------------------------------------------- - fn fsst_filter(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> FSSTArray { ::filter(array.as_view(), mask, ctx) .unwrap() @@ -264,22 +140,7 @@ fn fsst_take(array: &FSSTArray, indices: &ArrayRef, ctx: &mut ExecutionCtx) -> F .unwrap() } -fn view_filter(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> ArrayRef { - let view = fsstview_from_fsst(array, ctx).unwrap(); - ::filter(view.as_view(), mask, ctx) - .unwrap() - .unwrap() -} - -fn view_take(array: &FSSTArray, indices: &ArrayRef, ctx: &mut ExecutionCtx) -> ArrayRef { - let view = fsstview_from_fsst(array, ctx).unwrap(); - ::take(view.as_view(), indices, ctx) - .unwrap() - .unwrap() -} - -fn fsst_to_canonical(array: &FSSTArray, ctx: &mut ExecutionCtx) -> ArrayRef { - // Decompress straight to a VarBinView via the VarBin codes (the FSST canonical path). +fn fsst_to_vbv(array: &FSSTArray, ctx: &mut ExecutionCtx) -> ArrayRef { array .clone() .into_array() @@ -288,252 +149,81 @@ fn fsst_to_canonical(array: &FSSTArray, ctx: &mut ExecutionCtx) -> ArrayRef { .into_array() } -const SHAPES: &[Shape] = &[Shape::ManyShort, Shape::FewLong]; - -// =============================== FILTER ======================================================== +// =============================== SINGLE FILTER -> VarBinView =================================== -/// Filter masks to exercise: selective (~10% kept) and non-selective (~90% kept). 
-const FILTER_KEEP: &[(&str, f64)] = &[("selective_10pct", 0.10), ("nonselective_90pct", 0.90)]; - -#[divan::bench(args = filter_args())] -fn filter_step_fsst(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| black_box(fsst_filter(fsst, mask, ctx))); -} - -#[divan::bench(args = filter_args())] -fn filter_step_view(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| black_box(view_filter(fsst, mask, ctx))); -} - -/// Metadata-only filter measured in isolation (conversion hoisted out). See `take_op_only_view`. -#[divan::bench(args = filter_args())] -fn filter_op_only_view(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let view = fsstview_from_fsst(&fsst, &mut LEGACY_SESSION.create_execution_ctx()).unwrap(); - let mask = make_mask(view.len(), args.keep); - bencher - .with_inputs(|| (&view, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(view, mask, ctx)| { - black_box( - ::filter(view.as_view(), mask, ctx) - .unwrap() - .unwrap(), - ) - }); -} - -/// Full pipeline: filter (compacting into another FSSTArray) then canonicalize to VarBinView. 
-#[divan::bench(args = filter_args())] -fn filter_pipeline_fsst(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let filtered = fsst_filter(fsst, mask, ctx); - black_box(fsst_to_canonical(&filtered, ctx)) - }); -} - -/// Full pipeline: filter to FSSTView then canonicalize, once per compaction strategy. -#[divan::bench(args = filter_view_pipeline_args())] -fn filter_pipeline_view(bencher: Bencher, args: FilterViewArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let view = view_filter(fsst, mask, ctx) - .try_downcast::() - .ok() - .unwrap(); - black_box(canonicalize_fsstview_with(view.as_view(), args.strategy, ctx).unwrap()) - }); -} - -// =============================== TAKE ========================================================== - -const TAKE_KINDS: &[TakeKind] = &[TakeKind::Shuffle, TakeKind::Selective, TakeKind::Dense]; - -#[divan::bench(args = take_args())] -fn take_step_fsst(bencher: Bencher, args: TakeArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let indices = make_indices(fsst.len(), args.kind); - bencher - .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, indices, ctx)| black_box(fsst_take(fsst, indices, ctx))); -} - -#[divan::bench(args = take_args())] -fn take_step_view(bencher: Bencher, args: TakeArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let indices = 
make_indices(fsst.len(), args.kind); - bencher - .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, indices, ctx)| black_box(view_take(fsst, indices, ctx))); -} - -/// The metadata-only take measured *in isolation*: the FSST→view conversion is hoisted out of the -/// timed loop (a chain converts once), so this is the apples-to-apples "is the view op itself as -/// cheap as a ListView op" comparison. The `*_step_view` bench above instead folds the one-time -/// conversion into every op, which only the first op of a chain actually pays. -#[divan::bench(args = take_args())] -fn take_op_only_view(bencher: Bencher, args: TakeArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let view = fsstview_from_fsst(&fsst, &mut LEGACY_SESSION.create_execution_ctx()).unwrap(); - let indices = make_indices(view.len(), args.kind); - bencher - .with_inputs(|| (&view, &indices, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(view, indices, ctx)| { - black_box( - ::take(view.as_view(), indices, ctx) - .unwrap() - .unwrap(), - ) - }); -} - -#[divan::bench(args = take_args())] -fn take_pipeline_fsst(bencher: Bencher, args: TakeArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let indices = make_indices(fsst.len(), args.kind); - bencher - .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, indices, ctx)| { - let taken = fsst_take(fsst, indices, ctx); - black_box(fsst_to_canonical(&taken, ctx)) - }); -} - -#[divan::bench(args = take_view_pipeline_args())] -fn take_pipeline_view(bencher: Bencher, args: TakeViewArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let indices = make_indices(fsst.len(), args.kind); - bencher - .with_inputs(|| (&fsst, &indices, 
LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, indices, ctx)| { - let view = view_take(fsst, indices, ctx) - .try_downcast::() - .ok() - .unwrap(); - black_box(canonicalize_fsstview_with(view.as_view(), args.strategy, ctx).unwrap()) - }); -} - -// =============================== COMBINATION =================================================== - -/// A filter (selective) followed by a take (shuffle) — the realistic "scan then reorder" shape. -/// fsst path compacts twice; fsstview path stays metadata-only until the final canonicalize. #[divan::bench(args = SHAPES)] -fn combo_pipeline_fsst(bencher: Bencher, shape: Shape) { - let varbin = generate(shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), 0.10); +fn single_filter_fsst(bencher: Bencher, shape: Shape) { + let fsst = compress(&generate(shape), &mut LEGACY_SESSION.create_execution_ctx()); + let mask = clustered_mask(fsst.len(), 0.10); bencher .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) .bench_refs(|(fsst, mask, ctx)| { let filtered = fsst_filter(fsst, mask, ctx); - let indices = make_indices(filtered.len(), TakeKind::Shuffle); - let taken = fsst_take(&filtered, &indices, ctx); - black_box(fsst_to_canonical(&taken, ctx)) + black_box(fsst_to_vbv(&filtered, ctx)) }); } #[divan::bench(args = SHAPES)] -fn combo_pipeline_view(bencher: Bencher, shape: Shape) { - let varbin = generate(shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), 0.10); +fn single_filter_view(bencher: Bencher, shape: Shape) { + let fsst = compress(&generate(shape), &mut LEGACY_SESSION.create_execution_ctx()); + let mask = clustered_mask(fsst.len(), 0.10); bencher .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) .bench_refs(|(fsst, mask, ctx)| { - // filter -> view, then take on the view (both metadata-only), then canonicalize. 
- let filtered = view_filter(fsst, mask, ctx) - .try_downcast::() - .ok() - .unwrap(); - let indices = make_indices(filtered.len(), TakeKind::Shuffle); - let taken = ::take(filtered.as_view(), &indices, ctx) + let view = fsstview_from_fsst(fsst, ctx).unwrap(); + let filtered = ::filter(view.as_view(), mask, ctx) .unwrap() .unwrap() .try_downcast::() .ok() .unwrap(); black_box( - canonicalize_fsstview_with(taken.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), + canonicalize_fsstview_with(filtered.as_view(), FsstViewCompaction::Auto, ctx) + .unwrap(), ) }); } -// =============================== CHAIN ========================================================= +// =============================== CHAIN (convert once, N ops, export once) ====================== -/// Number of ops in the chain benchmark. const CHAIN_LEN: usize = 5; -/// A chain of `CHAIN_LEN` alternating filter/take ops ending in a canonicalization. -/// -/// This is where the view model is meant to dominate: each fsst op re-compacts the byte heap, -/// so the cost compounds with chain length, whereas the view converts to offsets+sizes *once* -/// and every subsequent op is metadata-only, deferring the single gather+decode to the final -/// canonicalize. We keep every op only mildly selective (filter keeps 80%, take is a shuffle) -/// so there's still substantial data at the end — i.e. the heap rewrites the fsst path pays are -/// real work, not optimized away to nothing. 
#[divan::bench(args = SHAPES)] -fn chain_pipeline_fsst(bencher: Bencher, shape: Shape) { - let varbin = generate(shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); +fn chain_fsst(bencher: Bencher, shape: Shape) { + let fsst = compress(&generate(shape), &mut LEGACY_SESSION.create_execution_ctx()); bencher .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) .bench_refs(|(fsst, ctx)| { let mut cur = (*fsst).clone(); for op in 0..CHAIN_LEN { if op % 2 == 0 { - let mask = make_mask(cur.len(), 0.80); + let mask = clustered_mask(cur.len(), 0.80); cur = fsst_filter(&cur, &mask, ctx); } else { - let indices = make_indices(cur.len(), TakeKind::Shuffle); + let indices = sorted_take(cur.len(), 0.80); cur = fsst_take(&cur, &indices, ctx); } } - black_box(fsst_to_canonical(&cur, ctx)) + black_box(fsst_to_vbv(&cur, ctx)) }); } #[divan::bench(args = SHAPES)] -fn chain_pipeline_view(bencher: Bencher, shape: Shape) { - let varbin = generate(shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); +fn chain_view(bencher: Bencher, shape: Shape) { + let fsst = compress(&generate(shape), &mut LEGACY_SESSION.create_execution_ctx()); bencher .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) .bench_refs(|(fsst, ctx)| { - // Convert to the view once, then chain metadata-only ops, canonicalize once at the end. + // Convert once, then chain metadata-only ops, canonicalize once. 
let mut cur = fsstview_from_fsst(fsst, ctx).unwrap(); for op in 0..CHAIN_LEN { let next = if op % 2 == 0 { - let mask = make_mask(cur.len(), 0.80); + let mask = clustered_mask(cur.len(), 0.80); ::filter(cur.as_view(), &mask, ctx) .unwrap() .unwrap() } else { - let indices = make_indices(cur.len(), TakeKind::Shuffle); + let indices = sorted_take(cur.len(), 0.80); ::take(cur.as_view(), &indices, ctx) .unwrap() .unwrap() @@ -545,401 +235,3 @@ fn chain_pipeline_view(bencher: Bencher, shape: Shape) { ) }); } - -// =============================== SINGLE FILTER + EXPORT (2x2) ================================== -// -// A single filter, then export to a canonical string array. The matrix is -// {fsst, fsstview} x {VarBinView, VarBin}: -// - fsst path: filter rewrites the compressed heap (VarBin filter on codes), then decode. -// - fsstview path: filter is metadata-only, then decode (coalesced gather + bulk) at export. -// - VarBinView export: build a 16-byte view per element. -// - VarBin export: build len+1 cumulative offsets over the contiguous decoded bytes. - -fn export_view(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> FSSTViewArray { - view_filter(array, mask, ctx) - .try_downcast::() - .ok() - .unwrap() -} - -/// Canonicalize a *pre-filtered* view (filter hoisted out of the loop), parameterized by the -/// selection shape and the explicit compaction strategy. This isolates the export decode so -/// `RunDecode` ("export all in place") can be compared head-to-head against `GatherBulk` ("compact -/// codes") on each survivor layout. 
-#[divan::bench(args = canon_args())] -fn canon_only(bencher: Bencher, args: CanonArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = args.sel.make(fsst.len()); - let view = { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - view_filter(&fsst, &mask, &mut ctx) - .try_downcast::() - .ok() - .unwrap() - }; - bencher - .with_inputs(|| (&view, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(view, ctx)| { - black_box(canonicalize_fsstview_with(view.as_view(), args.strategy, ctx).unwrap()) - }); -} - -#[divan::bench(args = filter_args())] -fn export_fsst_to_varbinview(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let filtered = fsst_filter(fsst, mask, ctx); - black_box(fsst_to_canonical(&filtered, ctx)) - }); -} - -#[divan::bench(args = filter_args())] -fn export_fsst_to_varbin(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - // Filter stays in FSST; reach VarBin by reinterpreting the (now contiguous) codes as a - // view and exporting offsets+bytes. 
- let filtered = fsst_filter(fsst, mask, ctx); - let view = fsstview_from_fsst(&filtered, ctx).unwrap(); - black_box( - canonicalize_fsstview_to_varbin(view.as_view(), FsstViewCompaction::Auto, ctx) - .unwrap(), - ) - }); -} - -#[divan::bench(args = filter_args())] -fn export_view_to_varbinview(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let view = export_view(fsst, mask, ctx); - black_box( - canonicalize_fsstview_with(view.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), - ) - }); -} - -#[divan::bench(args = filter_args())] -fn export_view_to_varbin(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let view = export_view(fsst, mask, ctx); - black_box( - canonicalize_fsstview_to_varbin(view.as_view(), FsstViewCompaction::Auto, ctx) - .unwrap(), - ) - }); -} - -/// Cost of converting the VarBin produced by `view->VarBin` *into* a VarBinView, isolated. Add this -/// to `export_view_to_varbin` to compare against `export_view_to_varbinview` (going straight to a -/// view): is "decode to VarBin, then convert" cheaper than "decode straight to VarBinView"? -#[divan::bench(args = filter_args())] -fn convert_varbin_to_varbinview(bencher: Bencher, args: FilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = make_mask(fsst.len(), args.keep); - // Pre-build the VarBin (the `view->VarBin` export output) outside the timed loop. 
- let view = { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - export_view(&fsst, &mask, &mut ctx) - }; - let vbin = canonicalize_fsstview_to_varbin( - view.as_view(), - FsstViewCompaction::Auto, - &mut LEGACY_SESSION.create_execution_ctx(), - ) - .unwrap(); - bencher - .with_inputs(|| (&vbin, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(vbin, ctx)| { - black_box((*vbin).clone().execute::(ctx).unwrap()) - }); -} - -// =============================== DATABASE-STYLE FILTER + EXPORT ================================ -// -// Real query masks are rarely uniform-random: a sorted range scan selects one contiguous run, and -// a clustered/correlated predicate selects a handful of bursts. Run length (not raw selectivity) -// is what drives the coalesced gather and the FSST->view conversion overhead, so these shapes are -// where the view encoding's behaviour actually diverges from the uniform-random case. Each bench -// filters then exports to a VarBinView; we compare fsst vs fsstview directly. 
- -#[divan::bench(args = db_filter_args())] -fn db_filter_fsst_to_varbinview(bencher: Bencher, args: DbFilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = args.sel.make(fsst.len()); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let filtered = fsst_filter(fsst, mask, ctx); - black_box(fsst_to_canonical(&filtered, ctx)) - }); -} - -#[divan::bench(args = db_filter_args())] -fn db_filter_view_to_varbinview(bencher: Bencher, args: DbFilterArg) { - let varbin = generate(args.shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = args.sel.make(fsst.len()); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let view = view_filter(fsst, mask, ctx) - .try_downcast::() - .ok() - .unwrap(); - black_box( - canonicalize_fsstview_with(view.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), - ) - }); -} - -/// An index lookup / sorted-key join: take with **sorted** indices selecting ~30% of rows. Unlike -/// a shuffle this preserves heap order, so survivors coalesce into runs — the common DB take shape -/// (e.g. fetching rows by a sorted RID list). 
-fn make_sorted_take(len: usize, keep: f64) -> ArrayRef { - let mut rng = StdRng::seed_from_u64(13); - #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] - let n = (len as f64 * keep) as usize; - let mut idx: Vec = (0..n).map(|_| rng.random_range(0..len as u64)).collect(); - idx.sort_unstable(); - PrimitiveArray::from_iter(idx).into_array() -} - -#[divan::bench(args = SHAPES)] -fn db_indexlookup_fsst_to_varbinview(bencher: Bencher, shape: Shape) { - let varbin = generate(shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let indices = make_sorted_take(fsst.len(), 0.30); - bencher - .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, indices, ctx)| { - let taken = fsst_take(fsst, indices, ctx); - black_box(fsst_to_canonical(&taken, ctx)) - }); -} - -#[divan::bench(args = SHAPES)] -fn db_indexlookup_view_to_varbinview(bencher: Bencher, shape: Shape) { - let varbin = generate(shape); - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let indices = make_sorted_take(fsst.len(), 0.30); - bencher - .with_inputs(|| (&fsst, &indices, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, indices, ctx)| { - let view = view_take(fsst, indices, ctx) - .try_downcast::() - .ok() - .unwrap(); - black_box( - canonicalize_fsstview_with(view.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), - ) - }); -} - -// =============================== arg plumbing ================================================== - -#[derive(Clone, Copy)] -struct DbFilterArg { - shape: Shape, - sel: Selectivity, -} - -impl std::fmt::Display for DbFilterArg { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}", self.shape.name(), self.sel.name()) - } -} - -fn db_filter_args() -> Vec { - let mut v = Vec::new(); - for &shape in SHAPES { - for &sel in SELECTIVITIES { - v.push(DbFilterArg { shape, sel }); - } - } - v -} - -#[derive(Clone, Copy)] 
-struct CanonArg { - shape: Shape, - sel: Selectivity, - strategy: FsstViewCompaction, -} - -impl std::fmt::Display for CanonArg { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.shape.name(), - self.sel.name(), - compaction_name(self.strategy) - ) - } -} - -fn canon_args() -> Vec { - let strategies = [ - FsstViewCompaction::Auto, - FsstViewCompaction::GatherBulk, - FsstViewCompaction::RunDecode, - ]; - let mut v = Vec::new(); - for &shape in SHAPES { - for &sel in SELECTIVITIES { - for strategy in strategies { - v.push(CanonArg { - shape, - sel, - strategy, - }); - } - } - } - v -} - -#[derive(Clone, Copy)] -struct FilterArg { - shape: Shape, - keep: f64, - label: &'static str, -} - -impl std::fmt::Display for FilterArg { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}", self.shape.name(), self.label) - } -} - -fn filter_args() -> Vec { - let mut v = Vec::new(); - for &shape in SHAPES { - for &(label, keep) in FILTER_KEEP { - v.push(FilterArg { shape, keep, label }); - } - } - v -} - -#[derive(Clone, Copy)] -struct FilterViewArg { - shape: Shape, - keep: f64, - label: &'static str, - strategy: FsstViewCompaction, -} - -impl std::fmt::Display for FilterViewArg { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.shape.name(), - self.label, - compaction_name(self.strategy) - ) - } -} - -fn filter_view_pipeline_args() -> Vec { - let mut v = Vec::new(); - for &shape in SHAPES { - for &(label, keep) in FILTER_KEEP { - for &strategy in COMPACTIONS { - v.push(FilterViewArg { - shape, - keep, - label, - strategy, - }); - } - } - } - v -} - -#[derive(Clone, Copy)] -struct TakeArg { - shape: Shape, - kind: TakeKind, -} - -impl std::fmt::Display for TakeArg { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}", self.shape.name(), self.kind.name()) - } -} - -fn take_args() -> Vec { - 
let mut v = Vec::new(); - for &shape in SHAPES { - for &kind in TAKE_KINDS { - v.push(TakeArg { shape, kind }); - } - } - v -} - -#[derive(Clone, Copy)] -struct TakeViewArg { - shape: Shape, - kind: TakeKind, - strategy: FsstViewCompaction, -} - -impl std::fmt::Display for TakeViewArg { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.shape.name(), - self.kind.name(), - compaction_name(self.strategy) - ) - } -} - -const COMPACTIONS: &[FsstViewCompaction] = &[ - FsstViewCompaction::Auto, - FsstViewCompaction::GatherBulk, - FsstViewCompaction::PerElement, - FsstViewCompaction::RunCoalesce, -]; - -fn take_view_pipeline_args() -> Vec { - let mut v = Vec::new(); - for &shape in SHAPES { - for &kind in TAKE_KINDS { - for &strategy in COMPACTIONS { - v.push(TakeViewArg { - shape, - kind, - strategy, - }); - } - } - } - v -} diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 66ba15e7edb..ea7cff6d201 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -1,63 +1,30 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Canonicalization of [`FSSTView`] into a [`VarBinViewArray`]. +//! Canonicalization of [`FSSTView`] into a [`VarBinViewArray`] (or [`VarBinArray`]). //! -//! After metadata-only `filter`/`take`, an [`FSSTView`]'s byte heap is the *original* heap and -//! the live codes are scattered (gaps after a filter, reordering/duplication after a take). To -//! canonicalize we must produce one contiguous decompressed buffer in element order. There are -//! three ways to get there, with different cost profiles — see [`FsstViewCompaction`]: +//! After metadata-only `filter`/`take`/`slice`, an [`FSSTView`]'s byte heap is the *original* heap +//! and the live codes are scattered (gaps after a filter, reordering/duplication after a take). To +//! 
canonicalize we must decode the survivors into one element-ordered buffer. [`FsstViewCompaction`] +//! captures how: //! -//! - [`Direct`][FsstViewCompaction::Direct]: the live codes are still contiguous and in order -//! (e.g. an untouched view or one that was only sliced). We bulk-decompress that single -//! contiguous range with no copy. Fastest, but only valid when contiguous. -//! - [`GatherBulk`][FsstViewCompaction::GatherBulk] ("compact"): copy the scattered live codes -//! into a contiguous buffer, then a *single* bulk decompress. Pays a copy of the live -//! compressed bytes but the one bulk call amortizes the FSST 8-wide fast path across all -//! element boundaries. -//! - [`PerElement`][FsstViewCompaction::PerElement] ("no compact"): decompress each element's -//! slice directly into its place in the output. No copy, but one decompress call per element. -//! - [`RunCoalesce`][FsstViewCompaction::RunCoalesce] ("export paired slices"): decode contiguous -//! heap runs straight into a *heap-ordered* output and point `VarBinView`s back into it, out of -//! order — no gather copy, dedups duplicates. +//! - [`Direct`][FsstViewCompaction::Direct]: the live codes are still one contiguous in-order run +//! (an untouched or sliced view). Decode that single range in one call, no copy. +//! - [`RunDecode`][FsstViewCompaction::RunDecode] ("export all in place"): the offsets are still +//! monotonic (after any `filter`, sorted-index `take`, or `slice`) but gapped. Decode each +//! maximal contiguous heap run *directly* into the element-ordered output, with **no gather +//! copy** — one decode call per run. Wins while survivors form few runs (clustered / range +//! selections). +//! - [`GatherBulk`][FsstViewCompaction::GatherBulk] ("compact codes"): for scattered survivors (a +//! shuffle take) or heavily fragmented ones (a uniform-random filter), compact the live codes +//! into one contiguous buffer, then a single bulk decode. 
The one bulk call amortizes FSST's slow +//! decode tail across all elements, which beats run-decode once the runs get small. //! -//! The compaction question, concretely. The `fsst_view_compute` benchmark (two ~2 MiB inputs, -//! ~12-byte and ~256-byte strings) shows **`GatherBulk` is the best non-contiguous strategy across -//! the whole tested range**, for both short and long strings. The reason FSST decode is shaped -//! this way: a fast 8-wide body and a slow byte-by-byte tail. `PerElement` pays that tail *once -//! per element* (N tails); `GatherBulk` decodes the whole heap in one call and pays it *once*, -//! which dominates the gather memcpy even at 256-byte strings. -//! -//! `RunCoalesce` was the appealing idea of skipping the gather entirely — decode runs in place and -//! let the `VarBinView` reference them out of order. It loses anyway, badly for short strings -//! (`take many_short/shuffle`: ~18 ms vs ~5.6 ms for `GatherBulk`). The reason is subtle: the -//! random access you avoid at *decode* time reappears at *view-build* time. Views are built in -//! element order, so over a heap-ordered output the per-element `make_view` does N cache-missing -//! random reads (and, for ≤12-byte strings, random-access *inlining* copies), plus an -//! O(N log N) sort. `GatherBulk`'s output is element-ordered, so its view-build is sequential. The -//! cheap sequential gather memcpy beats the expensive scattered view construction. -//! -//! So [`FsstViewCompaction::Auto`] uses `Direct` when the live codes are contiguous -//! (untouched/sliced view) and `GatherBulk` otherwise. `PerElement` and `RunCoalesce` are kept -//! selectable so the trade-off stays measurable, but `Auto` never picks them. -//! -//! ## Export heuristic: "export all in place" vs "compact codes" -//! -//! `GatherBulk` always copies the live codes contiguous before decoding. But after a `filter`, a -//! sorted-index `take`, or a `slice`, the survivors' offsets stay **monotonic** — so we can skip -//! 
the gather entirely and decode each contiguous heap run *directly into* the (element-ordered) -//! output: [`RunDecode`][FsstViewCompaction::RunDecode]. Unlike `RunCoalesce`, the output is in -//! element order, so the view-build stays sequential. The cost is one decode call per run, so it -//! wins while survivors form few runs (clustered / range selections) and loses once they fragment -//! into many tiny runs (a uniform-random filter), where one bulk decode (`GatherBulk`) is cheaper. -//! -//! `Auto` therefore decides between *exporting all in place* and *compacting codes then exporting* -//! by **run count**: `RunDecode` when `runs <= len / RUN_DECODE_MAX_RUN_FRACTION` (and the layout -//! is monotonic), else `GatherBulk`. The `db_*`/`canon_only` benches calibrate this: on -//! `many_short` it's RunDecode ~313 µs (clustered) / ~345 µs (range) vs GatherBulk ~333 / ~370 µs, -//! and GatherBulk ~561 µs vs RunDecode ~657 µs on uniform-random. Crucially this lives entirely in -//! the export — the conversion and the metadata-only `filter`/`take` stay separate so a *chain* of -//! them still composes; only the final canonicalize compacts (or not). +//! [`FsstViewCompaction::Auto`] picks `Direct` when contiguous, `RunDecode` when the offsets are +//! monotonic and the survivors form few runs (`runs <= len / RUN_DECODE_MAX_RUN_FRACTION`), and +//! `GatherBulk` otherwise. The choice lives entirely in the export: the conversion and the +//! metadata-only `filter`/`take` stay separate so a *chain* of them composes; only the final +//! canonicalize compacts (or not). 
use std::sync::Arc; @@ -69,14 +36,11 @@ use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; -use vortex_array::arrays::varbinview::BinaryView; use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; use vortex_array::arrays::varbinview::build_views::build_views; use vortex_array::buffer::BufferHandle; use vortex_array::match_each_integer_ptype; -use vortex_buffer::Buffer; use vortex_buffer::BufferMut; -use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use vortex_error::VortexResult; @@ -86,41 +50,30 @@ use super::array::FSSTViewArraySlotsExt; /// Strategy for materializing the decompressed bytes when canonicalizing an [`FSSTView`]. /// -/// See the [module docs][self] for the full trade-off analysis. +/// See the [module docs][self] for the full trade-off analysis. Every strategy produces an +/// element-ordered decoded buffer; they differ only in how the survivor codes are fed to the +/// decoder. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum FsstViewCompaction { - /// Pick a strategy automatically: `Direct` when the live codes are contiguous, else - /// `GatherBulk`. Never picks `PerElement` or `RunCoalesce` (both lose; see module docs). + /// Pick automatically: `Direct` when contiguous, `RunDecode` when monotonic with few runs, + /// else `GatherBulk`. Auto, - /// Bulk-decompress the contiguous live range with no copy. Falls back to `GatherBulk` if the - /// view's codes are not contiguous and in order. + /// Bulk-decode the single contiguous live range, no copy. Falls back to `GatherBulk` if the + /// codes are not contiguous and in order. Direct, - /// Compact the scattered live codes into a contiguous buffer, then a single bulk decompress. + /// Compact the scattered live codes into a contiguous buffer, then a single bulk decode. GatherBulk, - /// Decompress each element's code slice directly into place, without compacting. 
- PerElement, - /// Coalesce survivors into contiguous heap runs and decompress each run with a *single* call - /// directly into a heap-ordered output (no gather copy), emitting `VarBinView` views — possibly - /// out of order — that point back into it. Decodes distinct codes once (duplicates share a - /// view). - /// - /// This is the "export paired slices into a `VarBinView`" approach. In theory it skips the - /// gather copy entirely; in practice it loses to `GatherBulk` (see module docs) because the - /// random access just moves to view-build time, where it's more expensive. Retained for - /// measurement only — `Auto` never selects it. - RunCoalesce, - /// "Export all in place": when survivors are in heap order (offsets monotonically increasing, - /// as after any `filter`, a sorted-index `take`, or a `slice`), decode each maximal contiguous - /// heap run *directly* into the element-ordered output, with **no gather copy**. The output is - /// element-ordered, so the view-build stays sequential (unlike `RunCoalesce`). Cost is one - /// decode call per run; it beats `GatherBulk` when survivors form few runs (clustered/range - /// selections), and degrades toward per-element decode when survivors are scattered (a - /// uniform-random filter), which is when `GatherBulk`'s single bulk decode wins instead. - /// + /// Decode each contiguous heap run directly into the element-ordered output, no gather copy. /// Requires monotonic offsets; falls back to `GatherBulk` otherwise (e.g. a shuffle take). RunDecode, } +/// `Auto` prefers `RunDecode` (export all in place) over `GatherBulk` (compact codes) while the +/// number of contiguous runs is at most `len / RUN_DECODE_MAX_RUN_FRACTION` — i.e. while survivors +/// average more than this many elements per run. Calibrated by the `fsst_view_compute` benches: +/// clustered and range selections sit well under this, uniform-random filters well over it. 
+const RUN_DECODE_MAX_RUN_FRACTION: usize = 4; + pub(super) fn canonicalize_fsstview( array: ArrayView<'_, FSSTView>, ctx: &mut ExecutionCtx, @@ -128,136 +81,77 @@ pub(super) fn canonicalize_fsstview( canonicalize_fsstview_with(array, FsstViewCompaction::Auto, ctx) } -/// Byte accounting for an [`FSSTView`], in **both compressed (code) space and uncompressed -/// (decoded) space**, for reasoning about gather/coalesce trade-offs and dead-byte waste. +/// Canonicalize an [`FSSTView`] to a [`VarBinViewArray`] using an explicit compaction strategy. /// -/// All figures are in bytes. The "span" figures describe what a *gap-merged* decode (decoding each -/// run's full heap extent, dead bytes included) would touch; the difference from the live figures -/// is the waste such a strategy would carry. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] -pub struct FsstViewByteStats { - /// Number of (logical) elements in the view. - pub elements: usize, - /// Distinct heap runs the live elements form (maximal heap-adjacent groups of distinct spans). - pub runs: usize, - /// Distinct (deduplicated) live code spans referenced by the view. - pub distinct_spans: usize, - /// Compressed bytes the live distinct spans occupy (what `GatherBulk` copies / decodes). - pub live_compressed: usize, - /// Compressed bytes spanned by the runs *including* dead gaps between survivors (what a - /// gap-merged decode would feed the decoder). `span_compressed - live_compressed` is the - /// compressed waste of merging across gaps. - pub span_compressed: usize, - /// Uncompressed bytes the live elements decode to (the canonical output size; deduped spans - /// counted once). - pub live_uncompressed: usize, - /// Total uncompressed output size with duplicates expanded (the `VarBinView`'s logical size). - pub logical_uncompressed: usize, - /// Total compressed heap size backing the view (the original, shared code buffer). 
- pub heap_compressed: usize, -} - -impl FsstViewByteStats { - /// Fraction of the spanned compressed bytes that are dead (would be wasted by a gap-merged - /// decode). `0.0` means the live spans are perfectly contiguous within each run. - pub fn compressed_waste_ratio(&self) -> f64 { - if self.span_compressed == 0 { - 0.0 - } else { - (self.span_compressed - self.live_compressed) as f64 / self.span_compressed as f64 - } - } -} - -/// Compute [`FsstViewByteStats`] for a view (diagnostics; not on the hot path). -pub fn fsstview_byte_stats( +/// Exposed (rather than only the dispatch-driven [`canonicalize_fsstview`]) so callers and +/// benchmarks can force a strategy. Production code goes through [`FsstViewCompaction::Auto`]. +pub fn canonicalize_fsstview_with( array: ArrayView<'_, FSSTView>, + strategy: FsstViewCompaction, ctx: &mut ExecutionCtx, -) -> VortexResult { - let offsets = load_usize(array.codes_offsets(), ctx)?; - let sizes = load_usize(array.codes_sizes(), ctx)?; - let ulen_prim = array - .uncompressed_lengths() - .clone() - .execute::(ctx)?; - #[expect(clippy::cast_possible_truncation)] - let ulens: Vec = match_each_integer_ptype!(ulen_prim.ptype(), |P| { - ulen_prim - .as_slice::

() - .iter() - .map(|x| *x as usize) - .collect() +) -> VortexResult { + let decoded = decode_element_ordered(array, strategy, ctx)?; + let (buffers, views) = match_each_integer_ptype!(decoded.ulen_prim.ptype(), |P| { + build_views( + 0, + MAX_BUFFER_LEN, + decoded.uncompressed, + decoded.ulen_prim.as_slice::

(), + ) }); - - let elements = offsets.len(); - let logical_uncompressed: usize = ulens.iter().sum(); - let heap_compressed = array.codes_bytes().len(); - - // Walk distinct spans in heap order, accumulating live/run/span figures. - let mut order: Vec = (0..elements).filter(|&i| sizes[i] > 0).collect(); - order.sort_unstable_by_key(|&i| (offsets[i], sizes[i])); - - let mut runs = 0usize; - let mut distinct_spans = 0usize; - let mut live_compressed = 0usize; - let mut live_uncompressed = 0usize; - let mut span_compressed = 0usize; - let mut run_end: Option = None; - let mut run_start = 0usize; - let mut prev_span: Option<(usize, usize)> = None; - for &i in &order { - let span = (offsets[i], sizes[i]); - let is_dup = prev_span == Some(span); - prev_span = Some(span); - if is_dup { - continue; // duplicate of the previous distinct span - } - distinct_spans += 1; - live_compressed += sizes[i]; - live_uncompressed += ulens[i]; - match run_end { - Some(end) if offsets[i] == end => { - run_end = Some(end + sizes[i]); - } - Some(end) => { - // Close the previous run, open a new one. - span_compressed += end - run_start; - runs += 1; - run_start = offsets[i]; - run_end = Some(offsets[i] + sizes[i]); - } - None => { - run_start = offsets[i]; - run_end = Some(offsets[i] + sizes[i]); - } - } - } - if let Some(end) = run_end { - span_compressed += end - run_start; - runs += 1; - } - - Ok(FsstViewByteStats { - elements, - runs, - distinct_spans, - live_compressed, - span_compressed, - live_uncompressed, - logical_uncompressed, - heap_compressed, + // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. + Ok(unsafe { + VarBinViewArray::new_unchecked( + views, + Arc::from(buffers), + array.dtype().clone(), + array.fsstview_validity(), + ) + .into_array() }) } -/// Canonicalize an [`FSSTView`] to a [`VarBinViewArray`] using an explicit compaction strategy. 
+/// Canonicalize an [`FSSTView`] to a [`VarBinArray`] (offsets + contiguous bytes) instead of a +/// [`VarBinViewArray`]. /// -/// Exposed (rather than only the dispatch-driven [`canonicalize_fsstview`]) so benchmarks can -/// measure each strategy directly. Production code goes through [`FsstViewCompaction::Auto`]. -pub fn canonicalize_fsstview_with( +/// Shares the element-ordered decode path with [`canonicalize_fsstview_with`]; the only difference +/// is the finisher, which builds `len + 1` cumulative offsets from the uncompressed lengths rather +/// than per-element views. Cheaper than a `VarBinViewArray` when the consumer wants offsets+bytes +/// (no per-element 16-byte view construction). +pub fn canonicalize_fsstview_to_varbin( array: ArrayView<'_, FSSTView>, strategy: FsstViewCompaction, ctx: &mut ExecutionCtx, ) -> VortexResult { + let decoded = decode_element_ordered(array, strategy, ctx)?; + + let varbin_offsets = cumulative_offsets(&decoded.ulen_prim); + let bytes = BufferHandle::new_host(decoded.uncompressed.freeze()); + // SAFETY: offsets are monotonic and end at the byte length; bytes are valid binary/UTF-8. + Ok(unsafe { + VarBinArray::new_unchecked_from_handle( + varbin_offsets.into_array(), + bytes, + array.dtype().clone(), + array.fsstview_validity(), + ) + .into_array() + }) +} + +/// The element-ordered decoded bytes plus the uncompressed-lengths array the finishers need. +struct Decoded { + uncompressed: ByteBufferMut, + ulen_prim: PrimitiveArray, +} + +/// Decode an [`FSSTView`]'s survivors into one element-ordered buffer using the chosen (or `Auto`) +/// strategy. Shared by the `VarBinView` and `VarBin` finishers. 
+fn decode_element_ordered( + array: ArrayView<'_, FSSTView>, + strategy: FsstViewCompaction, + ctx: &mut ExecutionCtx, +) -> VortexResult { let offsets = load_usize(array.codes_offsets(), ctx)?; let sizes = load_usize(array.codes_sizes(), ctx)?; @@ -265,9 +159,8 @@ pub fn canonicalize_fsstview_with( .uncompressed_lengths() .clone() .execute::(ctx)?; - // `total_size` is needed by every path; compute it directly from the typed slice. The widened - // `ulens: Vec` is only needed by the run/per-element decoders, so defer it until the - // strategy is chosen (Direct/GatherBulk don't need it at all). + // `total_size` is needed by every path; sum it from the typed slice. The widened + // `ulens: Vec` is only needed by `RunDecode`, so defer it. #[expect(clippy::cast_possible_truncation)] let total_size: usize = match_each_integer_ptype!(ulen_prim.ptype(), |P| { ulen_prim.as_slice::

().iter().map(|x| *x as usize).sum() @@ -278,15 +171,8 @@ pub fn canonicalize_fsstview_with( let heap = heap_buffer.as_slice(); let decompressor = array.decompressor(); - // Analyse the survivor layout once: a single contiguous run (Direct), monotonic-but-gapped - // (RunDecode candidate), or out of heap order (must gather). let layout = analyze_layout(&offsets, &sizes); let chosen = match strategy { - // The export heuristic. With monotonic offsets we can "export all in place" by decoding - // each contiguous run with no gather copy; this wins while the runs are few. Once survivors - // fragment into many tiny runs (a uniform-random filter), the per-run decode-tail overhead - // dominates and compacting the codes into one bulk decode (`GatherBulk`) wins instead. - // Non-monotonic layouts (a shuffle take) can't run-decode, so they always gather. FsstViewCompaction::Auto => match layout { Layout::Contiguous => FsstViewCompaction::Direct, Layout::Monotonic { runs } if runs <= offsets.len() / RUN_DECODE_MAX_RUN_FRACTION => { @@ -304,159 +190,54 @@ pub fn canonicalize_fsstview_with( other => other, }; - if chosen == FsstViewCompaction::RunDecode { - let ulens = widen_ulens(&ulen_prim); - let uncompressed = - decompress_run_decode(&decompressor, heap, &offsets, &sizes, &ulens, total_size); - let (buffers, views) = match_each_integer_ptype!(ulen_prim.ptype(), |P| { - build_views(0, MAX_BUFFER_LEN, uncompressed, ulen_prim.as_slice::

()) - }); - // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. - return Ok(unsafe { - VarBinViewArray::new_unchecked( - views, - Arc::from(buffers), - array.dtype().clone(), - array.fsstview_validity(), - ) - .into_array() - }); - } - - // RunCoalesce builds its own (buffers, views) — decompression order is decoupled from element - // order, so it can't go through `build_views` (which assumes element-order contiguous output). - if chosen == FsstViewCompaction::RunCoalesce { - let ulens = widen_ulens(&ulen_prim); - let (buffers, views) = - decompress_run_coalesce(&decompressor, heap, &offsets, &sizes, &ulens, total_size); - // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. - return Ok(unsafe { - VarBinViewArray::new_unchecked( - views, - Arc::from(buffers), - array.dtype().clone(), - array.fsstview_validity(), - ) - .into_array() - }); - } - let uncompressed = match chosen { FsstViewCompaction::Direct => { let start = offsets.first().copied().unwrap_or(0); decompress_direct(&decompressor, heap, start, live, total_size) } - FsstViewCompaction::GatherBulk => { - decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size) - } - // `Auto`/`RunCoalesce` are resolved above. - _ => { + FsstViewCompaction::RunDecode => { let ulens = widen_ulens(&ulen_prim); - decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) + decompress_run_decode(&decompressor, heap, &offsets, &sizes, &ulens, total_size) } + // `Auto` is resolved above; `GatherBulk` is the catch-all. + _ => decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size), }; - let (buffers, views) = match_each_integer_ptype!(ulen_prim.ptype(), |P| { - build_views(0, MAX_BUFFER_LEN, uncompressed, ulen_prim.as_slice::

()) - }); - - // SAFETY: FSST validates the bytes for binary/UTF-8; the views point at valid ranges. - Ok(unsafe { - VarBinViewArray::new_unchecked( - views, - Arc::from(buffers), - array.dtype().clone(), - array.fsstview_validity(), - ) - .into_array() + Ok(Decoded { + uncompressed, + ulen_prim, }) } -/// Canonicalize an [`FSSTView`] to a [`VarBinArray`] (offsets + contiguous bytes) instead of a -/// [`VarBinViewArray`]. -/// -/// Shares the decode path with [`canonicalize_fsstview_with`]: the strategies that produce an -/// element-ordered output (`Direct`/`GatherBulk`/`PerElement`) are reused as-is; the only -/// difference is the finisher, which builds `len + 1` cumulative offsets from the uncompressed -/// lengths rather than per-element views. `RunCoalesce` is not applicable (its output is heap- -/// ordered, not element-ordered) and is treated as `GatherBulk`. -/// -/// Exposed for benchmarking the export target (VarBin vs VarBinView). `Auto` resolves to `Direct` -/// when contiguous, else `GatherBulk`. -pub fn canonicalize_fsstview_to_varbin( - array: ArrayView<'_, FSSTView>, - strategy: FsstViewCompaction, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let offsets = load_usize(array.codes_offsets(), ctx)?; - let sizes = load_usize(array.codes_sizes(), ctx)?; +fn load_usize(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult> { + let prim = array.clone().execute::(ctx)?; + #[expect(clippy::cast_possible_truncation)] + let out: Vec = match_each_integer_ptype!(prim.ptype(), |P| { + prim.as_slice::

().iter().map(|x| *x as usize).collect() + }); + Ok(out) +} - let ulen_prim = array - .uncompressed_lengths() - .clone() - .execute::(ctx)?; +/// Build `len + 1` cumulative offsets over the uncompressed lengths (the `VarBin` offsets array), +/// directly from the typed slice. `push_unchecked` (capacity reserved) keeps this vectorized. +fn cumulative_offsets(ulen_prim: &PrimitiveArray) -> ArrayRef { let len = ulen_prim.len(); - - // Build `len + 1` cumulative offsets directly from the typed lengths slice (no widened Vec), - // and pick up `total_size` as the final running sum. `push_unchecked` (capacity reserved) keeps - // this vectorized. - let mut varbin_offsets = BufferMut::::with_capacity(len + 1); + let mut offsets = BufferMut::::with_capacity(len + 1); #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] - let total_size: usize = match_each_integer_ptype!(ulen_prim.ptype(), |P| { + let _: () = match_each_integer_ptype!(ulen_prim.ptype(), |P| { let mut acc: usize = 0; // SAFETY: `len + 1` slots reserved; we push exactly that many. - unsafe { varbin_offsets.push_unchecked(0) }; + unsafe { offsets.push_unchecked(0) }; for &ulen in ulen_prim.as_slice::

() { acc += ulen as usize; - unsafe { varbin_offsets.push_unchecked(acc as i64) }; - } - acc - }); - let live: usize = sizes.iter().sum(); - - let heap_buffer = array.codes_bytes(); - let heap = heap_buffer.as_slice(); - let decompressor = array.decompressor(); - - let contiguous = is_contiguous(&offsets, &sizes); - let uncompressed = match strategy { - FsstViewCompaction::PerElement => { - let ulens = widen_ulens(&ulen_prim); - decompress_per_element(&decompressor, heap, &offsets, &sizes, &ulens, total_size) + unsafe { offsets.push_unchecked(acc as i64) }; } - // Direct (or Auto) on a contiguous layout decodes the live range in place, no gather. - FsstViewCompaction::Direct | FsstViewCompaction::Auto if contiguous => { - let start = offsets.first().copied().unwrap_or(0); - decompress_direct(&decompressor, heap, start, live, total_size) - } - // Everything else uses the element-ordered (coalesced) gather + one bulk decode. - _ => decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size), - }; - - let bytes = BufferHandle::new_host(uncompressed.freeze()); - // SAFETY: offsets are monotonic and end at the byte length; bytes are valid binary/UTF-8. - Ok(unsafe { - VarBinArray::new_unchecked_from_handle( - varbin_offsets.into_array(), - bytes, - array.dtype().clone(), - array.fsstview_validity(), - ) - .into_array() - }) -} - -fn load_usize(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult> { - let prim = array.clone().execute::(ctx)?; - #[expect(clippy::cast_possible_truncation)] - let out: Vec = match_each_integer_ptype!(prim.ptype(), |P| { - prim.as_slice::

().iter().map(|x| *x as usize).collect() }); - Ok(out) + offsets.into_array() } -/// Widen an already-executed uncompressed-lengths primitive array into `Vec`. Only the -/// run/per-element decoders need this; `Direct`/`GatherBulk` work without it. +/// Widen an already-executed uncompressed-lengths primitive array into `Vec`. Only +/// `RunDecode` needs this; `Direct`/`GatherBulk` work without it. fn widen_ulens(ulen_prim: &PrimitiveArray) -> Vec { #[expect(clippy::cast_possible_truncation)] let out: Vec = match_each_integer_ptype!(ulen_prim.ptype(), |P| { @@ -469,40 +250,19 @@ fn widen_ulens(ulen_prim: &PrimitiveArray) -> Vec { out } -/// Returns true if the live codes occupy a single contiguous, in-order run of the heap. -fn is_contiguous(offsets: &[usize], sizes: &[usize]) -> bool { - let Some(&first) = offsets.first() else { - return true; - }; - let mut pos = first; - for (&offset, &size) in offsets.iter().zip(sizes) { - if offset != pos { - return false; - } - pos += size; - } - true -} - -/// `Auto` prefers `RunDecode` (export all in place) over `GatherBulk` (compact codes) while the -/// number of contiguous runs is at most `len / RUN_DECODE_MAX_RUN_FRACTION` — i.e. while survivors -/// average more than this many elements per run. Calibrated by the `db_*` benchmarks: clustered -/// and range selections sit well under this, uniform-random filters well over it. -const RUN_DECODE_MAX_RUN_FRACTION: usize = 4; - /// The survivor layout in the heap, used to pick an export strategy. enum Layout { /// Survivors are one contiguous in-order run (untouched / sliced view) — `Direct`. Contiguous, /// Offsets are strictly increasing but gapped: survivors form `runs` contiguous blocks. - /// Eligible for `RunDecode` (decode each run in place, no gather). + /// Eligible for `RunDecode`. Monotonic { runs: usize }, /// Offsets are out of heap order (e.g. a shuffle take) — must gather. 
Scattered, } -/// Classify the survivor layout in a single O(n) pass: are offsets monotonic, and how many -/// maximal contiguous runs do the (non-empty) survivors form? +/// Classify the survivor layout in a single O(n) pass: are offsets monotonic, and how many maximal +/// contiguous runs do the (non-empty) survivors form? fn analyze_layout(offsets: &[usize], sizes: &[usize]) -> Layout { let mut runs = 0usize; let mut gapped = false; @@ -522,10 +282,10 @@ fn analyze_layout(offsets: &[usize], sizes: &[usize]) -> Layout { } prev_end = Some(offset + size); } - if !gapped { - Layout::Contiguous - } else { + if gapped { Layout::Monotonic { runs } + } else { + Layout::Contiguous } } @@ -632,113 +392,3 @@ fn decompress_gather( unsafe { out.set_len(written) }; out } - -/// Coalesce survivors into contiguous heap runs, decompress each run once directly into the -/// output, and build `VarBinView`s (in element order) pointing back into that output. -/// -/// Distinct elements are keyed by their `(offset, size)` heap span: duplicates (from a `take` -/// with repeats) are decoded once and share a view. Adjacent distinct spans (`offset == prev end`) -/// are decompressed in a single FSST call, so a shuffle take of the whole array is one decode. -fn decompress_run_coalesce( - decompressor: &Decompressor<'_>, - heap: &[u8], - offsets: &[usize], - sizes: &[usize], - ulens: &[usize], - total_size: usize, -) -> (Vec, Buffer) { - let count = offsets.len(); - - // Visit elements in heap order. Sorting by `(offset, size)` groups duplicates (same span) - // together and, at a shared offset, orders the zero-size span (null/empty) before the - // non-zero one — keeping the run extension below well-defined. `size` is part of the key - // because a zero-size element shares an offset with its heap neighbour. - let mut order: Vec = (0..count).collect(); - order.sort_unstable_by_key(|&i| (offsets[i], sizes[i])); - - // Output position of each element's decoded bytes, filled below. 
- let mut out_pos = vec![0usize; count]; - let mut out = ByteBufferMut::with_capacity(total_size + 7); - let spare = out.spare_capacity_mut(); - - let mut written = 0usize; - let mut cursor = 0usize; - while cursor < count { - let head = order[cursor]; - // Zero-size spans (empty/null) decode to nothing; share the current position. - if sizes[head] == 0 { - out_pos[head] = written; - cursor += 1; - continue; - } - // Start a run at this span and extend it while the next *distinct* span is heap-adjacent. - // Duplicate spans (identical offset+size) reuse the position already assigned for the run. - let run_out_base = written; - let run_heap_start = offsets[head]; - let mut run_heap_end = run_heap_start; - let mut elem_out = written; - while cursor < count { - let elem = order[cursor]; - if sizes[elem] == 0 { - break; - } - if offsets[elem] == run_heap_end { - // A new distinct span that continues the run. - out_pos[elem] = elem_out; - elem_out += ulens[elem]; - run_heap_end += sizes[elem]; - cursor += 1; - } else if offsets[elem] < run_heap_end { - // A duplicate of a span already decoded in this run: reuse its position. Duplicates - // are contiguous in the sorted order, so the previous entry shares this span. - out_pos[elem] = out_pos[order[cursor - 1]]; - cursor += 1; - } else { - break; - } - } - // One decode for the whole run, straight into the output at `run_out_base`. - decompressor.decompress_into( - &heap[run_heap_start..run_heap_end], - &mut spare[run_out_base..], - ); - written = elem_out; - } - unsafe { out.set_len(written) }; - let bytes = out.freeze(); - - // Build views in element order, each pointing at its decoded output position. 
- let mut views = BufferMut::::with_capacity(count); - for (i, &ulen) in ulens.iter().enumerate() { - let pos = out_pos[i]; - #[expect(clippy::cast_possible_truncation)] - let view = BinaryView::make_view(&bytes[pos..pos + ulen], 0, pos as u32); - views.push(view); - } - - (vec![bytes], views.freeze()) -} - -/// Decompress each element's code slice directly into its place in the output (no compaction). -fn decompress_per_element( - decompressor: &Decompressor<'_>, - heap: &[u8], - offsets: &[usize], - sizes: &[usize], - ulens: &[usize], - total_size: usize, -) -> ByteBufferMut { - let mut out = ByteBufferMut::with_capacity(total_size + 7); - { - let spare = out.spare_capacity_mut(); - let mut uoff = 0; - for ((&offset, &size), &ulen) in offsets.iter().zip(sizes).zip(ulens) { - if size > 0 { - decompressor.decompress_into(&heap[offset..offset + size], &mut spare[uoff..]); - } - uoff += ulen; - } - } - unsafe { out.set_len(total_size) }; - out -} diff --git a/encodings/fsst/src/fsstview/mod.rs b/encodings/fsst/src/fsstview/mod.rs index c7529359bbd..f84affeb5eb 100644 --- a/encodings/fsst/src/fsstview/mod.rs +++ b/encodings/fsst/src/fsstview/mod.rs @@ -28,10 +28,8 @@ mod slice; mod tests; pub use array::*; -pub use canonical::FsstViewByteStats; pub use canonical::FsstViewCompaction; pub use canonical::canonicalize_fsstview_to_varbin; pub use canonical::canonicalize_fsstview_with; -pub use canonical::fsstview_byte_stats; pub use from_fsst::fsst_filter_to_view; pub use from_fsst::fsst_take_to_view; diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index b6a52901f20..c0decae0363 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -10,7 +10,6 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::dict::TakeExecute; -use vortex_array::arrays::filter::FilterKernel; use 
vortex_array::assert_arrays_eq; use vortex_array::compute::conformance::consistency::test_array_consistency; use vortex_array::compute::conformance::filter::test_filter_conformance; @@ -30,7 +29,6 @@ use crate::fsst_compress; use crate::fsst_filter_to_view; use crate::fsst_take_to_view; use crate::fsst_train_compressor; -use crate::fsstview_byte_stats; use crate::fsstview_from_fsst; fn make_fsstview( @@ -245,8 +243,7 @@ fn fsst_take_to_view_matches_canonical() -> VortexResult<()> { #[case(FsstViewCompaction::Auto)] #[case(FsstViewCompaction::Direct)] #[case(FsstViewCompaction::GatherBulk)] -#[case(FsstViewCompaction::PerElement)] -#[case(FsstViewCompaction::RunCoalesce)] +#[case(FsstViewCompaction::RunDecode)] fn compaction_strategies_agree(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let fsst = make_fsst(&SAMPLE, Nullability::NonNullable, &mut ctx); @@ -277,15 +274,13 @@ fn compaction_strategies_agree(#[case] strategy: FsstViewCompaction) -> VortexRe Ok(()) } -/// Adversarial coverage for `RunCoalesce`: a filter that punches gaps into the heap (so survivors -/// form multiple runs), then a shuffle take (reorders runs), over nullable data. Every strategy -/// must still agree with the canonical VarBin result. +/// Adversarial coverage: a filter that punches gaps into the heap (so survivors form multiple +/// runs), then a shuffle take (reorders runs, forcing `GatherBulk`), over nullable data. Every +/// strategy must still agree with the canonical result. #[rstest] #[case(FsstViewCompaction::Auto)] #[case(FsstViewCompaction::GatherBulk)] -#[case(FsstViewCompaction::RunCoalesce)] -#[case(FsstViewCompaction::PerElement)] -fn run_coalesce_gaps_and_shuffle(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { +fn gaps_and_shuffle_agree(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); // 12 distinct-ish strings, nullable. 
let strings: Vec> = vec![ @@ -373,113 +368,12 @@ fn run_decode_monotonic_filter(#[case] strategy: FsstViewCompaction) -> VortexRe Ok(()) } -/// Build a ~`target`-uncompressed-byte FSSTView of random short URL-ish strings. -fn make_big_view(target: usize, avg_len: usize, ctx: &mut ExecutionCtx) -> FSSTViewArray { - use rand::RngExt; - use rand::SeedableRng; - use rand::rngs::StdRng; - let mut rng = StdRng::seed_from_u64(1); - let words = [ - "https://", "example", "vortex", ".com/", "path", "value", "data", "alpha", - ]; - let count = target / avg_len; - let strings: Vec> = (0..count) - .map(|_| { - let mut s = String::new(); - while s.len() < avg_len { - s.push_str(words[rng.random_range(0..words.len())]); - } - s.truncate(avg_len); - s.into_bytes().into_boxed_slice() - }) - .collect(); - let varbin = VarBinArray::from_iter( - strings.into_iter().map(Some), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - let fsst = fsst_compress(&varbin, varbin.len(), varbin.dtype(), &compressor, ctx); - fsstview_from_fsst(&fsst, ctx).expect("fsstview_from_fsst") -} - -/// Reports the byte accounting (compressed and uncompressed) and the dead-byte waste a gap-merged -/// decode would carry, for representative selective filter / shuffle take / dense take. Run with -/// `cargo test -p vortex-fsst byte_stats_report -- --nocapture` to see the numbers. -#[test] -fn byte_stats_report() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let base = make_big_view(1 << 20, 16, &mut ctx); - let n = base.len(); - - // Selective filter (keep ~10%): many small gaps -> high compressed waste if merged. - let mut rng_keep = { - use rand::SeedableRng; - rand::rngs::StdRng::seed_from_u64(3) - }; - let mask = { - use rand::RngExt; - Mask::from_iter((0..n).map(|_| rng_keep.random_bool(0.10))) - }; - let filtered = ::filter(base.as_view(), &mask, &mut ctx)? 
- .unwrap() - .try_downcast::() - .ok() - .unwrap(); - - // Shuffle take: same elements, reordered -> one run, zero waste. - let mut perm: Vec = (0..n as u64).collect(); - { - use rand::RngExt; - use rand::SeedableRng; - let mut r = rand::rngs::StdRng::seed_from_u64(4); - for i in (1..perm.len()).rev() { - perm.swap(i, r.random_range(0..=i)); - } - } - let shuffled = ::take( - base.as_view(), - &PrimitiveArray::from_iter(perm).into_array(), - &mut ctx, - )? - .unwrap() - .try_downcast::() - .ok() - .unwrap(); - - for (label, view) in [("filter_10pct", &filtered), ("shuffle_take", &shuffled)] { - let s = fsstview_byte_stats(view.as_view(), &mut ctx)?; - // Waste if we instead merged *everything* into a single decode of the whole heap extent - // (the most aggressive gap-merge): all heap bytes minus the live ones are dead. - let full_merge_waste = if s.heap_compressed == 0 { - 0.0 - } else { - (s.heap_compressed - s.live_compressed) as f64 / s.heap_compressed as f64 - }; - println!( - "{label}: elements={} runs={} distinct={} \ - | compressed: live={}B span={}B heap={}B run_waste={:.1}% full_merge_waste={:.1}% \ - | uncompressed: live={}B logical={}B", - s.elements, - s.runs, - s.distinct_spans, - s.live_compressed, - s.span_compressed, - s.heap_compressed, - s.compressed_waste_ratio() * 100.0, - full_merge_waste * 100.0, - s.live_uncompressed, - s.logical_uncompressed, - ); - } - Ok(()) -} - -/// The VarBin exporter must agree with the canonical VarBin filter, across all element-ordered -/// strategies, for a gapped filter over nullable data. +/// The VarBin exporter must agree with the canonical VarBin filter, across the export strategies, +/// for a gapped filter over nullable data. 
#[rstest] #[case(FsstViewCompaction::Auto)] #[case(FsstViewCompaction::GatherBulk)] -#[case(FsstViewCompaction::PerElement)] +#[case(FsstViewCompaction::RunDecode)] fn varbin_export_matches_canonical(#[case] strategy: FsstViewCompaction) -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); let strings: Vec> = vec![ From 6b7a1b8b7a89e18ed44cdca70b80c3eda7394435 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 00:50:18 +0000 Subject: [PATCH 16/23] FSSTView: benchmark the real FineWeb query predicates Adds fsst_view_fineweb_queries, which materializes a string column under the actual WHERE predicates from the vortex-bench FineWeb queries (dump = ..., date LIKE '2020-10-%', url/text LIKE '%google%', '%espn%', '% vortex %', ...). Each predicate is evaluated once in DuckDB against the real HuggingFace 10BT sample to produce an authentic per-row selection mask (recipe in fineweb_queries_extract.py); the bench applies that mask to the FSST-compressed url/text column and decodes to a VarBinViewArray, fsst vs fsstview. No-ops if FINEWEB_DIR is unset. The real masks span the spectrum: clustered selections (dump_eq 7%/177 runs, date_prefix 12%) vs scattered LIKE-containment (google_or 2%/4046 runs) vs tiny (vortex 0.04%, espn ~0.08%). Real-query medians: fsst view text/date_prefix 63.4ms 43.9ms view 1.4x text/dump_eq 40.9ms 26.0ms view 1.6x text/google_or 26.8ms 21.4ms view 1.25x url/dump_eq 1.13ms 0.94ms view 1.2x url/google_and 30us 164us fsst (tiny, very selective) url/vortex 8us 140us fsst (tiny) Two regimes: on bulk-ish selections over the long text column the view wins (1.3-1.6x) by skipping fsst's per-op heap rewrite; on highly selective predicates over the short url column fsst wins because its filter rewrites an almost-empty heap while the view pays a fixed ~130us floor converting all 200k offsets before filtering discards >99% of them. 
Both are sub-millisecond there, but it confirms on real query masks that converting the whole column ahead of a very selective predicate is the view's one real weakness. Signed-off-by: Joe Isaacs --- encodings/fsst/Cargo.toml | 4 + .../fsst/benches/fineweb_queries_extract.py | 73 +++++++ .../fsst/benches/fsst_view_fineweb_queries.rs | 183 ++++++++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 encodings/fsst/benches/fineweb_queries_extract.py create mode 100644 encodings/fsst/benches/fsst_view_fineweb_queries.rs diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 40c96bbb78f..123bf853837 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -64,5 +64,9 @@ harness = false name = "fsst_view_fineweb" harness = false +[[bench]] +name = "fsst_view_fineweb_queries" +harness = false + [package.metadata.cargo-machete] ignored = ["fsst-rs"] diff --git a/encodings/fsst/benches/fineweb_queries_extract.py b/encodings/fsst/benches/fineweb_queries_extract.py new file mode 100644 index 00000000000..8ca1bed1ca1 --- /dev/null +++ b/encodings/fsst/benches/fineweb_queries_extract.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# One-time extraction for the `fsst_view_fineweb_queries` benchmark. +# +# Materializes, from the real HuggingFace FineWeb 10BT sample (the same file `vortex-bench` uses), +# the `url` and `text` string columns plus a per-row selection mask for each real benchmark +# predicate. Writes them as the simple length-prefixed / byte-per-row formats the bench reads. +# +# pip install duckdb +# python3 fineweb_queries_extract.py # -> /tmp/fw_url.bin, fw_text.bin, fw_mask_*.bin +# FINEWEB_DIR=/tmp cargo bench -p vortex-fsst --bench fsst_view_fineweb_queries +# +# The sample is ~2 GB; DuckDB streams it over HTTP range reads, so only the first N rows are read. 
+ +import os +import struct + +import duckdb + +SRC = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/v1.4.0/sample/10BT/001_00000.parquet" +N = 200_000 +OUT = os.environ.get("FINEWEB_DIR", "/tmp") + +# The row-selecting `WHERE` clauses of the `SELECT *` FineWeb queries in vortex-bench. +# (`file_path LIKE '%/CC-MAIN-2014-%'` matches zero rows in this sample, so it is omitted.) +QUERIES = { + "dump_eq": "dump = 'CC-MAIN-2016-30'", + "date_prefix": "date LIKE '2020-10-%'", + "google_and": "url LIKE '%google%' AND text LIKE '%Google%'", + "google_or": "url LIKE '%.google.%' OR text LIKE '% Google %'", + "vortex": "text LIKE '% vortex %'", + "espn_and": "url LIKE '%espn%' AND language = 'en' AND language_score > 0.92", + "espn_or": "url LIKE '%espn%' OR url LIKE '%www.espn.go.com%' OR url LIKE '%espn.go.com%'", +} + + +def main() -> None: + con = duckdb.connect() + con.execute("INSTALL httpfs; LOAD httpfs;") + con.execute( + f"""CREATE TABLE fw AS + SELECT row_number() OVER () AS rid, url, text, dump, date, file_path, + language, language_score + FROM read_parquet('{SRC}') LIMIT {N}""" + ) + + def dump_col(col: str) -> None: + rows = con.execute(f"SELECT {col} FROM fw ORDER BY rid").fetchall() + path = os.path.join(OUT, f"fw_{col}.bin") + with open(path, "wb") as f: + f.write(struct.pack(" {path}") + + dump_col("url") + dump_col("text") + + for name, pred in QUERIES.items(): + rids = {r[0] for r in con.execute(f"SELECT rid FROM fw WHERE {pred}").fetchall()} + path = os.path.join(OUT, f"fw_mask_{name}.bin") + with open(path, "wb") as f: + f.write(struct.pack("`: each one +//! evaluates a predicate to a row selection, then materializes the surviving rows. This bench does +//! exactly the materialization half — apply a real predicate's selection mask to an FSST-compressed +//! string column and decode it to a `VarBinViewArray` — comparing fsst (rewrites the code heap) +//! vs fsstview (metadata-only filter, decode once). +//! +//! 
The predicate masks and the string columns are produced once with DuckDB against the real +//! HuggingFace FineWeb 10BT sample (the same file `vortex-bench` uses). The ~2 GB sample is not +//! downloaded by the bench; the recipe is in `fineweb_queries_extract.py` next to this file, and +//! the resulting files are pointed at via env vars: +//! +//! ```text +//! FINEWEB_DIR=/tmp cargo bench -p vortex-fsst --bench fsst_view_fineweb_queries +//! ``` +//! +//! `FINEWEB_DIR` must contain `fw_url.bin`, `fw_text.bin` (length-prefixed: `u64` count, then per +//! row `u32` len + bytes) and `fw_mask_.bin` (`u64` count, then one byte per row, 1 = kept). +//! If `FINEWEB_DIR` is unset or files are missing, every bench no-ops so CI stays green. +//! +//! The real predicates span the spectrum the view's `Auto` export was built for: clustered +//! selections (`dump = ...`, `date LIKE '2020-10-%'`) where survivors form long runs, and scattered +//! `LIKE '%...%'` containment filters where they don't. + +#![expect(clippy::unwrap_used)] + +use std::path::PathBuf; + +use divan::Bencher; +use divan::black_box; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_fsst::FSST; +use vortex_fsst::FSSTArray; +use vortex_fsst::FSSTView; +use vortex_fsst::FsstViewCompaction; +use vortex_fsst::canonicalize_fsstview_with; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; +use vortex_fsst::fsstview_from_fsst; +use vortex_mask::Mask; + +fn main() { + divan::main(); +} + +/// Real FineWeb benchmark predicates that select rows (the `WHERE` clauses of the `SELECT *` +/// queries in `vortex-bench/src/fineweb`). 
`filepath` matches zero rows so it is omitted. +const QUERIES: &[&str] = &[ + "dump_eq", // dump = 'CC-MAIN-2016-30' — clustered, ~7% + "date_prefix", // date LIKE '2020-10-%' — clustered, ~12% + "google_and", // url LIKE '%google%' AND text LIKE — very selective, scattered + "google_or", // url/text LIKE '%google%' — scattered, ~2% + "vortex", // text LIKE '% vortex %' — tiny + "espn_and", // url LIKE '%espn%' AND lang/score — tiny + "espn_or", // url LIKE '%espn%' OR ... — tiny +]; + +/// The materialized string column. `url` is short (~72 B), `text` is long (~3 KB). +const COLUMNS: &[&str] = &["url", "text"]; + +fn dir() -> Option { + std::env::var_os("FINEWEB_DIR").map(PathBuf::from) +} + +/// Read a length-prefixed column dump into a `VarBinArray`. +fn load_column(name: &str) -> Option { + let path = dir()?.join(format!("fw_{name}.bin")); + if !path.exists() { + return None; + } + let bytes = std::fs::read(path).unwrap(); + let mut pos = 0usize; + #[expect(clippy::cast_possible_truncation)] + let rows = u64::from_le_bytes(bytes[0..8].try_into().unwrap()) as usize; + pos += 8; + let mut values: Vec>> = Vec::with_capacity(rows); + for _ in 0..rows { + let len = u32::from_le_bytes(bytes[pos..pos + 4].try_into().unwrap()) as usize; + pos += 4; + values.push(Some(bytes[pos..pos + len].to_vec())); + pos += len; + } + Some(VarBinArray::from_iter( + values.into_iter().map(|v| v.map(Vec::into_boxed_slice)), + DType::Utf8(Nullability::NonNullable), + )) +} + +/// Read a one-byte-per-row predicate mask. 
+fn load_mask(query: &str) -> Option { + let path = dir()?.join(format!("fw_mask_{query}.bin")); + if !path.exists() { + return None; + } + let bytes = std::fs::read(path).unwrap(); + #[expect(clippy::cast_possible_truncation)] + let rows = u64::from_le_bytes(bytes[0..8].try_into().unwrap()) as usize; + Some(Mask::from_iter((0..rows).map(|i| bytes[8 + i] != 0))) +} + +fn compress(varbin: &VarBinArray, ctx: &mut ExecutionCtx) -> FSSTArray { + let compressor = fsst_train_compressor(varbin); + fsst_compress(varbin, varbin.len(), varbin.dtype(), &compressor, ctx) +} + +#[derive(Clone, Copy)] +struct Case { + column: &'static str, + query: &'static str, +} + +impl std::fmt::Display for Case { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.column, self.query) + } +} + +fn cases() -> Vec { + let mut v = Vec::new(); + for &column in COLUMNS { + for &query in QUERIES { + v.push(Case { column, query }); + } + } + v +} + +#[divan::bench(args = cases())] +fn fsst(bencher: Bencher, case: Case) { + let (Some(varbin), Some(mask)) = (load_column(case.column), load_mask(case.query)) else { + return; + }; + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let filtered: ArrayRef = ::filter(fsst.as_view(), mask, ctx) + .unwrap() + .unwrap(); + black_box( + filtered + .execute::(ctx) + .unwrap() + .into_array(), + ) + }); +} + +#[divan::bench(args = cases())] +fn view(bencher: Bencher, case: Case) { + let (Some(varbin), Some(mask)) = (load_column(case.column), load_mask(case.query)) else { + return; + }; + let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); + bencher + .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) + .bench_refs(|(fsst, mask, ctx)| { + let view = fsstview_from_fsst(fsst, ctx).unwrap(); + let filtered = ::filter(view.as_view(), 
mask, ctx) + .unwrap() + .unwrap() + .try_downcast::() + .ok() + .unwrap(); + black_box( + canonicalize_fsstview_with(filtered.as_view(), FsstViewCompaction::Auto, ctx) + .unwrap(), + ) + }); +} From 648efca4abc3f4c4e515e4aeb04231b4cc8e4a84 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 09:42:09 +0000 Subject: [PATCH 17/23] FSSTView: add benches/README summarizing the three benchmarks + numbers Documents what each benchmark measures (fsst_view_compute synthetic shapes, fsst_view_fineweb real columns, fsst_view_fineweb_queries real query predicates), the workloads, and the headline median results, plus how Auto picks the decode strategy. Signed-off-by: Joe Isaacs --- encodings/fsst/benches/README.md | 102 +++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 encodings/fsst/benches/README.md diff --git a/encodings/fsst/benches/README.md b/encodings/fsst/benches/README.md new file mode 100644 index 00000000000..eb68bffd4c0 --- /dev/null +++ b/encodings/fsst/benches/README.md @@ -0,0 +1,102 @@ + + +# FSSTView benchmarks + +`FSSTView` is a ListView-style FSST: it addresses its compressed codes with separate +`offsets` + `sizes` arrays instead of a single monotonic offsets array. That makes +`filter` / `take` / `slice` **metadata-only** (they rewrite only the small +offsets/sizes/lengths/validity arrays and reuse the compressed byte heap), whereas plain +`FSST` delegates those ops to `VarBin` and **rewrites the whole compressed code heap** each +time. The cost moves to a single canonicalization (decode → `VarBinViewArray`) at the end. + +These benchmarks quantify that trade-off. All numbers are divan **medians**, 100 samples, on +one shared machine — treat them as directional; the relative ordering is stable. `fsst` = +stay in `FSST` (rewrite heap per op); `view` = convert to `FSSTView`, metadata-only ops, +decode once. + +## 1. `fsst_view_compute` — synthetic shapes + +Self-contained (no external data). 
~2 MiB of synthetic strings in two shapes — `ManyShort` +(~12 B) and `FewLong` (~256 B) — with a clustered 10 % filter and a sorted take. Two +workloads, each ending in a `VarBinViewArray`: + +- `single_filter_{fsst,view}` — one filter, then canonicalize. +- `chain_{fsst,view}` — convert once, then 5 alternating filter/take ops, canonicalize once + (the case the view is designed for). + +| workload | shape | fsst | view | speedup | +| --- | --- | --- | --- | --- | +| single_filter | ManyShort | 0.63 ms | 0.62 ms | ~1× | +| single_filter | FewLong | 65 µs | 53 µs | 1.2× | +| chain (5 ops) | ManyShort | 4.99 ms | 4.12 ms | 1.2× | +| chain (5 ops) | FewLong | 371 µs | 268 µs | 1.4× | + +Takeaway: the gap widens with chain length, because each `fsst` op rewrites the heap while +the view stays metadata-only and defers the single decode. + +## 2. `fsst_view_fineweb` — real columns + +Two real columns from the HuggingFace FineWeb 10BT sample: `url` (200 k rows, ~72 B avg) and +`text` (40 k rows, ~3 KB avg). The ~2 GB sample is not downloaded; columns are extracted once +with DuckDB into length-prefixed dumps (see the bench module docs). No-ops unless `FINEWEB_URL` +/ `FINEWEB_TEXT` point at the files. Same two workloads as above. + +| workload | column | fsst | view | speedup | +| --- | --- | --- | --- | --- | +| single_filter | url | 1.02 ms | 0.84 ms | 1.2× | +| single_filter | text | 5.81 ms | 4.38 ms | 1.3× | +| chain (5 ops) | url | 6.23 ms | 3.95 ms | 1.6× | +| chain (5 ops) | text | 44.2 ms | **5.16 ms** | **8.6×** | + +Takeaway: on real data the view wins every case, and decisively for chained ops over long +strings — `fsst` rewrites the ~hundreds-of-MB code heap on every op; the view decodes once. + +## 3. `fsst_view_fineweb_queries` — real query predicates + +The actual `vortex-bench` FineWeb queries are `SELECT * FROM fineweb WHERE <predicate>`. 
Each +predicate is evaluated once in DuckDB against the real sample to produce an authentic per-row +selection mask (recipe: `benches/fineweb_queries_extract.py`); the bench applies that mask to +the FSST-compressed `url`/`text` column and decodes to a `VarBinViewArray`. This is the +materialization half of a real query. No-ops unless `FINEWEB_DIR` points at the dumps. + +Mask shapes vary by predicate (over 200 k rows): `dump_eq` 7 %/177 runs and `date_prefix` +12 %/178 runs are clustered; `google_or` 2 %/4046 runs is scattered; `vortex`/`espn` are +~0.04–0.09 % and tiny. + +| query (selectivity) | column | fsst | view | winner | +| --- | --- | --- | --- | --- | +| date_prefix (12 %) | text | 63.4 ms | 43.9 ms | view 1.4× | +| dump_eq (7 %) | text | 40.9 ms | 26.0 ms | view 1.6× | +| google_or (2 %) | text | 26.8 ms | 21.4 ms | view 1.25× | +| dump_eq (7 %) | url | 1.13 ms | 0.94 ms | view 1.2× | +| date_prefix (12 %) | url | 1.67 ms | 1.36 ms | view 1.2× | +| google_or (2 %) | url | 407 µs | 468 µs | fsst | +| google_and (0.19 %) | url | 30 µs | 164 µs | fsst | +| vortex (0.04 %) | url | 8 µs | 140 µs | fsst | + +Takeaway — two regimes: + +- **Bulk-ish selections, and anything on the long `text` column → view wins (1.25–1.6×)** by + skipping the per-op heap rewrite. These are the queries that take tens of milliseconds. +- **Tiny, highly selective predicates on the short `url` column → fsst wins.** `fsst`'s filter + rewrites an almost-empty heap (cheap), while the view pays a fixed ~130 µs floor: + `fsstview_from_fsst` walks all 200 k offsets to derive the `sizes` array even though the + predicate keeps <0.2 % of rows. Both are sub-millisecond there, so it rarely matters, but it + is the view's one real weakness — converting the whole column ahead of a very selective + filter is wasted work. 
+ +## How `Auto` chooses the decode + +Canonicalization picks a decode strategy from the survivor layout (`FsstViewCompaction::Auto`): + +- **Direct** — survivors are one contiguous run (untouched / sliced): one bulk decode, no copy. +- **RunDecode** — offsets still monotonic with few runs (clustered/range filters, sorted + takes): decode each contiguous run straight into the element-ordered output, no gather copy. +- **GatherBulk** — scattered (shuffle take) or heavily fragmented (uniform-random filter): + compact the live codes into one buffer, then a single bulk decode. + +The threshold (`runs <= len / 4` → RunDecode, else GatherBulk) was calibrated with the +synthetic `fsst_view_compute` shapes. From a5b0df2a0ade5b0c28f2461a827f411b28e2485f Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 11:36:35 +0000 Subject: [PATCH 18/23] FSSTView: add handover doc + continuation prompt FSSTVIEW_HANDOVER.md summarizes the encoding, the three benchmarks and their median results, the Auto decode strategy, the known conversion-floor limitation, and the profiling methodology. FSSTVIEW_NEXT_PROMPT.md is a copy-paste prompt to continue the work (eliminate the selective-filter conversion floor). Signed-off-by: Joe Isaacs --- FSSTVIEW_HANDOVER.md | 124 ++++++++++++++++++++++++++++++++++++++++ FSSTVIEW_NEXT_PROMPT.md | 42 ++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 FSSTVIEW_HANDOVER.md create mode 100644 FSSTVIEW_NEXT_PROMPT.md diff --git a/FSSTVIEW_HANDOVER.md b/FSSTVIEW_HANDOVER.md new file mode 100644 index 00000000000..853a6dc0e4b --- /dev/null +++ b/FSSTVIEW_HANDOVER.md @@ -0,0 +1,124 @@ +# FSSTView — Handover + +## TL;DR + +Added a new **`FSSTView`** array encoding to Vortex: a ListView-style FSST that addresses its +compressed codes with separate `offsets` + `sizes` arrays instead of one monotonic offsets array. 
+This makes `filter` / `take` / `slice` **metadata-only** (rewrite only small index arrays, reuse +the compressed byte heap), where plain `FSST` rewrites the whole compressed code heap per op. The +decode cost moves to a single canonicalization at the end. + +- **Branch:** `claude/fsstview-array-listview-TdW45` (17 commits ahead of `develop`, pushed). +- **Status:** merge-ready. 107 tests pass, `clippy --all-targets --all-features` clean, + `cargo +nightly fmt` clean, `vortex-file` builds, doc tests pass. +- **No PR opened yet** (was waiting on explicit request). +- **Scope:** additive, contained in `encodings/fsst/` plus 2 registration lines in `vortex-file`. + +## What landed + +New encoding `vortex.fsstview` in `encodings/fsst/src/fsstview/`: + +| file | role | +| --- | --- | +| `array.rs` | encoding struct, `#[array_slots]` children (uncompressed_lengths, codes_offsets, codes_sizes, codes_validity), VTable, serde, `fsstview_from_fsst` conversion | +| `compute.rs` | metadata-only `FilterKernel` + `TakeExecute` | +| `ops.rs` | `scalar_at` | +| `slice.rs` | metadata-only `SliceReduce` | +| `from_fsst.rs` | `fsst_filter_to_view` / `fsst_take_to_view` helpers | +| `canonical.rs` | decode → `VarBinViewArray` / `VarBinArray`, with the `Auto` export strategy | +| `kernel.rs` / `rules.rs` | parent kernel + rule registration | +| `tests.rs` | conformance + agreement + nullable/gapped/RunDecode coverage | + +Registered in `vortex-file/src/lib.rs` (`register_default_encodings`). Public API: +`FSSTView`, `FSSTViewArray`, `FsstViewCompaction`, `canonicalize_fsstview_with`, +`canonicalize_fsstview_to_varbin`, `fsst_filter_to_view`, `fsst_take_to_view`, `fsstview_from_fsst`. + +## Canonicalization strategy (`FsstViewCompaction::Auto`) + +After metadata-only ops the survivors are scattered in the original heap; `Auto` picks how to +decode from the survivor layout: + +- **Direct** — one contiguous run (untouched / sliced): single bulk decode, no copy. 
+- **RunDecode** — offsets monotonic, few runs (clustered/range filters, sorted takes): decode each + contiguous run straight into the element-ordered output, no gather copy. Threshold: + `runs <= len / 4`. +- **GatherBulk** — scattered (shuffle take) or fragmented (uniform-random filter): compact live + codes into one buffer, single bulk decode. + +`RunDecode` and the gather coalescing came from the optimization work; `PerElement` and +`RunCoalesce` were explored, proven worse, and removed before merge. + +## Benchmarks & results + +Three benches in `encodings/fsst/benches/` (full write-up in `benches/README.md`). All numbers are +divan **medians**, 100 samples, single shared machine — directional, relative ordering stable. + +1. **`fsst_view_compute`** — synthetic, no external data. ~2 MiB strings, ManyShort (~12 B) / + FewLong (~256 B). Single filter and a 5-op chain → VarBinView. + - chain FewLong: fsst 371 µs → view **268 µs** (1.4×); chain ManyShort 4.99 ms → **4.12 ms**. + +2. **`fsst_view_fineweb`** — real FineWeb `url` (200k × ~72 B) and `text` (40k × ~3 KB) columns. + - single_filter text: 5.81 ms → **4.38 ms** (1.3×) + - chain text: 44.2 ms → **5.16 ms** (**8.6×**) ← headline + - chain url: 6.23 ms → **3.95 ms** (1.6×) + +3. **`fsst_view_fineweb_queries`** — the real `vortex-bench` query predicates (`dump = ...`, + `date LIKE '2020-10-%'`, `url/text LIKE '%google%'`, `'% vortex %'`, espn filters), evaluated + in DuckDB to authentic per-row masks, then materialize the column → VarBinView. + - text/date_prefix (12%): 63.4 ms → **43.9 ms** (1.4×) + - text/dump_eq (7%): 40.9 ms → **26.0 ms** (1.6×) + - url/vortex (0.04%): fsst **8 µs** vs view 140 µs + +**Two regimes:** the view wins everywhere the work is non-trivial (long `text` column, chained +ops, bulk selections) — up to 8.6×. 
It loses only on tiny highly-selective predicates over the +short `url` column, where it pays a fixed ~130 µs floor (the conversion walks all 200k offsets to +build `sizes` even though <0.2% survive). Those cases are all sub-millisecond. + +### Reproducing the FineWeb benches + +The ~2 GB sample is **not** downloaded by the benches. Extract columns + query masks once: + +```bash +pip install duckdb +python3 encodings/fsst/benches/fineweb_queries_extract.py # writes /tmp/fw_*.bin +FINEWEB_DIR=/tmp cargo bench -p vortex-fsst --bench fsst_view_fineweb_queries +# for the column bench: +FINEWEB_URL=/tmp/fw_url.bin FINEWEB_TEXT=/tmp/fw_text.bin \ + cargo bench -p vortex-fsst --bench fsst_view_fineweb +``` + +Benches no-op (CI-safe) when the env vars are unset. + +## Known limitation / next step + +The view's one weakness is the **fixed conversion cost on highly selective filters**: +`fsstview_from_fsst` derives the full `sizes` array (`offsets[i+1] - offsets[i]` over all rows) +even when a predicate keeps <1% of rows. Confirmed with samply + cachegrind: the conversion is the +top wall-clock cost on the `url`-selective queries (~130 µs floor), and the loop is already +SIMD-vectorized and memory-bandwidth-bound (it streams `len * 8` bytes for i64 offsets/sizes). + +Possible follow-ups, **not done** (would need care + their own benchmarks): +- Defer / lazily represent `sizes` so a selective filter doesn't materialize it for discarded rows. +- Store `sizes` in the narrowest int width (values are small; offsets are i64), cutting the + conversion's memory traffic. + +Both touch the representation that `filter`/`take` operate on, so they are not drop-in. 
+ +## Verification commands + +```bash +cargo nextest run -p vortex-fsst # (or cargo test -p vortex-fsst) — 107 pass +cargo clippy -p vortex-fsst --all-targets --all-features +cargo clippy -p vortex-file +cargo +nightly fmt --all +``` + +## Methodology notes (for whoever continues) + +- `perf` is unavailable in the dev sandbox (kernel mismatch). Use **samply** (set + `/proc/sys/kernel/perf_event_paranoid` to 1) for wall-clock sampling and **cachegrind** for + cache/instruction modeling. Build the profiled example with + `RUSTFLAGS="-C force-frame-pointers=yes -C debuginfo=2"` and resolve addresses with `addr2line`. +- Caution learned the hard way: **instruction count is not time.** A 12× instruction-count + reduction in the conversion barely moved wall-clock; always confirm with a sampling profiler and + a realistic workload (real FineWeb columns, real query masks), not synthetic micro-loops. diff --git a/FSSTVIEW_NEXT_PROMPT.md b/FSSTVIEW_NEXT_PROMPT.md new file mode 100644 index 00000000000..296cc86263a --- /dev/null +++ b/FSSTVIEW_NEXT_PROMPT.md @@ -0,0 +1,42 @@ +# Copy-paste prompt to continue the FSSTView work + +Paste the block below to a fresh agent session in the Vortex repo. + +--- + +Continue work on the `FSSTView` encoding in the Vortex repo. It's already implemented and +merge-ready on branch `claude/fsstview-array-listview-TdW45` (17 commits ahead of `develop`). +Read `FSSTVIEW_HANDOVER.md` and `encodings/fsst/benches/README.md` first for full context and +benchmark numbers. + +Background: `FSSTView` (in `encodings/fsst/src/fsstview/`) is a ListView-style FSST that stores +compressed codes addressed by separate `offsets` + `sizes` arrays, making `filter`/`take`/`slice` +metadata-only (no code-heap rewrite). On real FineWeb data the view wins up to 8.6× on chained +ops over long strings. 
Its one measured weakness: on highly selective predicates over short +columns it pays a fixed ~130 µs floor because `fsstview_from_fsst` derives the full `sizes` array +(over all rows) even when <1% survive. + +Task: eliminate that conversion floor without regressing the cases the view already wins. Approach: + +1. Confirm the current behaviour first: run + `python3 encodings/fsst/benches/fineweb_queries_extract.py` (needs `pip install duckdb`, network + to HuggingFace), then + `FINEWEB_DIR=/tmp cargo bench -p vortex-fsst --bench fsst_view_fineweb_queries`. Note the + `url/vortex`, `url/google_and`, `url/espn_*` rows where `view` trails `fsst`. +2. Implement a cheaper `sizes` representation so a selective filter doesn't materialize sizes for + discarded rows — e.g. derive `sizes` lazily from `offsets` at canonicalize time, or store it in + the narrowest int width that fits. `filter`/`take` currently filter a concrete `codes_sizes` + child array, so whatever you choose must keep those ops metadata-only and still composable + across a chain (do NOT fuse conversion into filter). +3. Prove it with the same methodology, not instruction counts: samply (set + `perf_event_paranoid=1`) for wall-clock and the real `fsst_view_fineweb_queries` bench. Show the + selective `url` queries improve AND the winning cases (`chain text`, `dump_eq`, `date_prefix`) + do not regress. +4. Keep it merge-clean: `cargo test -p vortex-fsst` (107 tests), `cargo clippy -p vortex-fsst + --all-targets --all-features`, `cargo +nightly fmt --all`. Add/adjust tests for any new + representation. Update `benches/README.md` and `FSSTVIEW_HANDOVER.md` with new numbers. Commit + with sign-off `Signed-off-by: Joe Isaacs ` and push to the same branch. + +Be rigorous about measurement: instruction count is not time, and synthetic micro-loops mislead — +always validate on the real FineWeb columns/query masks. If a change doesn't actually help the real +workload, say so and revert it rather than shipping it. 
From 4fdd21679016a5a59a216b3382f0b57419053169 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 13:01:08 +0000 Subject: [PATCH 19/23] FSSTView: store end offsets, eliminating the conversion floor Replace the per-element `codes_sizes` child with `codes_ends` (the end offset, `offset + size`). A freshly converted FSST heap is contiguous, so element `i` occupies `offsets[i]..offsets[i + 1]` and both addressing arrays are now zero-copy slices of the FSST's existing monotonic offsets buffer (`codes_offsets = offsets[0..len]`, `codes_ends = offsets[1..len + 1]`). `fsstview_from_fsst` therefore allocates and copies nothing and no longer materializes a per-row `sizes` array, so a selective `filter`/`take` that keeps a handful of rows never pays an O(rows) cost to derive sizes for the rows it discards. The per-element size is recovered as `codes_ends[i] - codes_offsets[i]` only at canonicalize / `scalar_at`, over the survivors only. `filter`/`take`/`slice` stay metadata-only and compose across a chain (they carry `codes_ends` alongside `codes_offsets`); the conversion is not fused into the filter. Same-machine before/after on the real `fsst_view_fineweb_queries` bench (divan medians): `url/vortex` 140 us -> 9.1 us, `url/espn_and` 146 us -> 14.9 us, `text/espn_and` 407 us -> 271 us (flips to a view win), while the previously winning clustered cases hold (`text/dump_eq` 25.3 ms, 1.68x; `text/date_prefix` 41.4 ms, 1.67x). The view now wins or ties every query in the matrix. 107 tests pass; clippy --all-targets --all-features clean; cargo +nightly fmt clean; vortex-file builds; doc tests pass. README and handover updated. 
Signed-off-by: Joe Isaacs --- FSSTVIEW_HANDOVER.md | 82 +++++++++++++------- encodings/fsst/benches/README.md | 76 +++++++++++++----- encodings/fsst/src/fsstview/array.rs | 99 ++++++++++++------------ encodings/fsst/src/fsstview/canonical.rs | 10 ++- encodings/fsst/src/fsstview/compute.rs | 12 +-- encodings/fsst/src/fsstview/from_fsst.rs | 6 +- encodings/fsst/src/fsstview/mod.rs | 25 +++--- encodings/fsst/src/fsstview/ops.rs | 4 +- encodings/fsst/src/fsstview/slice.rs | 2 +- 9 files changed, 195 insertions(+), 121 deletions(-) diff --git a/FSSTVIEW_HANDOVER.md b/FSSTVIEW_HANDOVER.md index 853a6dc0e4b..5eb004b4ecf 100644 --- a/FSSTVIEW_HANDOVER.md +++ b/FSSTVIEW_HANDOVER.md @@ -3,12 +3,18 @@ ## TL;DR Added a new **`FSSTView`** array encoding to Vortex: a ListView-style FSST that addresses its -compressed codes with separate `offsets` + `sizes` arrays instead of one monotonic offsets array. -This makes `filter` / `take` / `slice` **metadata-only** (rewrite only small index arrays, reuse -the compressed byte heap), where plain `FSST` rewrites the whole compressed code heap per op. The -decode cost moves to a single canonicalization at the end. - -- **Branch:** `claude/fsstview-array-listview-TdW45` (17 commits ahead of `develop`, pushed). +compressed codes with separate per-element `offsets` + `ends` arrays instead of one monotonic +offsets array. This makes `filter` / `take` / `slice` **metadata-only** (rewrite only small index +arrays, reuse the compressed byte heap), where plain `FSST` rewrites the whole compressed code heap +per op. The decode cost moves to a single canonicalization at the end. 
+ +Storing the per-element **end offset** (rather than the size) makes the `FSST` → `FSSTView` +conversion allocation-free — both addressing arrays are zero-copy slices of the FSST's existing +offsets — which **eliminated the conversion floor** that previously made the view 9–16× slower than +`fsst` on tiny highly selective `url` predicates (see "Conversion floor — resolved" below). + +- **Branch:** `claude/fsstview-conversion-floor-kRAeg` (built on the original + `claude/fsstview-array-listview-TdW45`). - **Status:** merge-ready. 107 tests pass, `clippy --all-targets --all-features` clean, `cargo +nightly fmt` clean, `vortex-file` builds, doc tests pass. - **No PR opened yet** (was waiting on explicit request). @@ -20,7 +26,7 @@ New encoding `vortex.fsstview` in `encodings/fsst/src/fsstview/`: | file | role | | --- | --- | -| `array.rs` | encoding struct, `#[array_slots]` children (uncompressed_lengths, codes_offsets, codes_sizes, codes_validity), VTable, serde, `fsstview_from_fsst` conversion | +| `array.rs` | encoding struct, `#[array_slots]` children (uncompressed_lengths, codes_offsets, codes_ends, codes_validity), VTable, serde, allocation-free `fsstview_from_fsst` conversion | | `compute.rs` | metadata-only `FilterKernel` + `TakeExecute` | | `ops.rs` | `scalar_at` | | `slice.rs` | metadata-only `SliceReduce` | @@ -64,15 +70,18 @@ divan **medians**, 100 samples, single shared machine — directional, relative 3. **`fsst_view_fineweb_queries`** — the real `vortex-bench` query predicates (`dump = ...`, `date LIKE '2020-10-%'`, `url/text LIKE '%google%'`, `'% vortex %'`, espn filters), evaluated - in DuckDB to authentic per-row masks, then materialize the column → VarBinView. 
- - text/date_prefix (12%): 63.4 ms → **43.9 ms** (1.4×) - - text/dump_eq (7%): 40.9 ms → **26.0 ms** (1.6×) - - url/vortex (0.04%): fsst **8 µs** vs view 140 µs - -**Two regimes:** the view wins everywhere the work is non-trivial (long `text` column, chained -ops, bulk selections) — up to 8.6×. It loses only on tiny highly-selective predicates over the -short `url` column, where it pays a fixed ~130 µs floor (the conversion walks all 200k offsets to -build `sizes` even though <0.2% survive). Those cases are all sub-millisecond. + in DuckDB to authentic per-row masks, then materialize the column → VarBinView. Numbers below + are a same-machine before/after (old `sizes` representation → new `ends` representation): + - text/date_prefix (12%): fsst 69.3 ms vs view **41.4 ms** (1.67×; was 41.0 ms — held) + - text/dump_eq (7%): fsst 42.6 ms vs view **25.3 ms** (1.68×; was 25.3 ms — held) + - url/vortex (0.04%): fsst 8.6 µs vs view **9.1 µs** (was view 140 µs — floor removed) + - url/espn_and (0.08%): fsst 14.5 µs vs view **14.9 µs** (was view 146 µs) + - text/espn_and (0.08%): fsst 284 µs vs view **271 µs** (was view 407 µs — flips to a view win) + +With the `ends` representation the view now **wins or ties every query** in the matrix: the bulk / +clustered / long-`text` cases still win by skipping the per-op heap rewrite (up to 1.68× here, 8.6× +on the chain bench), and the tiny highly selective predicates that used to lose to the conversion +floor now match `fsst` to within noise. Full table in `benches/README.md`. ### Reproducing the FineWeb benches @@ -89,20 +98,33 @@ FINEWEB_URL=/tmp/fw_url.bin FINEWEB_TEXT=/tmp/fw_text.bin \ Benches no-op (CI-safe) when the env vars are unset. -## Known limitation / next step - -The view's one weakness is the **fixed conversion cost on highly selective filters**: -`fsstview_from_fsst` derives the full `sizes` array (`offsets[i+1] - offsets[i]` over all rows) -even when a predicate keeps <1% of rows. 
Confirmed with samply + cachegrind: the conversion is the -top wall-clock cost on the `url`-selective queries (~130 µs floor), and the loop is already -SIMD-vectorized and memory-bandwidth-bound (it streams `len * 8` bytes for i64 offsets/sizes). - -Possible follow-ups, **not done** (would need care + their own benchmarks): -- Defer / lazily represent `sizes` so a selective filter doesn't materialize it for discarded rows. -- Store `sizes` in the narrowest int width (values are small; offsets are i64), cutting the - conversion's memory traffic. - -Both touch the representation that `filter`/`take` operate on, so they are not drop-in. +## Conversion floor — resolved + +The view's one previous weakness was a **fixed conversion cost on highly selective filters**: the +original `fsstview_from_fsst` derived a full `sizes` array (`offsets[i+1] - offsets[i]` over all +rows) even when a predicate kept <1% of rows. Samply + cachegrind had pinned this as the top +wall-clock cost (~130–150 µs floor) on the `url`-selective queries — a memory-bandwidth-bound loop +streaming `len * 8` bytes. + +**Fix (this branch): store the end offset, not the size.** `codes_sizes` was replaced by +`codes_ends`, where `codes_ends[i] = codes_offsets[i] + size[i]`. Because a freshly converted heap +is contiguous (element `i` occupies `offsets[i]..offsets[i+1]`), **both** addressing arrays are now +zero-copy slices of the FSST's existing monotonic offsets buffer +(`codes_offsets = offsets[0..len]`, `codes_ends = offsets[1..len+1]`). The conversion allocates and +copies nothing; no per-row `sizes` array is materialized, so a selective `filter`/`take` never pays +to derive sizes for the rows it discards. The per-element size is recovered as +`codes_ends[i] - codes_offsets[i]` only at canonicalize / `scalar_at`, over the survivors only. 
+ +This keeps `filter`/`take`/`slice` metadata-only and composable across a chain (they carry +`codes_ends` alongside `codes_offsets`); the conversion is **not** fused into the filter. Measured +result (same-machine before/after, `fsst_view_fineweb_queries`): `url/vortex` 140 µs → **9.1 µs**, +`url/espn_and` 146 µs → **14.9 µs**, and the previously winning clustered cases (`text/dump_eq`, +`text/date_prefix`) held flat. The view now wins or ties every query in the matrix. + +The alternative follow-up (store `sizes` in the narrowest int width) was considered and rejected: +it only halves the *write* traffic, leaving the unavoidable full read of the offsets — whereas the +`ends` representation removes the whole O(rows) pass. Narrowing widths is orthogonal and can still +be layered on the file layer's compression if desired. ## Verification commands diff --git a/encodings/fsst/benches/README.md b/encodings/fsst/benches/README.md index eb68bffd4c0..cb93a6a030a 100644 --- a/encodings/fsst/benches/README.md +++ b/encodings/fsst/benches/README.md @@ -66,27 +66,61 @@ Mask shapes vary by predicate (over 200 k rows): `dump_eq` 7 %/177 runs and `dat 12 %/178 runs are clustered; `google_or` 2 %/4046 runs is scattered; `vortex`/`espn` are ~0.04–0.09 % and tiny. -| query (selectivity) | column | fsst | view | winner | -| --- | --- | --- | --- | --- | -| date_prefix (12 %) | text | 63.4 ms | 43.9 ms | view 1.4× | -| dump_eq (7 %) | text | 40.9 ms | 26.0 ms | view 1.6× | -| google_or (2 %) | text | 26.8 ms | 21.4 ms | view 1.25× | -| dump_eq (7 %) | url | 1.13 ms | 0.94 ms | view 1.2× | -| date_prefix (12 %) | url | 1.67 ms | 1.36 ms | view 1.2× | -| google_or (2 %) | url | 407 µs | 468 µs | fsst | -| google_and (0.19 %) | url | 30 µs | 164 µs | fsst | -| vortex (0.04 %) | url | 8 µs | 140 µs | fsst | - -Takeaway — two regimes: - -- **Bulk-ish selections, and anything on the long `text` column → view wins (1.25–1.6×)** by - skipping the per-op heap rewrite. 
These are the queries that take tens of milliseconds. -- **Tiny, highly selective predicates on the short `url` column → fsst wins.** `fsst`'s filter - rewrites an almost-empty heap (cheap), while the view pays a fixed ~130 µs floor: - `fsstview_from_fsst` walks all 200 k offsets to derive the `sizes` array even though the - predicate keeps <0.2 % of rows. Both are sub-millisecond there, so it rarely matters, but it - is the view's one real weakness — converting the whole column ahead of a very selective - filter is wasted work. +The `view (before)` column is the original representation, which derived a full `sizes` array in +`fsstview_from_fsst` (one i64 per row, materialized over **all** 200 k rows regardless of +selectivity). The `view` column stores the per-element **end offset** instead — a zero-copy slice +of the FSST's existing monotonic offsets — so the conversion allocates nothing and a selective +predicate never pays to derive sizes for the rows it discards (see "Conversion is allocation-free" +below). `fsst` is unchanged by this work; its small run-to-run drift is machine noise (the two +measurement runs were back-to-back on a shared machine). 
+ +| query (selectivity) | column | fsst | view (before) | view | winner | +| --- | --- | --- | --- | --- | --- | +| date_prefix (12 %) | text | 69.3 ms | 41.0 ms | **41.4 ms** | view 1.67× | +| dump_eq (7 %) | text | 42.6 ms | 25.3 ms | **25.3 ms** | view 1.68× | +| google_or (2 %) | text | 23.9 ms | 23.7 ms | **19.8 ms** | view 1.2× | +| google_and (0.19 %) | text | 708 µs | 782 µs | **642 µs** | view | +| vortex (0.04 %) | text | 529 µs | 606 µs | **456 µs** | view | +| espn_and (0.08 %) | text | 284 µs | 407 µs | **271 µs** | view | +| espn_or (0.09 %) | text | 650 µs* | 418 µs | **281 µs** | view | +| date_prefix (12 %) | url | 1.68 ms | 1.39 ms | **1.25 ms** | view 1.34× | +| dump_eq (7 %) | url | 1.11 ms | 944 µs | **881 µs** | view 1.25× | +| google_or (2 %) | url | 398 µs | 478 µs | **331 µs** | view 1.2× | +| google_and (0.19 %) | url | 30.2 µs | 173 µs | **28.7 µs** | view | +| espn_and (0.08 %) | url | 14.5 µs | 146 µs | **14.9 µs** | ~tie | +| espn_or (0.09 %) | url | 16.4 µs | 152 µs | **16.0 µs** | ~tie | +| vortex (0.04 %) | url | 8.6 µs | 140 µs | **9.1 µs** | ~tie | + +(divan medians. `*` `text/espn_or` `fsst` was noisy that run — fastest 283 µs, mean 578 µs.) + +Takeaway: + +- **The conversion floor is gone.** Every highly selective `url` predicate that previously trailed + `fsst` by 9–16× — it paid a fixed ~140 µs to walk all 200 k offsets building `sizes` even when + <0.2 % of rows survived — now matches `fsst` to within noise (`url/vortex` 140 µs → **9.1 µs**, + `url/espn_and` 146 µs → **14.9 µs**). The same floor that quietly taxed the *short selective + `text`* predicates (`text/vortex`, `text/espn_*`, `text/google_and`) is also gone, flipping each + of those from an `fsst` win to a `view` win. 
+- **The winning cases do not regress.** The clustered/bulk selections the view was already built + for hold or improve: `text/dump_eq` and `text/date_prefix` stay at ~1.67–1.68× (the decode, not + the conversion, dominates them), while `url/date_prefix`, `url/dump_eq`, and both `google_or` + columns get a touch faster because the conversion no longer allocates. + +With the floor removed the view now wins or ties **every** query in this matrix. + +## Conversion is allocation-free + +`FSSTView` stores the per-element **end offset** (`codes_ends[i] = offset[i] + size[i]`) rather +than the size. A freshly converted heap is contiguous, so element `i` occupies +`offsets[i]..offsets[i + 1]`, which means **both** addressing arrays are zero-copy slices of the +FSST's existing monotonic offsets buffer: `codes_offsets = offsets[0..len]` and +`codes_ends = offsets[1..len + 1]`. `fsstview_from_fsst` therefore allocates and copies nothing — +in particular it never materializes a per-row `sizes` array, so a selective `filter`/`take` that +keeps a handful of rows no longer pays an O(rows) cost to derive sizes for the rows it discards. +The per-element size is recovered as `codes_ends[i] - codes_offsets[i]` only where it is needed +(canonicalize / `scalar_at`), over the survivors only. `filter`/`take`/`slice` stay metadata-only +and compose across a chain exactly as before — they now carry `codes_ends` alongside +`codes_offsets` instead of `codes_sizes`. 
## How `Auto` chooses the decode diff --git a/encodings/fsst/src/fsstview/array.rs b/encodings/fsst/src/fsstview/array.rs index eb989692ab4..7c4274f017d 100644 --- a/encodings/fsst/src/fsstview/array.rs +++ b/encodings/fsst/src/fsstview/array.rs @@ -20,7 +20,6 @@ use vortex_array::buffer::BufferHandle; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; -use vortex_array::match_each_integer_ptype; use vortex_array::serde::ArrayChildren; use vortex_array::smallvec::smallvec; use vortex_array::validity::Validity; @@ -29,7 +28,6 @@ use vortex_array::vtable::ValidityVTable; use vortex_array::vtable::child_to_validity; use vortex_array::vtable::validity_to_child; use vortex_buffer::Buffer; -use vortex_buffer::BufferMut; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; @@ -60,7 +58,7 @@ pub struct FSSTView; /// Declared with the [`array_slots`] proc macro, which generates the slot-index constants /// (`FSSTViewSlots::CODES_OFFSETS`, ...), the borrowed [`FSSTViewSlotsView`] struct, and the /// typed accessor trait [`FSSTViewArraySlotsExt`] (`.uncompressed_lengths()`, -/// `.codes_offsets()`, `.codes_sizes()`, `.codes_validity()`). +/// `.codes_offsets()`, `.codes_ends()`, `.codes_validity()`). #[array_slots(FSSTView)] pub struct FSSTViewSlots { /// Length of each original (uncompressed) value. Non-nullable integer. @@ -68,9 +66,17 @@ pub struct FSSTViewSlots { /// Start offset of each element's compressed bytecodes within the code heap. Non-nullable /// integer. Unlike `FSST`, these are **not** required to be monotonic or contiguous. pub codes_offsets: ArrayRef, - /// Length in bytes of each element's compressed bytecodes within the code heap. Non-nullable - /// integer. - pub codes_sizes: ArrayRef, + /// End offset of each element's compressed bytecodes within the code heap, i.e. + /// `offset + size`. Non-nullable integer. 
Element `i`'s bytecodes are + /// `codes_bytes[codes_offsets[i] .. codes_ends[i]]`. + /// + /// Storing the end offset (rather than the size) keeps the [`FSSTArray`] → [`FSSTView`] + /// conversion allocation-free: for a freshly converted array the heap is contiguous, so + /// `codes_ends` is a zero-copy slice of the monotonic offsets (`offsets[1..len + 1]`), exactly + /// as `codes_offsets` is `offsets[0..len]`. The per-element size is derived as + /// `codes_ends[i] - codes_offsets[i]` only where it is needed (canonicalize / `scalar_at`), + /// never materialized for rows a selective `filter`/`take` discards. + pub codes_ends: ArrayRef, /// Optional validity bitmap for the codes. Absent when the array is non-nullable. pub codes_validity: Option, } @@ -82,7 +88,7 @@ pub struct FSSTViewMetadata { #[prost(enumeration = "PType", tag = "2")] codes_offsets_ptype: i32, #[prost(enumeration = "PType", tag = "3")] - codes_sizes_ptype: i32, + codes_ends_ptype: i32, } impl FSSTViewMetadata { @@ -96,17 +102,18 @@ impl FSSTViewMetadata { .map_err(|_| vortex_err!("Invalid PType {}", self.codes_offsets_ptype)) } - fn get_codes_sizes_ptype(&self) -> VortexResult { - PType::try_from(self.codes_sizes_ptype) - .map_err(|_| vortex_err!("Invalid PType {}", self.codes_sizes_ptype)) + fn get_codes_ends_ptype(&self) -> VortexResult { + PType::try_from(self.codes_ends_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.codes_ends_ptype)) } } impl FSSTView { /// Build an [`FSSTViewArray`] from its decomposed components. /// - /// `codes_offsets[i]` and `codes_sizes[i]` address element `i`'s compressed bytecodes inside - /// `codes_bytes`. The offsets do not need to be sorted, contiguous, or non-overlapping. + /// `codes_offsets[i]` and `codes_ends[i]` address element `i`'s compressed bytecodes inside + /// `codes_bytes` as the range `codes_offsets[i]..codes_ends[i]`. The offsets do not need to be + /// sorted, contiguous, or non-overlapping. 
#[allow(clippy::too_many_arguments)] pub fn try_new( dtype: DType, @@ -114,7 +121,7 @@ impl FSSTView { symbol_lengths: Buffer, codes_bytes: BufferHandle, codes_offsets: ArrayRef, - codes_sizes: ArrayRef, + codes_ends: ArrayRef, uncompressed_lengths: ArrayRef, validity: Validity, ) -> VortexResult { @@ -123,7 +130,7 @@ impl FSSTView { &symbols, &symbol_lengths, &codes_offsets, - &codes_sizes, + &codes_ends, &uncompressed_lengths, &validity, &dtype, @@ -133,7 +140,7 @@ impl FSSTView { let slots = make_slots( uncompressed_lengths, codes_offsets, - codes_sizes, + codes_ends, &validity, len, ); @@ -156,7 +163,7 @@ impl FSSTView { symbol_lengths: Buffer, codes_bytes: BufferHandle, codes_offsets: ArrayRef, - codes_sizes: ArrayRef, + codes_ends: ArrayRef, uncompressed_lengths: ArrayRef, validity: Validity, ) -> FSSTViewArray { @@ -165,7 +172,7 @@ impl FSSTView { let slots = make_slots( uncompressed_lengths, codes_offsets, - codes_sizes, + codes_ends, &validity, len, ); @@ -178,30 +185,26 @@ impl FSSTView { } /// Convert a plain [`FSSTArray`] into an [`FSSTViewArray`], sharing the symbol table and the -/// compressed byte heap (zero-copy) and deriving `sizes[i] = offsets[i + 1] - offsets[i]`. +/// compressed byte heap (zero-copy) and addressing the codes with the FSST's existing monotonic +/// offsets. /// -/// The `offsets` (length `len + 1`) are reused for the view's `codes_offsets` by a zero-copy -/// slice of their first `len` elements; only the `sizes` array is freshly allocated. +/// A freshly converted view's heap is contiguous, so element `i` occupies `offsets[i]..offsets[i + +/// 1]`. Both addressing arrays are therefore **zero-copy slices of the same `offsets` buffer**: +/// `codes_offsets = offsets[0..len]` and `codes_ends = offsets[1..len + 1]`. 
Nothing is allocated +/// or copied — in particular the per-element size (`codes_ends[i] - codes_offsets[i]`) is never +/// materialized here, so a subsequent selective `filter`/`take` does not pay to derive sizes for +/// the rows it discards. This removes the conversion floor a very selective predicate used to hit. pub fn fsstview_from_fsst(fsst: &FSSTArray, ctx: &mut ExecutionCtx) -> VortexResult { let codes = fsst.codes(); let validity = codes.validity()?; let offsets = codes.offsets().clone().execute::(ctx)?; let len = offsets.len().saturating_sub(1); - // `sizes[i] = offsets[i + 1] - offsets[i]`, built from adjacent windows. `push_unchecked` (with - // the capacity reserved up front) avoids the per-element capacity recheck that `push` does and - // lets the loop vectorize — this conversion is otherwise dominated by the size derivation. - let codes_sizes = match_each_integer_ptype!(offsets.ptype(), |O| { - let offsets = offsets.as_slice::(); - let mut sizes = BufferMut::::with_capacity(len); - for w in offsets.windows(2) { - // SAFETY: `len` slots were reserved above and we push exactly `len` of them. - unsafe { sizes.push_unchecked(w[1] - w[0]) }; - } - sizes.into_array() - }); - // `codes_offsets` is the first `len` offsets — a zero-copy slice of the existing buffer. - let codes_offsets = offsets.into_array().slice(0..len)?; + // Both addressing arrays are zero-copy slices of the `len + 1` monotonic offsets: element `i`'s + // codes are `offsets[i]..offsets[i + 1]`, so `codes_ends` is simply the offsets shifted by one. 
+ let offsets = offsets.into_array(); + let codes_offsets = offsets.slice(0..len)?; + let codes_ends = offsets.slice(1..len + 1)?; FSSTView::try_new( fsst.dtype().clone(), @@ -209,7 +212,7 @@ pub fn fsstview_from_fsst(fsst: &FSSTArray, ctx: &mut ExecutionCtx) -> VortexRes fsst.symbol_lengths().clone(), fsst.codes_bytes_handle().clone(), codes_offsets, - codes_sizes, + codes_ends, fsst.uncompressed_lengths().clone(), validity, ) @@ -218,14 +221,14 @@ pub fn fsstview_from_fsst(fsst: &FSSTArray, ctx: &mut ExecutionCtx) -> VortexRes fn make_slots( uncompressed_lengths: ArrayRef, codes_offsets: ArrayRef, - codes_sizes: ArrayRef, + codes_ends: ArrayRef, validity: &Validity, len: usize, ) -> ArraySlots { smallvec![ Some(uncompressed_lengths), Some(codes_offsets), - Some(codes_sizes), + Some(codes_ends), validity_to_child(validity, len), ] } @@ -235,7 +238,7 @@ fn validate_fsstview( symbols: &Buffer, symbol_lengths: &Buffer, codes_offsets: &ArrayRef, - codes_sizes: &ArrayRef, + codes_ends: &ArrayRef, uncompressed_lengths: &ArrayRef, validity: &Validity, dtype: &DType, @@ -254,8 +257,8 @@ fn validate_fsstview( if codes_offsets.len() != len { vortex_bail!(InvalidArgument: "codes_offsets must have same len as outer array"); } - if codes_sizes.len() != len { - vortex_bail!(InvalidArgument: "codes_sizes must have same len as outer array"); + if codes_ends.len() != len { + vortex_bail!(InvalidArgument: "codes_ends must have same len as outer array"); } if uncompressed_lengths.len() != len { vortex_bail!(InvalidArgument: "uncompressed_lengths must have same len as outer array"); @@ -263,8 +266,8 @@ fn validate_fsstview( if !codes_offsets.dtype().is_int() || codes_offsets.dtype().is_nullable() { vortex_bail!(InvalidArgument: "codes_offsets must be non-nullable integer, found {}", codes_offsets.dtype()); } - if !codes_sizes.dtype().is_int() || codes_sizes.dtype().is_nullable() { - vortex_bail!(InvalidArgument: "codes_sizes must be non-nullable integer, found {}", 
codes_sizes.dtype()); + if !codes_ends.dtype().is_int() || codes_ends.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes_ends must be non-nullable integer, found {}", codes_ends.dtype()); } if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() { vortex_bail!(InvalidArgument: "uncompressed_lengths must be non-nullable integer, found {}", uncompressed_lengths.dtype()); @@ -311,7 +314,7 @@ impl VTable for FSSTView { data.symbols(), data.symbol_lengths(), view.codes_offsets, - view.codes_sizes, + view.codes_ends, view.uncompressed_lengths, &validity, dtype, @@ -350,7 +353,7 @@ impl VTable for FSSTView { uncompressed_lengths_ptype: PType::try_from(array.uncompressed_lengths().dtype())? as i32, codes_offsets_ptype: PType::try_from(array.codes_offsets().dtype())? as i32, - codes_sizes_ptype: PType::try_from(array.codes_sizes().dtype())? as i32, + codes_ends_ptype: PType::try_from(array.codes_ends().dtype())? as i32, } .encode_to_vec(), )) @@ -392,9 +395,9 @@ impl VTable for FSSTView { ), len, )?; - let codes_sizes = children.get( + let codes_ends = children.get( 2, - &DType::Primitive(metadata.get_codes_sizes_ptype()?, Nullability::NonNullable), + &DType::Primitive(metadata.get_codes_ends_ptype()?, Nullability::NonNullable), len, )?; @@ -410,7 +413,7 @@ impl VTable for FSSTView { &symbols, &symbol_lengths, &codes_offsets, - &codes_sizes, + &codes_ends, &uncompressed_lengths, &validity, dtype, @@ -421,7 +424,7 @@ impl VTable for FSSTView { let slots = make_slots( uncompressed_lengths, codes_offsets, - codes_sizes, + codes_ends, &validity, len, ); diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index ea7cff6d201..16e113132d4 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -153,7 +153,15 @@ fn decode_element_ordered( ctx: &mut ExecutionCtx, ) -> VortexResult { let offsets = load_usize(array.codes_offsets(), ctx)?; - let sizes 
= load_usize(array.codes_sizes(), ctx)?; + // `codes_ends[i]` is element `i`'s end in the heap; derive the per-element size here (over the + // survivors only — never the discarded rows). Downstream layout analysis and decode work on + // `sizes` exactly as before. + let ends = load_usize(array.codes_ends(), ctx)?; + let sizes: Vec = offsets + .iter() + .zip(&ends) + .map(|(&offset, &end)| end - offset) + .collect(); let ulen_prim = array .uncompressed_lengths() diff --git a/encodings/fsst/src/fsstview/compute.rs b/encodings/fsst/src/fsstview/compute.rs index 5e3dcc2395e..21b11c7b3d0 100644 --- a/encodings/fsst/src/fsstview/compute.rs +++ b/encodings/fsst/src/fsstview/compute.rs @@ -3,7 +3,7 @@ //! Metadata-only `filter` and `take` for [`FSSTView`]. //! -//! Both operations rewrite only the small `offsets`/`sizes`/`uncompressed_lengths`/`validity` +//! Both operations rewrite only the small `offsets`/`ends`/`uncompressed_lengths`/`validity` //! arrays and reuse the compressed byte heap (and symbol table) untouched. This is the core //! "ListView speed" win over plain [`FSST`][crate::FSST], whose `filter`/`take` delegate to //! `VarBin` and therefore rewrite the entire compressed heap. @@ -32,10 +32,10 @@ impl FilterKernel for FSSTView { // Filter only the addressing arrays; the byte heap and symbol table are reused as-is. let validity = array.fsstview_validity().filter(mask)?; let codes_offsets = array.codes_offsets().filter(mask.clone())?; - let codes_sizes = array.codes_sizes().filter(mask.clone())?; + let codes_ends = array.codes_ends().filter(mask.clone())?; let uncompressed_lengths = array.uncompressed_lengths().filter(mask.clone())?; - // SAFETY: filter preserves all `FSSTView` invariants — offsets/sizes/lengths stay + // SAFETY: filter preserves all `FSSTView` invariants — offsets/ends/lengths stay // non-nullable and equal-length, and validity tracks nullness separately. 
Ok(Some( unsafe { @@ -45,7 +45,7 @@ impl FilterKernel for FSSTView { array.symbol_lengths().clone(), array.codes_bytes_handle().clone(), codes_offsets, - codes_sizes, + codes_ends, uncompressed_lengths, validity, ) @@ -74,7 +74,7 @@ impl TakeExecute for FSSTView { // zero — nullness itself is tracked separately by `validity`. let fill = indices.dtype().is_nullable(); let codes_offsets = take_child(array.codes_offsets(), indices, fill)?; - let codes_sizes = take_child(array.codes_sizes(), indices, fill)?; + let codes_ends = take_child(array.codes_ends(), indices, fill)?; let uncompressed_lengths = take_child(array.uncompressed_lengths(), indices, fill)?; // SAFETY: take preserves all `FSSTView` invariants (see `filter`). @@ -86,7 +86,7 @@ impl TakeExecute for FSSTView { array.symbol_lengths().clone(), array.codes_bytes_handle().clone(), codes_offsets, - codes_sizes, + codes_ends, uncompressed_lengths, validity, ) diff --git a/encodings/fsst/src/fsstview/from_fsst.rs b/encodings/fsst/src/fsstview/from_fsst.rs index f83785d5cdf..10ed6add2d0 100644 --- a/encodings/fsst/src/fsstview/from_fsst.rs +++ b/encodings/fsst/src/fsstview/from_fsst.rs @@ -4,9 +4,9 @@ //! Metadata-only `filter`/`take` that go straight from an [`FSSTArray`] to an [`FSSTViewArray`]. //! //! These are the "first hop" of the view pipeline. They never touch the compressed byte heap: -//! the [`FSSTArray`] is reinterpreted as an [`FSSTViewArray`] (sharing symbols + codes bytes, -//! deriving `sizes` from the consecutive offsets) and then the selection is applied to the small -//! `offsets`/`sizes`/`lengths`/`validity` arrays only. +//! the [`FSSTArray`] is reinterpreted as an [`FSSTViewArray`] (sharing symbols + codes bytes, and +//! addressing the codes with zero-copy slices of the existing offsets) and then the selection is +//! applied to the small `offsets`/`ends`/`lengths`/`validity` arrays only. 
use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; diff --git a/encodings/fsst/src/fsstview/mod.rs b/encodings/fsst/src/fsstview/mod.rs index f84affeb5eb..d2c3a574ad9 100644 --- a/encodings/fsst/src/fsstview/mod.rs +++ b/encodings/fsst/src/fsstview/mod.rs @@ -5,16 +5,23 @@ //! //! Where [`FSST`][crate::FSST] addresses its compressed codes with a single monotonic //! offsets array (`len + 1` offsets, exactly like `VarBin`/`List`), [`FSSTView`] addresses -//! them with a pair of `offsets` **and** `sizes` arrays (exactly like -//! [`ListView`][vortex_array::arrays::ListView]). Element `i`'s compressed bytecodes live in -//! `codes_bytes[offsets[i] .. offsets[i] + sizes[i]]`. +//! them with a pair of per-element `offsets` **and** `ends` arrays (the ListView idea, storing +//! the end offset rather than the size — see [`ListView`][vortex_array::arrays::ListView]). +//! Element `i`'s compressed bytecodes live in `codes_bytes[offsets[i] .. ends[i]]`, and its size +//! is the derived `ends[i] - offsets[i]`. //! -//! Decoupling the start (`offset`) from the length (`size`) means the offsets are no longer -//! required to be monotonic or contiguous, so `filter`, `take`, and `slice` become -//! metadata-only operations: they rewrite only the (small) `offsets`/`sizes`/lengths/validity -//! arrays and **reuse the compressed byte heap untouched**. The plain [`FSST`][crate::FSST] -//! encoding has to rewrite the entire compressed heap for `filter`/`take` because it delegates -//! to `VarBin`. This is the same trade-off `ListView` makes over `List`. +//! Decoupling the start (`offset`) from the end means the offsets are no longer required to be +//! monotonic or contiguous, so `filter`, `take`, and `slice` become metadata-only operations: +//! they rewrite only the (small) `offsets`/`ends`/lengths/validity arrays and **reuse the +//! compressed byte heap untouched**. The plain [`FSST`][crate::FSST] encoding has to rewrite the +//! 
entire compressed heap for `filter`/`take` because it delegates to `VarBin`. This is the same +//! trade-off `ListView` makes over `List`. +//! +//! Storing the *end* offset (instead of the size) additionally makes the [`FSSTArray`] → +//! [`FSSTViewArray`] conversion allocation-free: a freshly converted heap is contiguous, so both +//! `offsets` and `ends` are zero-copy slices of the FSST's monotonic offsets buffer +//! (`offsets[0..len]` and `offsets[1..len + 1]`). A selective `filter`/`take` therefore never +//! pays to derive sizes for the rows it discards. mod array; mod canonical; diff --git a/encodings/fsst/src/fsstview/ops.rs b/encodings/fsst/src/fsstview/ops.rs index 46fef3554bb..10f31ef1aa9 100644 --- a/encodings/fsst/src/fsstview/ops.rs +++ b/encodings/fsst/src/fsstview/ops.rs @@ -20,9 +20,9 @@ impl OperationsVTable for FSSTView { ) -> VortexResult { // Preconditions (see `OperationsVTable`): `index` is in bounds and non-null. let offset: usize = (&array.codes_offsets().execute_scalar(index, ctx)?).try_into()?; - let size: usize = (&array.codes_sizes().execute_scalar(index, ctx)?).try_into()?; + let end: usize = (&array.codes_ends().execute_scalar(index, ctx)?).try_into()?; - let compressed = &array.codes_bytes()[offset..offset + size]; + let compressed = &array.codes_bytes()[offset..end]; let decoded = ByteBuffer::from(array.decompressor().decompress(compressed)); Ok(varbin_scalar(decoded, array.dtype())) } diff --git a/encodings/fsst/src/fsstview/slice.rs b/encodings/fsst/src/fsstview/slice.rs index 4b77ce16e9f..f3100032237 100644 --- a/encodings/fsst/src/fsstview/slice.rs +++ b/encodings/fsst/src/fsstview/slice.rs @@ -25,7 +25,7 @@ impl SliceReduce for FSSTView { array.symbol_lengths().clone(), array.codes_bytes_handle().clone(), array.codes_offsets().slice(range.clone())?, - array.codes_sizes().slice(range.clone())?, + array.codes_ends().slice(range.clone())?, array.uncompressed_lengths().slice(range.clone())?, 
array.fsstview_validity().slice(range)?, ) From 6f6c8e6442c799315a7646fe098f5c5de91a0191 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 13:46:24 +0000 Subject: [PATCH 20/23] FSSTView: trim decode-path allocations, redundant validation, and a bench Hot-path cleanups on top of the codes_ends representation: - canonicalize: derive each survivor's size in place from the widened `ends` buffer instead of allocating a third index Vec, and sum `live` (total compressed bytes) only on the bulk-decode paths that use it rather than unconditionally up front (RunDecode never needs it). - fsstview_from_fsst: construct via `new_unchecked`. Every FSSTView invariant is already guaranteed by the source FSSTArray, so re-running `validate_fsstview` on the hot conversion path is wasted work. Trim the test/bench surface for merge: - Drop the `fsst_view_fineweb` bench: its multi-op chain is already covered synthetically by `fsst_view_compute`, and its column materialization overlaps `fsst_view_fineweb_queries`. - Remove the filter/take/slice `*_matches_canonical` smoke tests; the framework conformance tests and the strategy-agreement tests already cover those paths. 104 tests pass; clippy --all-targets --all-features clean; cargo +nightly fmt clean; vortex-file builds; doc tests pass. README and handover updated. 
Signed-off-by: Joe Isaacs --- FSSTVIEW_HANDOVER.md | 33 +-- encodings/fsst/Cargo.toml | 4 - encodings/fsst/benches/README.md | 19 +- encodings/fsst/benches/fsst_view_fineweb.rs | 295 -------------------- encodings/fsst/src/fsstview/array.rs | 27 +- encodings/fsst/src/fsstview/canonical.rs | 25 +- encodings/fsst/src/fsstview/tests.rs | 66 ----- 7 files changed, 45 insertions(+), 424 deletions(-) delete mode 100644 encodings/fsst/benches/fsst_view_fineweb.rs diff --git a/FSSTVIEW_HANDOVER.md b/FSSTVIEW_HANDOVER.md index 5eb004b4ecf..d340c552118 100644 --- a/FSSTVIEW_HANDOVER.md +++ b/FSSTVIEW_HANDOVER.md @@ -15,7 +15,7 @@ offsets — which **eliminated the conversion floor** that previously made the v - **Branch:** `claude/fsstview-conversion-floor-kRAeg` (built on the original `claude/fsstview-array-listview-TdW45`). -- **Status:** merge-ready. 107 tests pass, `clippy --all-targets --all-features` clean, +- **Status:** merge-ready. 104 tests pass, `clippy --all-targets --all-features` clean, `cargo +nightly fmt` clean, `vortex-file` builds, doc tests pass. - **No PR opened yet** (was waiting on explicit request). - **Scope:** additive, contained in `encodings/fsst/` plus 2 registration lines in `vortex-file`. @@ -56,19 +56,15 @@ decode from the survivor layout: ## Benchmarks & results -Three benches in `encodings/fsst/benches/` (full write-up in `benches/README.md`). All numbers are +Two benches in `encodings/fsst/benches/` (full write-up in `benches/README.md`). All numbers are divan **medians**, 100 samples, single shared machine — directional, relative ordering stable. -1. **`fsst_view_compute`** — synthetic, no external data. ~2 MiB strings, ManyShort (~12 B) / - FewLong (~256 B). Single filter and a 5-op chain → VarBinView. +1. **`fsst_view_compute`** — synthetic, no external data, **runs in CI**. ~2 MiB strings, ManyShort + (~12 B) / FewLong (~256 B). Single filter and a 5-op chain → VarBinView. 
The chain is where the + view's advantage compounds (each `fsst` op re-rewrites the heap; the view stays metadata-only): - chain FewLong: fsst 371 µs → view **268 µs** (1.4×); chain ManyShort 4.99 ms → **4.12 ms**. -2. **`fsst_view_fineweb`** — real FineWeb `url` (200k × ~72 B) and `text` (40k × ~3 KB) columns. - - single_filter text: 5.81 ms → **4.38 ms** (1.3×) - - chain text: 44.2 ms → **5.16 ms** (**8.6×**) ← headline - - chain url: 6.23 ms → **3.95 ms** (1.6×) - -3. **`fsst_view_fineweb_queries`** — the real `vortex-bench` query predicates (`dump = ...`, +2. **`fsst_view_fineweb_queries`** — the real `vortex-bench` query predicates (`dump = ...`, `date LIKE '2020-10-%'`, `url/text LIKE '%google%'`, `'% vortex %'`, espn filters), evaluated in DuckDB to authentic per-row masks, then materialize the column → VarBinView. Numbers below are a same-machine before/after (old `sizes` representation → new `ends` representation): @@ -79,24 +75,21 @@ divan **medians**, 100 samples, single shared machine — directional, relative - text/espn_and (0.08%): fsst 284 µs vs view **271 µs** (was view 407 µs — flips to a view win) With the `ends` representation the view now **wins or ties every query** in the matrix: the bulk / -clustered / long-`text` cases still win by skipping the per-op heap rewrite (up to 1.68× here, 8.6× -on the chain bench), and the tiny highly selective predicates that used to lose to the conversion -floor now match `fsst` to within noise. Full table in `benches/README.md`. +clustered / long-`text` cases still win by skipping the per-op heap rewrite (up to 1.68× here), and +the tiny highly selective predicates that used to lose to the conversion floor now match `fsst` to +within noise. Full table in `benches/README.md`. -### Reproducing the FineWeb benches +### Reproducing the FineWeb queries bench -The ~2 GB sample is **not** downloaded by the benches. Extract columns + query masks once: +The ~2 GB sample is **not** downloaded by the bench. 
Extract columns + query masks once: ```bash pip install duckdb python3 encodings/fsst/benches/fineweb_queries_extract.py # writes /tmp/fw_*.bin FINEWEB_DIR=/tmp cargo bench -p vortex-fsst --bench fsst_view_fineweb_queries -# for the column bench: -FINEWEB_URL=/tmp/fw_url.bin FINEWEB_TEXT=/tmp/fw_text.bin \ - cargo bench -p vortex-fsst --bench fsst_view_fineweb ``` -Benches no-op (CI-safe) when the env vars are unset. +The bench no-ops (CI-safe) when `FINEWEB_DIR` is unset. ## Conversion floor — resolved @@ -129,7 +122,7 @@ be layered on the file layer's compression if desired. ## Verification commands ```bash -cargo nextest run -p vortex-fsst # (or cargo test -p vortex-fsst) — 107 pass +cargo nextest run -p vortex-fsst # (or cargo test -p vortex-fsst) — 104 pass cargo clippy -p vortex-fsst --all-targets --all-features cargo clippy -p vortex-file cargo +nightly fmt --all diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 123bf853837..e2881ae4742 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -60,10 +60,6 @@ required-features = ["_test-harness"] name = "fsst_view_compute" harness = false -[[bench]] -name = "fsst_view_fineweb" -harness = false - [[bench]] name = "fsst_view_fineweb_queries" harness = false diff --git a/encodings/fsst/benches/README.md b/encodings/fsst/benches/README.md index cb93a6a030a..ec02eeba992 100644 --- a/encodings/fsst/benches/README.md +++ b/encodings/fsst/benches/README.md @@ -37,24 +37,7 @@ workloads, each ending in a `VarBinViewArray`: Takeaway: the gap widens with chain length, because each `fsst` op re-rewrites the heap while the view stays metadata-only and defers the single decode. -## 2. `fsst_view_fineweb` — real columns - -Two real columns from the HuggingFace FineWeb 10BT sample: `url` (200 k rows, ~72 B avg) and -`text` (40 k rows, ~3 KB avg). The ~2 GB sample is not downloaded; columns are extracted once -with DuckDB into length-prefixed dumps (see the bench module docs). 
No-ops unless `FINEWEB_URL` -/ `FINEWEB_TEXT` point at the files. Same two workloads as above. - -| workload | column | fsst | view | speedup | -| --- | --- | --- | --- | --- | -| single_filter | url | 1.02 ms | 0.84 ms | 1.2× | -| single_filter | text | 5.81 ms | 4.38 ms | 1.3× | -| chain (5 ops) | url | 6.23 ms | 3.95 ms | 1.6× | -| chain (5 ops) | text | 44.2 ms | **5.16 ms** | **8.6×** | - -Takeaway: on real data the view wins every case, and decisively for chained ops over long -strings — `fsst` rewrites the ~hundreds-of-MB code heap on every op; the view decodes once. - -## 3. `fsst_view_fineweb_queries` — real query predicates +## 2. `fsst_view_fineweb_queries` — real query predicates The actual `vortex-bench` FineWeb queries are `SELECT * FROM fineweb WHERE `. Each predicate is evaluated once in DuckDB against the real sample to produce an authentic per-row diff --git a/encodings/fsst/benches/fsst_view_fineweb.rs b/encodings/fsst/benches/fsst_view_fineweb.rs deleted file mode 100644 index 0249e424101..00000000000 --- a/encodings/fsst/benches/fsst_view_fineweb.rs +++ /dev/null @@ -1,295 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! FSST vs FSSTView on **real FineWeb columns** (not synthetic data). -//! -//! The HuggingFace FineWeb `10BT` sample is ~2 GB, so this bench does not download it. Instead it -//! reads two length-prefixed binary dumps of real columns, produced once with DuckDB: -//! -//! ```text -//! python3 - <<'PY' -//! import duckdb, struct -//! con = duckdb.connect(); con.execute("INSTALL httpfs; LOAD httpfs;") -//! url = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/v1.4.0/sample/10BT/001_00000.parquet" -//! con.execute(f"COPY (SELECT url, text FROM read_parquet('{url}') LIMIT 200000) TO '/tmp/s.parquet' (FORMAT PARQUET)") -//! def dump(col, path, limit): -//! 
rows = con.execute(f"SELECT {col} FROM read_parquet('/tmp/s.parquet') WHERE {col} IS NOT NULL LIMIT {limit}").fetchall() -//! with open(path, "wb") as f: -//! f.write(struct.pack(") -> std::fmt::Result { - f.write_str(match self { - Column::Url => "url", - Column::Text => "text", - }) - } -} - -impl Column { - fn env_var(self) -> &'static str { - match self { - Column::Url => "FINEWEB_URL", - Column::Text => "FINEWEB_TEXT", - } - } - - fn path(self) -> Option { - std::env::var_os(self.env_var()) - .map(PathBuf::from) - .filter(|p| p.exists()) - } -} - -const COLUMNS: &[Column] = &[Column::Url, Column::Text]; - -/// Read a length-prefixed dump into a `VarBinArray`. Returns `None` if the column isn't configured -/// (so the bench no-ops cleanly when the data isn't present). -fn load_column(col: Column) -> Option { - let bytes = std::fs::read(col.path()?).unwrap(); - let mut pos = 0usize; - #[expect(clippy::cast_possible_truncation)] - let row_count = u64::from_le_bytes(bytes[0..8].try_into().unwrap()) as usize; - pos += 8; - let mut values: Vec>> = Vec::with_capacity(row_count); - for _ in 0..row_count { - let len = u32::from_le_bytes(bytes[pos..pos + 4].try_into().unwrap()) as usize; - pos += 4; - values.push(Some(bytes[pos..pos + len].to_vec())); - pos += len; - } - Some(VarBinArray::from_iter( - values.into_iter().map(|v| v.map(Vec::into_boxed_slice)), - DType::Utf8(Nullability::NonNullable), - )) -} - -fn compress(varbin: &VarBinArray, ctx: &mut ExecutionCtx) -> FSSTArray { - let compressor = fsst_train_compressor(varbin); - fsst_compress(varbin, varbin.len(), varbin.dtype(), &compressor, ctx) -} - -/// Clustered selection (32 bursts, ~`keep` fraction) — a realistic correlated predicate, the shape -/// where survivors form runs rather than scattering uniformly. 
-fn clustered_mask(len: usize, keep: f64) -> Mask { - let mut rng = StdRng::seed_from_u64(9); - #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] - let total = (len as f64 * keep) as usize; - let bursts = 32usize; - let burst_len = (total / bursts).max(1); - let mut keep_set = vec![false; len]; - for _ in 0..bursts { - let start = rng.random_range(0..len.saturating_sub(burst_len).max(1)); - for j in start..(start + burst_len).min(len) { - keep_set[j] = true; - } - } - Mask::from_iter(keep_set) -} - -/// Sorted-index take (~`keep` fraction) — an index lookup / RID-list join; preserves heap order. -fn sorted_take(len: usize, keep: f64) -> ArrayRef { - let mut rng = StdRng::seed_from_u64(13); - #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] - let n = (len as f64 * keep) as usize; - let mut idx: Vec = (0..n).map(|_| rng.random_range(0..len as u64)).collect(); - idx.sort_unstable(); - PrimitiveArray::from_iter(idx).into_array() -} - -fn fsst_filter(array: &FSSTArray, mask: &Mask, ctx: &mut ExecutionCtx) -> FSSTArray { - ::filter(array.as_view(), mask, ctx) - .unwrap() - .unwrap() - .try_downcast::() - .ok() - .unwrap() -} - -fn fsst_take(array: &FSSTArray, indices: &ArrayRef, ctx: &mut ExecutionCtx) -> FSSTArray { - ::take(array.as_view(), indices, ctx) - .unwrap() - .unwrap() - .try_downcast::() - .ok() - .unwrap() -} - -fn fsst_to_vbv(array: &FSSTArray, ctx: &mut ExecutionCtx) -> ArrayRef { - array - .clone() - .into_array() - .execute::(ctx) - .unwrap() - .into_array() -} - -// =============================== SINGLE FILTER -> VarBinView =================================== - -#[divan::bench(args = COLUMNS)] -fn single_filter_fsst(bencher: Bencher, col: Column) { - let Some(varbin) = load_column(col) else { - return; - }; - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = clustered_mask(fsst.len(), 0.10); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - 
.bench_refs(|(fsst, mask, ctx)| { - let filtered = fsst_filter(fsst, mask, ctx); - black_box(fsst_to_vbv(&filtered, ctx)) - }); -} - -#[divan::bench(args = COLUMNS)] -fn single_filter_view(bencher: Bencher, col: Column) { - let Some(varbin) = load_column(col) else { - return; - }; - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - let mask = clustered_mask(fsst.len(), 0.10); - bencher - .with_inputs(|| (&fsst, &mask, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, mask, ctx)| { - let view = fsstview_from_fsst(fsst, ctx).unwrap(); - let filtered = ::filter(view.as_view(), mask, ctx) - .unwrap() - .unwrap() - .try_downcast::() - .ok() - .unwrap(); - black_box( - canonicalize_fsstview_with(filtered.as_view(), FsstViewCompaction::Auto, ctx) - .unwrap(), - ) - }); -} - -// =============================== CHAIN (convert once, N ops, export once) ====================== - -const CHAIN_LEN: usize = 5; - -#[divan::bench(args = COLUMNS)] -fn chain_fsst(bencher: Bencher, col: Column) { - let Some(varbin) = load_column(col) else { - return; - }; - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - bencher - .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, ctx)| { - let mut cur = (*fsst).clone(); - for op in 0..CHAIN_LEN { - if op % 2 == 0 { - let mask = clustered_mask(cur.len(), 0.80); - cur = fsst_filter(&cur, &mask, ctx); - } else { - let indices = sorted_take(cur.len(), 0.80); - cur = fsst_take(&cur, &indices, ctx); - } - } - black_box(fsst_to_vbv(&cur, ctx)) - }); -} - -#[divan::bench(args = COLUMNS)] -fn chain_view(bencher: Bencher, col: Column) { - let Some(varbin) = load_column(col) else { - return; - }; - let fsst = compress(&varbin, &mut LEGACY_SESSION.create_execution_ctx()); - bencher - .with_inputs(|| (&fsst, LEGACY_SESSION.create_execution_ctx())) - .bench_refs(|(fsst, ctx)| { - // Convert once, then chain metadata-only ops, canonicalize once. 
- let mut cur = fsstview_from_fsst(fsst, ctx).unwrap(); - for op in 0..CHAIN_LEN { - let next = if op % 2 == 0 { - let mask = clustered_mask(cur.len(), 0.80); - ::filter(cur.as_view(), &mask, ctx) - .unwrap() - .unwrap() - } else { - let indices = sorted_take(cur.len(), 0.80); - ::take(cur.as_view(), &indices, ctx) - .unwrap() - .unwrap() - }; - cur = next.try_downcast::().ok().unwrap(); - } - black_box( - canonicalize_fsstview_with(cur.as_view(), FsstViewCompaction::Auto, ctx).unwrap(), - ) - }); -} diff --git a/encodings/fsst/src/fsstview/array.rs b/encodings/fsst/src/fsstview/array.rs index 7c4274f017d..3ab1cb381c3 100644 --- a/encodings/fsst/src/fsstview/array.rs +++ b/encodings/fsst/src/fsstview/array.rs @@ -206,16 +206,23 @@ pub fn fsstview_from_fsst(fsst: &FSSTArray, ctx: &mut ExecutionCtx) -> VortexRes let codes_offsets = offsets.slice(0..len)?; let codes_ends = offsets.slice(1..len + 1)?; - FSSTView::try_new( - fsst.dtype().clone(), - fsst.symbols().clone(), - fsst.symbol_lengths().clone(), - fsst.codes_bytes_handle().clone(), - codes_offsets, - codes_ends, - fsst.uncompressed_lengths().clone(), - validity, - ) + // SAFETY: every `FSSTView` invariant is already guaranteed by the source `FSSTArray`: the dtype + // is Binary/Utf8, the offsets are non-nullable integers (so are the two zero-copy slices of + // them, which share `len`), the uncompressed lengths are non-nullable integers of the same + // length, and the validity nullability matches the dtype. Re-validating here would only repeat + // those checks on the hot conversion path. 
+ Ok(unsafe { + FSSTView::new_unchecked( + fsst.dtype().clone(), + fsst.symbols().clone(), + fsst.symbol_lengths().clone(), + fsst.codes_bytes_handle().clone(), + codes_offsets, + codes_ends, + fsst.uncompressed_lengths().clone(), + validity, + ) + }) } fn make_slots( diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 16e113132d4..9d4208cda0e 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -153,15 +153,13 @@ fn decode_element_ordered( ctx: &mut ExecutionCtx, ) -> VortexResult { let offsets = load_usize(array.codes_offsets(), ctx)?; - // `codes_ends[i]` is element `i`'s end in the heap; derive the per-element size here (over the - // survivors only — never the discarded rows). Downstream layout analysis and decode work on - // `sizes` exactly as before. - let ends = load_usize(array.codes_ends(), ctx)?; - let sizes: Vec = offsets - .iter() - .zip(&ends) - .map(|(&offset, &end)| end - offset) - .collect(); + // Derive each survivor's size in place from its end offset (`codes_ends[i] - codes_offsets[i]`), + // reusing the widened `ends` buffer as `sizes` so we don't allocate a third index array. + // Downstream layout analysis and decode work on `sizes` exactly as before. + let mut sizes = load_usize(array.codes_ends(), ctx)?; + for (size, &offset) in sizes.iter_mut().zip(&offsets) { + *size -= offset; + } let ulen_prim = array .uncompressed_lengths() @@ -173,7 +171,6 @@ fn decode_element_ordered( let total_size: usize = match_each_integer_ptype!(ulen_prim.ptype(), |P| { ulen_prim.as_slice::
<P>
().iter().map(|x| *x as usize).sum() }); - let live: usize = sizes.iter().sum(); let heap_buffer = array.codes_bytes(); let heap = heap_buffer.as_slice(); @@ -201,6 +198,9 @@ fn decode_element_ordered( let uncompressed = match chosen { FsstViewCompaction::Direct => { let start = offsets.first().copied().unwrap_or(0); + // `live` (total compressed bytes) is only needed by the bulk-decode paths, not by + // `RunDecode`, so it is summed here rather than unconditionally up front. + let live: usize = sizes.iter().sum(); decompress_direct(&decompressor, heap, start, live, total_size) } FsstViewCompaction::RunDecode => { @@ -208,7 +208,10 @@ fn decode_element_ordered( decompress_run_decode(&decompressor, heap, &offsets, &sizes, &ulens, total_size) } // `Auto` is resolved above; `GatherBulk` is the catch-all. - _ => decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size), + _ => { + let live: usize = sizes.iter().sum(); + decompress_gather(&decompressor, heap, &offsets, &sizes, live, total_size) + } }; Ok(Decoded { diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index c0decae0363..d9a74eb94ac 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -78,72 +78,6 @@ fn canonicalizes_to_same_values() -> VortexResult<()> { Ok(()) } -#[test] -fn filter_matches_canonical() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let view = make_fsstview(&SAMPLE_NULLABLE, Nullability::Nullable, &mut ctx); - - let mask = Mask::from_iter([true, false, true, false, true, true]); - - // The filtered FSSTView reuses the original byte heap untouched. - let filtered = view.into_array().filter(mask.clone())?; - let result = filtered.execute::(&mut ctx)?; - - let expected = VarBinArray::from_iter( - SAMPLE_NULLABLE.iter().copied(), - DType::Utf8(Nullability::Nullable), - ) - .into_array() - .filter(mask)? 
- .execute::(&mut ctx)?; - - assert_arrays_eq!(result.into_array(), expected.into_array()); - Ok(()) -} - -#[test] -fn take_matches_canonical() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let view = make_fsstview(&SAMPLE, Nullability::NonNullable, &mut ctx); - - // Reorders and duplicates, which is fine for offsets+sizes addressing. - let indices = PrimitiveArray::from_iter([5u64, 0, 0, 3, 1]).into_array(); - - let taken = view.into_array().take(indices.clone())?; - let result = taken.execute::(&mut ctx)?; - - let expected = VarBinArray::from_iter( - SAMPLE.iter().copied(), - DType::Utf8(Nullability::NonNullable), - ) - .into_array() - .take(indices)? - .execute::(&mut ctx)?; - - assert_arrays_eq!(result.into_array(), expected.into_array()); - Ok(()) -} - -#[test] -fn slice_matches_canonical() -> VortexResult<()> { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - let view = make_fsstview(&SAMPLE, Nullability::NonNullable, &mut ctx); - - let sliced = view.into_array().slice(1..4)?; - let result = sliced.execute::(&mut ctx)?; - - let expected = VarBinArray::from_iter( - SAMPLE.iter().copied(), - DType::Utf8(Nullability::NonNullable), - ) - .into_array() - .slice(1..4)? - .execute::(&mut ctx)?; - - assert_arrays_eq!(result.into_array(), expected.into_array()); - Ok(()) -} - #[test] fn scalar_at_decodes_each_element() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); From ab48ea98b4061d265dedea76ea3109f677038e5f Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 14:03:10 +0000 Subject: [PATCH 21/23] FSSTView: add zero-copy conversion regression guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The conversion-floor fix rests on codes_offsets/codes_ends being zero-copy slices of the FSST's single monotonic offsets buffer (offsets[0..len] and offsets[1..len+1]) — no copy, no per-element sizes array. 
Nothing tested that invariant directly: the value/agreement tests would still pass if the conversion were reverted to materialize sizes (silently reintroducing the floor), and the bench that measures the floor is gated out of CI. Add conversion_shares_offsets_buffer_zero_copy, which asserts structurally that a freshly converted view's codes_ends slice begins exactly one element past codes_offsets in the same allocation. Deterministic, no timing. 105 tests pass; clippy --all-targets --all-features clean; fmt clean. Signed-off-by: Joe Isaacs --- FSSTVIEW_HANDOVER.md | 12 ++++++-- encodings/fsst/src/fsstview/tests.rs | 41 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/FSSTVIEW_HANDOVER.md b/FSSTVIEW_HANDOVER.md index d340c552118..30602f3ac14 100644 --- a/FSSTVIEW_HANDOVER.md +++ b/FSSTVIEW_HANDOVER.md @@ -15,7 +15,7 @@ offsets — which **eliminated the conversion floor** that previously made the v - **Branch:** `claude/fsstview-conversion-floor-kRAeg` (built on the original `claude/fsstview-array-listview-TdW45`). -- **Status:** merge-ready. 104 tests pass, `clippy --all-targets --all-features` clean, +- **Status:** merge-ready. 105 tests pass, `clippy --all-targets --all-features` clean, `cargo +nightly fmt` clean, `vortex-file` builds, doc tests pass. - **No PR opened yet** (was waiting on explicit request). - **Scope:** additive, contained in `encodings/fsst/` plus 2 registration lines in `vortex-file`. 
@@ -33,7 +33,7 @@ New encoding `vortex.fsstview` in `encodings/fsst/src/fsstview/`: | `from_fsst.rs` | `fsst_filter_to_view` / `fsst_take_to_view` helpers | | `canonical.rs` | decode → `VarBinViewArray` / `VarBinArray`, with the `Auto` export strategy | | `kernel.rs` / `rules.rs` | parent kernel + rule registration | -| `tests.rs` | conformance + agreement + nullable/gapped/RunDecode coverage | +| `tests.rs` | conformance + agreement + nullable/gapped/RunDecode coverage + zero-copy conversion guard | Registered in `vortex-file/src/lib.rs` (`register_default_encodings`). Public API: `FSSTView`, `FSSTViewArray`, `FsstViewCompaction`, `canonicalize_fsstview_with`, @@ -114,6 +114,12 @@ result (same-machine before/after, `fsst_view_fineweb_queries`): `url/vortex` 14 `url/espn_and` 146 µs → **14.9 µs**, and the previously winning clustered cases (`text/dump_eq`, `text/date_prefix`) held flat. The view now wins or ties every query in the matrix. +A regression guard (`conversion_shares_offsets_buffer_zero_copy` in `tests.rs`) asserts the +structural invariant the fix relies on: a freshly converted view's `codes_ends` slice begins exactly +one element past `codes_offsets` in the *same allocation*. This catches a silent revert to a +size-materializing conversion — which the value/agreement tests would not, since the decoded values +would still match — without depending on the FineWeb bench (gated out of CI). + The alternative follow-up (store `sizes` in the narrowest int width) was considered and rejected: it only halves the *write* traffic, leaving the unavoidable full read of the offsets — whereas the `ends` representation removes the whole O(rows) pass. Narrowing widths is orthogonal and can still @@ -122,7 +128,7 @@ be layered on the file layer's compression if desired. 
## Verification commands ```bash -cargo nextest run -p vortex-fsst # (or cargo test -p vortex-fsst) — 104 pass +cargo nextest run -p vortex-fsst # (or cargo test -p vortex-fsst) — 105 pass cargo clippy -p vortex-fsst --all-targets --all-features cargo clippy -p vortex-file cargo +nightly fmt --all diff --git a/encodings/fsst/src/fsstview/tests.rs b/encodings/fsst/src/fsstview/tests.rs index d9a74eb94ac..fddbbc2ec5e 100644 --- a/encodings/fsst/src/fsstview/tests.rs +++ b/encodings/fsst/src/fsstview/tests.rs @@ -6,6 +6,7 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; +use vortex_array::arrays::Primitive; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::VarBinViewArray; @@ -16,9 +17,12 @@ use vortex_array::compute::conformance::filter::test_filter_conformance; use vortex_array::compute::conformance::take::test_take_conformance; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; +use vortex_array::match_each_integer_ptype; use vortex_error::VortexResult; +use vortex_error::vortex_err; use vortex_mask::Mask; +use super::array::FSSTViewArraySlotsExt; use crate::FSSTArray; use crate::FSSTView; use crate::FSSTViewArray; @@ -78,6 +82,43 @@ fn canonicalizes_to_same_values() -> VortexResult<()> { Ok(()) } +/// The conversion-floor fix depends on `codes_offsets`/`codes_ends` being **zero-copy** slices of +/// the FSST's single monotonic offsets buffer (`offsets[0..len]` and `offsets[1..len + 1]`) — +/// nothing copied, no per-element `sizes` array materialized. Verify it structurally: a freshly +/// converted view's `codes_ends` must begin exactly one element past `codes_offsets` *in the same +/// allocation*. A regression to a size-materializing conversion would break this even though the +/// decoded values would still agree (so the value/agreement tests would not catch it). 
The bench +/// that measures the resulting floor (`fsst_view_fineweb_queries`) is gated out of CI. +#[test] +fn conversion_shares_offsets_buffer_zero_copy() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let fsst = make_fsst(&SAMPLE, Nullability::NonNullable, &mut ctx); + + let view = fsstview_from_fsst(&fsst, &mut ctx)?; + let offsets = view + .codes_offsets() + .clone() + .try_downcast::() + .map_err(|_| vortex_err!("codes_offsets should be a primitive slice"))?; + let ends = view + .codes_ends() + .clone() + .try_downcast::() + .map_err(|_| vortex_err!("codes_ends should be a primitive slice"))?; + + assert_eq!(offsets.ptype(), ends.ptype()); + match_each_integer_ptype!(offsets.ptype(), |P| { + let off = offsets.as_slice::

<P>(); + let end = ends.as_slice::<P>

(); + assert_eq!( + end.as_ptr(), + off.as_ptr().wrapping_add(1), + "codes_ends must be codes_offsets shifted by one within the same buffer (zero-copy)" + ); + }); + Ok(()) +} + #[test] fn scalar_at_decodes_each_element() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); From b08f0fd62a13e1a0066a9aae356aafeaf9dd3c32 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 14:05:56 +0000 Subject: [PATCH 22/23] FSSTView: drop the RunDecode uncompressed-lengths precompute RunDecode walked the uncompressed-lengths array into a Vec solely to advance its output cursor by each run's total uncompressed length. But Decompressor::decompress_into already returns the exact decoded byte count for the bytes it just wrote (the same value Direct uses for set_len), and that count equals the run's uncompressed length. Advance out_pos by the return value instead. This removes one O(survivors) allocation on the clustered/range path (the text/dump_eq, text/date_prefix wins) and the per-element run_uncompressed accumulation, and deletes the now-unused widen_ulens helper. A debug_assert_eq!(out_pos, total_size) documents and checks the cursor invariant. The inter-run 7-byte decode slack behaviour is unchanged: each run still decodes exactly its own compressed input. 105 tests pass (incl. the RunDecode-exercising gaps/monotonic-filter tests); clippy --all-targets --all-features clean; fmt clean. 
Signed-off-by: Joe Isaacs --- encodings/fsst/src/fsstview/canonical.rs | 31 ++++++++---------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index 9d4208cda0e..fa7cc2458be 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -204,8 +204,7 @@ fn decode_element_ordered( decompress_direct(&decompressor, heap, start, live, total_size) } FsstViewCompaction::RunDecode => { - let ulens = widen_ulens(&ulen_prim); - decompress_run_decode(&decompressor, heap, &offsets, &sizes, &ulens, total_size) + decompress_run_decode(&decompressor, heap, &offsets, &sizes, total_size) } // `Auto` is resolved above; `GatherBulk` is the catch-all. _ => { @@ -247,20 +246,6 @@ fn cumulative_offsets(ulen_prim: &PrimitiveArray) -> ArrayRef { offsets.into_array() } -/// Widen an already-executed uncompressed-lengths primitive array into `Vec`. Only -/// `RunDecode` needs this; `Direct`/`GatherBulk` work without it. -fn widen_ulens(ulen_prim: &PrimitiveArray) -> Vec { - #[expect(clippy::cast_possible_truncation)] - let out: Vec = match_each_integer_ptype!(ulen_prim.ptype(), |P| { - ulen_prim - .as_slice::
<P>
() - .iter() - .map(|x| *x as usize) - .collect() - }); - out -} - /// The survivor layout in the heap, used to pick an export strategy. enum Layout { /// Survivors are one contiguous in-order run (untouched / sliced view) — `Direct`. @@ -307,7 +292,6 @@ fn decompress_run_decode( heap: &[u8], offsets: &[usize], sizes: &[usize], - ulens: &[usize], total_size: usize, ) -> ByteBufferMut { let mut out = ByteBufferMut::with_capacity(total_size + 7); @@ -324,7 +308,6 @@ fn decompress_run_decode( } let run_heap_start = offsets[i]; let mut run_heap_end = run_heap_start; - let mut run_uncompressed = 0usize; let mut j = i; while j < offsets.len() { if sizes[j] == 0 { @@ -335,14 +318,20 @@ fn decompress_run_decode( break; } run_heap_end += sizes[j]; - run_uncompressed += ulens[j]; j += 1; } - decompressor + // `decompress_into` returns the exact decoded byte count for this run, which equals the + // run's total uncompressed length; advance by it instead of precomputing per-element + // uncompressed lengths. + let written = decompressor .decompress_into(&heap[run_heap_start..run_heap_end], &mut spare[out_pos..]); - out_pos += run_uncompressed; + out_pos += written; i = j; } + debug_assert_eq!( + out_pos, total_size, + "run-decode must fill exactly total_size bytes" + ); } unsafe { out.set_len(total_size) }; out From d1418cfa44dc7658a4808dd420360d0b0f4fb4ef Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 20:56:06 +0000 Subject: [PATCH 23/23] FSSTView: inline the metadata PType getters, drop a stale comment The three FSSTViewMetadata::get_*_ptype getters were each used once, in deserialize, and only wrapped PType::try_from with a custom error message. The sibling FSST encoding's own deserialize already inlines PType::try_from(metadata.x)? directly (its TryFrom error converts to VortexError via ?), so match that: inline the three calls and delete the getter impl block. This also drops the now-unused vortex_err import. 
Also refresh a comment in canonical.rs that still referenced the ulens: Vec precompute removed in the previous commit. 105 tests pass; clippy --all-targets --all-features clean; fmt clean. Signed-off-by: Joe Isaacs --- encodings/fsst/src/fsstview/array.rs | 27 ++++++------------------ encodings/fsst/src/fsstview/canonical.rs | 4 ++-- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/encodings/fsst/src/fsstview/array.rs b/encodings/fsst/src/fsstview/array.rs index 3ab1cb381c3..47d7baa9cad 100644 --- a/encodings/fsst/src/fsstview/array.rs +++ b/encodings/fsst/src/fsstview/array.rs @@ -31,7 +31,6 @@ use vortex_buffer::Buffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; -use vortex_error::vortex_err; use vortex_error::vortex_panic; use vortex_session::VortexSession; use vortex_session::registry::CachedId; @@ -91,23 +90,6 @@ pub struct FSSTViewMetadata { codes_ends_ptype: i32, } -impl FSSTViewMetadata { - fn get_uncompressed_lengths_ptype(&self) -> VortexResult { - PType::try_from(self.uncompressed_lengths_ptype) - .map_err(|_| vortex_err!("Invalid PType {}", self.uncompressed_lengths_ptype)) - } - - fn get_codes_offsets_ptype(&self) -> VortexResult { - PType::try_from(self.codes_offsets_ptype) - .map_err(|_| vortex_err!("Invalid PType {}", self.codes_offsets_ptype)) - } - - fn get_codes_ends_ptype(&self) -> VortexResult { - PType::try_from(self.codes_ends_ptype) - .map_err(|_| vortex_err!("Invalid PType {}", self.codes_ends_ptype)) - } -} - impl FSSTView { /// Build an [`FSSTViewArray`] from its decomposed components. 
/// @@ -389,7 +371,7 @@ impl VTable for FSSTView { let uncompressed_lengths = children.get( 0, &DType::Primitive( - metadata.get_uncompressed_lengths_ptype()?, + PType::try_from(metadata.uncompressed_lengths_ptype)?, Nullability::NonNullable, ), len, @@ -397,14 +379,17 @@ impl VTable for FSSTView { let codes_offsets = children.get( 1, &DType::Primitive( - metadata.get_codes_offsets_ptype()?, + PType::try_from(metadata.codes_offsets_ptype)?, Nullability::NonNullable, ), len, )?; let codes_ends = children.get( 2, - &DType::Primitive(metadata.get_codes_ends_ptype()?, Nullability::NonNullable), + &DType::Primitive( + PType::try_from(metadata.codes_ends_ptype)?, + Nullability::NonNullable, + ), len, )?; diff --git a/encodings/fsst/src/fsstview/canonical.rs b/encodings/fsst/src/fsstview/canonical.rs index fa7cc2458be..6d5b2b5165f 100644 --- a/encodings/fsst/src/fsstview/canonical.rs +++ b/encodings/fsst/src/fsstview/canonical.rs @@ -165,8 +165,8 @@ fn decode_element_ordered( .uncompressed_lengths() .clone() .execute::(ctx)?; - // `total_size` is needed by every path; sum it from the typed slice. The widened - // `ulens: Vec` is only needed by `RunDecode`, so defer it. + // Total decoded length, used by every path to size the output buffer. Summed straight from the + // typed slice — no need to widen the uncompressed lengths into a `Vec`. #[expect(clippy::cast_possible_truncation)] let total_size: usize = match_each_integer_ptype!(ulen_prim.ptype(), |P| { ulen_prim.as_slice::

<P>().iter().map(|x| *x as usize).sum()