diff --git a/encodings/bytebool/public-api.lock b/encodings/bytebool/public-api.lock index 8edd17d2912..53c71e3bfb6 100644 --- a/encodings/bytebool/public-api.lock +++ b/encodings/bytebool/public-api.lock @@ -76,17 +76,15 @@ pub struct vortex_bytebool::ByteBoolData impl vortex_bytebool::ByteBoolData -pub fn vortex_bytebool::ByteBoolData::as_slice(&self) -> &[bool] - pub fn vortex_bytebool::ByteBoolData::buffer(&self) -> &vortex_array::buffer::BufferHandle -pub fn vortex_bytebool::ByteBoolData::from_vec>(data: alloc::vec::Vec, validity: V) -> Self - pub fn vortex_bytebool::ByteBoolData::is_empty(&self) -> bool pub fn vortex_bytebool::ByteBoolData::len(&self) -> usize -pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> Self +pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle) -> Self + +pub fn vortex_bytebool::ByteBoolData::truthy_bytes(&self) -> &[u8] pub fn vortex_bytebool::ByteBoolData::validate(buffer: &vortex_array::buffer::BufferHandle, validity: &vortex_array::validity::Validity, dtype: &vortex_array::dtype::DType, len: usize) -> vortex_error::VortexResult<()> @@ -94,14 +92,6 @@ impl core::clone::Clone for vortex_bytebool::ByteBoolData pub fn vortex_bytebool::ByteBoolData::clone(&self) -> vortex_bytebool::ByteBoolData -impl core::convert::From> for vortex_bytebool::ByteBoolData - -pub fn vortex_bytebool::ByteBoolData::from(value: alloc::vec::Vec) -> Self - -impl core::convert::From>> for vortex_bytebool::ByteBoolData - -pub fn vortex_bytebool::ByteBoolData::from(value: alloc::vec::Vec>) -> Self - impl core::fmt::Debug for vortex_bytebool::ByteBoolData pub fn vortex_bytebool::ByteBoolData::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs index 1043aefa967..e1604b7642e 100644 --- a/encodings/bytebool/src/array.rs +++ b/encodings/bytebool/src/array.rs @@ -29,7 +29,7 @@ use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityVTable; use vortex_array::vtable::child_to_validity; use vortex_array::vtable::validity_to_child; -use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -131,7 +131,7 @@ impl VTable for ByteBool { } let buffer = buffers[0].clone(); - let data = ByteBoolData::new(buffer, validity.clone()); + let data = ByteBoolData::new(buffer); let slots = ByteBoolData::make_slots(&validity, len); Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots)) } @@ -149,7 +149,8 @@ impl VTable for ByteBool { } fn execute(array: Array, _ctx: &mut ExecutionCtx) -> VortexResult { - let boolean_buffer = BitBuffer::from(array.as_slice()); + // convert truthy values to set/unset bits + let boolean_buffer = BitBufferMut::from(array.truthy_bytes()).freeze(); let validity = array.validity()?; Ok(ExecutionResult::done( BoolArray::new(boolean_buffer, validity).into_array(), @@ -198,9 +199,17 @@ pub struct ByteBool; impl ByteBool { pub fn new(buffer: BufferHandle, validity: Validity) -> ByteBoolArray { + if let Some(len) = validity.maybe_len() { + assert_eq!( + buffer.len(), + len, + "ByteBool validity and bytes must have same length" + ); + } let dtype = DType::Bool(validity.nullability()); + let slots = ByteBoolData::make_slots(&validity, buffer.len()); - let data = ByteBoolData::new(buffer, validity); + let data = ByteBoolData::new(buffer); let len = data.len(); unsafe { Array::from_parts_unchecked( @@ -212,29 +221,22 @@ impl ByteBool { /// Construct a [`ByteBoolArray`] from a `Vec` and validity. pub fn from_vec>(data: Vec, validity: V) -> ByteBoolArray { let validity = validity.into(); - let data = ByteBoolData::from_vec(data, validity.clone()); - let dtype = DType::Bool(validity.nullability()); - let len = data.len(); - let slots = ByteBoolData::make_slots(&validity, len); - unsafe { - Array::from_parts_unchecked( - ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots), - ) - } + // NOTE: this will not cause allocation on release builds + let bytes: Vec = data.into_iter().map(|b| b as u8).collect(); + let handle = BufferHandle::new_host(ByteBuffer::from(bytes)); + ByteBool::new(handle, validity) } /// Construct a [`ByteBoolArray`] from optional bools. pub fn from_option_vec(data: Vec>) -> ByteBoolArray { let validity = Validity::from_iter(data.iter().map(|v| v.is_some())); - let data = ByteBoolData::from(data); - let dtype = DType::Bool(validity.nullability()); - let len = data.len(); - let slots = ByteBoolData::make_slots(&validity, len); - unsafe { - Array::from_parts_unchecked( - ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots), - ) - } + // NOTE: this will not cause allocation on release builds + let bytes: Vec = data + .into_iter() + .map(|b| b.unwrap_or_default() as u8) + .collect(); + let handle = BufferHandle::new_host(ByteBuffer::from(bytes)); + ByteBool::new(handle, validity) } } @@ -265,17 +267,7 @@ impl ByteBoolData { vec![validity_to_child(validity, len)] } - pub fn new(buffer: BufferHandle, validity: Validity) -> Self { - let length = buffer.len(); - if let Some(vlen) = validity.maybe_len() - && length != vlen - { - vortex_panic!( - "Buffer length ({}) does not match validity length ({})", - length, - vlen - ); - } + pub fn new(buffer: BufferHandle) -> Self { Self { buffer } } @@ -289,21 +281,15 @@ impl ByteBoolData { self.buffer.len() == 0 } - // TODO(ngates): deprecate construction from vec - pub fn from_vec>(data: Vec, validity: V) -> Self { - let validity = validity.into(); - // SAFETY: we are transmuting a Vec into a Vec - let data: Vec = unsafe { std::mem::transmute(data) }; - Self::new(BufferHandle::new_host(ByteBuffer::from(data)), validity) - } - pub fn buffer(&self) -> &BufferHandle { &self.buffer } - pub fn as_slice(&self) -> &[bool] { - // Safety: The internal buffer contains byte-sized bools - unsafe { std::mem::transmute(self.buffer().as_host().as_slice()) } + /// Get access to the underlying 8-bit truthy values. + /// + /// The zero byte indicates `false`, and any non-zero byte is a `true`. + pub fn truthy_bytes(&self) -> &[u8] { + self.buffer().as_host().as_slice() } } @@ -326,23 +312,6 @@ impl OperationsVTable for ByteBool { } } -impl From> for ByteBoolData { - fn from(value: Vec) -> Self { - Self::from_vec(value, Validity::AllValid) - } -} - -impl From>> for ByteBoolData { - fn from(value: Vec>) -> Self { - let validity = Validity::from_iter(value.iter().map(|v| v.is_some())); - - // This doesn't reallocate, and the compiler even vectorizes it - let data = value.into_iter().map(Option::unwrap_or_default).collect(); - - Self::from_vec(data, validity) - } -} - #[cfg(test)] mod tests { use vortex_array::ArrayContext; diff --git a/encodings/bytebool/src/compute.rs b/encodings/bytebool/src/compute.rs index 0fd75cb8123..609a7bc553f 100644 --- a/encodings/bytebool/src/compute.rs +++ b/encodings/bytebool/src/compute.rs @@ -8,11 +8,13 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::TakeExecute; +use vortex_array::buffer::BufferHandle; use vortex_array::dtype::DType; use vortex_array::match_each_integer_ptype; use vortex_array::scalar_fn::fns::cast::CastReduce; use vortex_array::scalar_fn::fns::mask::MaskReduce; use vortex_array::validity::Validity; +use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use super::ByteBool; @@ -58,23 +60,25 @@ impl TakeExecute for ByteBool { ctx: &mut ExecutionCtx, ) -> VortexResult> { let indices = indices.clone().execute::(ctx)?; - let bools = array.as_slice(); + let values = array.truthy_bytes(); // This handles combining validity from both source array and nullable indices let validity = array.validity()?.take(&indices.clone().into_array())?; - let taken_bools = match_each_integer_ptype!(indices.ptype(), |I| { + let taken = match_each_integer_ptype!(indices.ptype(), |I| { indices .as_slice::() .iter() .map(|&idx| { let idx: usize = idx.as_(); - bools[idx] + values[idx] }) - .collect::>() + .collect::() }); - Ok(Some(ByteBool::from_vec(taken_bools, validity).into_array())) + Ok(Some( + ByteBool::new(BufferHandle::new_host(taken), validity).into_array(), + )) } } diff --git a/encodings/bytebool/src/lib.rs b/encodings/bytebool/src/lib.rs index 35579a89dc8..3258341cdb9 100644 --- a/encodings/bytebool/src/lib.rs +++ b/encodings/bytebool/src/lib.rs @@ -1,6 +1,45 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! A Vortex encoding that mirrors Arrow's [8-bit Boolean canonical extension type][spec]. +//! +//! Each element is stored as a single byte. The zero byte represents `false` and any +//! non-zero byte represents `true`, matching the truthy semantics of the Arrow spec. This +//! trades 8x the storage of the bit-packed `Bool` layout for cheaper per-byte access — +//! useful when data arrives from a C ABI or other source that already emits byte-wide +//! booleans. On execution the array materializes into the standard bit-packed +//! [`BoolArray`][vortex_array::arrays::BoolArray]. +//! +//! # Examples +//! +//! Any non-zero byte in the backing buffer is treated as `true` when the array executes +//! to a canonical [`BoolArray`][vortex_array::arrays::BoolArray]: +//! +//! ``` +//! # use vortex_array::{IntoArray, LEGACY_SESSION, VortexSessionExecute}; +//! # use vortex_array::arrays::BoolArray; +//! # use vortex_array::arrays::bool::BoolArrayExt; +//! # use vortex_array::buffer::BufferHandle; +//! # use vortex_array::validity::Validity; +//! # use vortex_buffer::ByteBuffer; +//! # use vortex_bytebool::ByteBool; +//! # use vortex_error::VortexResult; +//! # fn main() -> VortexResult<()> { +//! # let mut ctx = LEGACY_SESSION.create_execution_ctx(); +//! let handle = BufferHandle::new_host(ByteBuffer::from(vec![0u8, 1, 42, 0])); +//! let array = ByteBool::new(handle, Validity::NonNullable); +//! +//! let bits = array.into_array().execute::(&mut ctx)?.to_bit_buffer(); +//! assert!(!bits.value(0)); +//! assert!(bits.value(1)); +//! assert!(bits.value(2)); // byte 42 is truthy +//! assert!(!bits.value(3)); +//! # Ok(()) +//! # } +//! ``` +//! +//! [spec]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean + pub use array::*; mod array; diff --git a/vortex-buffer/public-api.lock b/vortex-buffer/public-api.lock index c89264da253..cd9e0900247 100644 --- a/vortex-buffer/public-api.lock +++ b/vortex-buffer/public-api.lock @@ -504,6 +504,10 @@ impl core::convert::From<&[bool]> for vortex_buffer::BitBufferMut pub fn vortex_buffer::BitBufferMut::from(value: &[bool]) -> Self +impl core::convert::From<&[u8]> for vortex_buffer::BitBufferMut + +pub fn vortex_buffer::BitBufferMut::from(value: &[u8]) -> Self + impl core::convert::From> for vortex_buffer::BitBufferMut pub fn vortex_buffer::BitBufferMut::from(value: alloc::vec::Vec) -> Self diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index f46a00a8fe0..bf42e92d571 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -573,6 +573,13 @@ impl From<&[bool]> for BitBufferMut { } } +// allow building a buffer from a set of truthy byte values. +impl From<&[u8]> for BitBufferMut { + fn from(value: &[u8]) -> Self { + BitBufferMut::collect_bool(value.len(), |i| value[i] > 0) + } +} + impl From> for BitBufferMut { fn from(value: Vec) -> Self { value.as_slice().into()