From 655082cecce00ca33f59bc3ffa37d5ff6208b5b1 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Apr 2026 11:45:36 -0400 Subject: [PATCH 1/5] fix potential UB in ByteBool constructor Signed-off-by: Andrew Duffy --- encodings/bytebool/public-api.lock | 10 ++ encodings/bytebool/src/array.rs | 159 ++++++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 12 deletions(-) diff --git a/encodings/bytebool/public-api.lock b/encodings/bytebool/public-api.lock index 8edd17d2912..49d9d35eef0 100644 --- a/encodings/bytebool/public-api.lock +++ b/encodings/bytebool/public-api.lock @@ -10,6 +10,10 @@ pub fn vortex_bytebool::ByteBool::from_vec vortex_bytebool::ByteBoolArray +pub unsafe fn vortex_bytebool::ByteBool::new_unchecked(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> vortex_bytebool::ByteBoolArray + +pub fn vortex_bytebool::ByteBool::try_new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> vortex_error::VortexResult + impl core::clone::Clone for vortex_bytebool::ByteBool pub fn vortex_bytebool::ByteBool::clone(&self) -> vortex_bytebool::ByteBool @@ -88,8 +92,14 @@ pub fn vortex_bytebool::ByteBoolData::len(&self) -> usize pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> Self +pub unsafe fn vortex_bytebool::ByteBoolData::new_unchecked(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> Self + +pub fn vortex_bytebool::ByteBoolData::try_new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> vortex_error::VortexResult + pub fn vortex_bytebool::ByteBoolData::validate(buffer: &vortex_array::buffer::BufferHandle, validity: &vortex_array::validity::Validity, dtype: &vortex_array::dtype::DType, len: usize) -> vortex_error::VortexResult<()> +pub fn vortex_bytebool::ByteBoolData::validate_bytes(buffer: &vortex_array::buffer::BufferHandle) -> vortex_error::VortexResult<()> + impl core::clone::Clone for vortex_bytebool::ByteBoolData pub fn vortex_bytebool::ByteBoolData::clone(&self) -> vortex_bytebool::ByteBoolData diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs index 1043aefa967..bffa1363615 100644 --- a/encodings/bytebool/src/array.rs +++ b/encodings/bytebool/src/array.rs @@ -31,9 +31,11 @@ use vortex_array::vtable::child_to_validity; use vortex_array::vtable::validity_to_child; use vortex_buffer::BitBuffer; use vortex_buffer::ByteBuffer; +use vortex_error::VortexExpect as _; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; +use vortex_error::vortex_ensure_eq; use vortex_error::vortex_panic; use vortex_session::VortexSession; use vortex_session::registry::CachedId; @@ -131,7 +133,7 @@ impl VTable for ByteBool { } let buffer = buffers[0].clone(); - let data = ByteBoolData::new(buffer, validity.clone()); + let data = ByteBoolData::try_new(buffer, validity.clone())?; let slots = ByteBoolData::make_slots(&validity, len); Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots)) } @@ -197,11 +199,41 @@ impl> ByteBoolArrayExt for T {} pub struct ByteBool; impl ByteBool { + /// Construct a [`ByteBoolArray`] from a raw bytes buffer and validity. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented in + /// [`ByteBool::new_unchecked`]. pub fn new(buffer: BufferHandle, validity: Validity) -> ByteBoolArray { + Self::try_new(buffer, validity).vortex_expect("ByteBoolArray construction failed") + } + + /// Construct a [`ByteBoolArray`] from a raw bytes buffer and validity, returning + /// an error if the provided components do not satisfy the invariants documented + /// in [`ByteBool::new_unchecked`]. + pub fn try_new(buffer: BufferHandle, validity: Validity) -> VortexResult { let dtype = DType::Bool(validity.nullability()); - let slots = ByteBoolData::make_slots(&validity, buffer.len()); - let data = ByteBoolData::new(buffer, validity); - let len = data.len(); + let len = buffer.len(); + let slots = ByteBoolData::make_slots(&validity, len); + let data = ByteBoolData::try_new(buffer, validity)?; + Array::try_from_parts(ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots)) + } + + /// Construct a [`ByteBoolArray`] without validating the buffer contents. + /// + /// # Safety + /// + /// Every byte of `buffer` must be `0x00` or `0x01`. Any other byte value is + /// Undefined Behavior because [`ByteBoolData::as_slice`] reinterprets the buffer + /// as `&[bool]`, and a `bool` with any bit pattern other than 0 or 1 is UB. + /// If `validity` is [`Validity::Array`], its length must equal `buffer.len()`. + pub unsafe fn new_unchecked(buffer: BufferHandle, validity: Validity) -> ByteBoolArray { + let dtype = DType::Bool(validity.nullability()); + let len = buffer.len(); + let slots = ByteBoolData::make_slots(&validity, len); + // SAFETY: caller guarantees every buffer byte is 0 or 1. + let data = unsafe { ByteBoolData::new_unchecked(buffer, validity) }; unsafe { Array::from_parts_unchecked( ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots), @@ -261,22 +293,81 @@ impl ByteBoolData { Ok(()) } + /// Validate that every byte of `buffer` is `0x00` or `0x01`. + /// + /// [`ByteBoolData::as_slice`] transmutes the buffer's bytes to `&[bool]`; any byte + /// other than `0x00` or `0x01` would produce a `bool` with an invalid bit pattern, + /// which is Undefined Behavior per the Rust reference. + /// + /// Device-resident buffers are not host-readable and are skipped. + pub fn validate_bytes(buffer: &BufferHandle) -> VortexResult<()> { + let Some(bytes) = buffer.as_host_opt() else { + return Ok(()); + }; + // Count over a flat `&[u8]` vectorizes to pcmpgtb/pmovmskb on x86 and + // cmhi/addv on aarch64, so the fast path runs at ~16 bytes/cycle. + // See https://godbolt.org/z/z797nT1c8 + let invalid = bytes.as_slice().iter().filter(|&&b| b > 1).count(); + vortex_ensure_eq!( + invalid, + 0, + "ByteBoolArray buffer contains {invalid} bytes that are not 0 or 1", + ); + Ok(()) + } + fn make_slots(validity: &Validity, len: usize) -> Vec> { vec![validity_to_child(validity, len)] } + /// Construct [`ByteBoolData`] from a raw bytes buffer and validity. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented in + /// [`ByteBoolData::new_unchecked`]. pub fn new(buffer: BufferHandle, validity: Validity) -> Self { - let length = buffer.len(); - if let Some(vlen) = validity.maybe_len() - && length != vlen - { - vortex_panic!( + Self::try_new(buffer, validity).vortex_expect("ByteBoolData construction failed") + } + + /// Construct [`ByteBoolData`] from a raw bytes buffer and validity, returning an + /// error if the provided components do not satisfy the invariants documented in + /// [`ByteBoolData::new_unchecked`]. + pub fn try_new(buffer: BufferHandle, validity: Validity) -> VortexResult { + Self::check_validity_len(&buffer, &validity)?; + Self::validate_bytes(&buffer)?; + // SAFETY: buffer bytes and validity length validated above. + Ok(unsafe { Self::new_unchecked(buffer, validity) }) + } + + /// Construct [`ByteBoolData`] without validating the buffer contents. + /// + /// # Safety + /// + /// Every byte of `buffer` must be `0x00` or `0x01`. Any other byte value is + /// Undefined Behavior because [`ByteBoolData::as_slice`] reinterprets the buffer + /// as `&[bool]`, and a `bool` with any bit pattern other than 0 or 1 is UB. + /// If `validity` is [`Validity::Array`], its length must equal `buffer.len()`. + pub unsafe fn new_unchecked(buffer: BufferHandle, validity: Validity) -> Self { + debug_assert!( + Self::validate_bytes(&buffer).is_ok(), + "ByteBoolData::new_unchecked called with non-boolean bytes", + ); + Self::check_validity_len(&buffer, &validity) + .vortex_expect("ByteBoolData::new_unchecked called with mismatched validity length"); + Self { buffer } + } + + fn check_validity_len(buffer: &BufferHandle, validity: &Validity) -> VortexResult<()> { + if let Some(vlen) = validity.maybe_len() { + vortex_ensure!( + buffer.len() == vlen, "Buffer length ({}) does not match validity length ({})", - length, + buffer.len(), vlen ); } - Self { buffer } + Ok(()) } /// Returns the number of elements in the array. @@ -294,7 +385,9 @@ impl ByteBoolData { let validity = validity.into(); // SAFETY: we are transmuting a Vec into a Vec let data: Vec = unsafe { std::mem::transmute(data) }; - Self::new(BufferHandle::new_host(ByteBuffer::from(data)), validity) + let buffer = BufferHandle::new_host(ByteBuffer::from(data)); + // SAFETY: bytes came from `Vec`, which guarantees values of 0 or 1. + unsafe { Self::new_unchecked(buffer, validity) } } pub fn buffer(&self) -> &BufferHandle { @@ -392,6 +485,48 @@ mod tests { assert_eq!(arr.len(), 2); } + #[test] + fn try_new_rejects_invalid_byte() { + // `ByteBoolData::as_slice` transmutes the underlying bytes into `&[bool]`. + // A bool with any bit pattern other than 0 or 1 is Undefined Behavior per + // the Rust reference, so `try_new` must reject these buffers. + let raw = ByteBuffer::from(vec![0x02u8, 0x01, 0xFFu8]); + let handle = BufferHandle::new_host(raw); + let err = ByteBool::try_new(handle, Validity::NonNullable).unwrap_err(); + assert!( + err.to_string().contains("bytes that are not 0 or 1"), + "unexpected error: {err}", + ); + } + + #[test] + #[should_panic(expected = "bytes that are not 0 or 1")] + fn new_panics_on_invalid_byte() { + let raw = ByteBuffer::from(vec![0x02u8]); + let handle = BufferHandle::new_host(raw); + drop(ByteBool::new(handle, Validity::NonNullable)); + } + + #[test] + fn new_unchecked_accepts_valid_bytes() { + let raw = ByteBuffer::from(vec![0x00u8, 0x01, 0x01, 0x00]); + let handle = BufferHandle::new_host(raw); + // SAFETY: all bytes are 0 or 1. + let arr = unsafe { ByteBool::new_unchecked(handle, Validity::NonNullable) }; + assert_eq!(arr.len(), 4); + assert_eq!(arr.as_slice(), &[false, true, true, false]); + } + + #[test] + #[cfg(debug_assertions)] + #[should_panic(expected = "non-boolean bytes")] + fn new_unchecked_debug_asserts_invalid_bytes() { + let raw = ByteBuffer::from(vec![0x02u8]); + let handle = BufferHandle::new_host(raw); + // SAFETY: intentionally violated to exercise the debug assertion. + drop(unsafe { ByteBool::new_unchecked(handle, Validity::NonNullable) }); + } + #[test] fn test_nullable_bytebool_serde_roundtrip() { let array = ByteBool::from_option_vec(vec![Some(true), None, Some(false), None]); From 9b08bf2205a20929df5d6b82d0066e9dc4114fc9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Apr 2026 12:32:54 -0400 Subject: [PATCH 2/5] save Signed-off-by: Andrew Duffy --- encodings/bytebool/src/array.rs | 21 ++++++++++++--------- encodings/bytebool/src/compute.rs | 17 ++++++++++++----- vortex-buffer/src/bit/buf_mut.rs | 7 +++++++ 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs index bffa1363615..47c9b791b64 100644 --- a/encodings/bytebool/src/array.rs +++ b/encodings/bytebool/src/array.rs @@ -29,8 +29,8 @@ use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityVTable; use vortex_array::vtable::child_to_validity; use vortex_array::vtable::validity_to_child; -use vortex_buffer::BitBuffer; use vortex_buffer::ByteBuffer; +use vortex_buffer::{BitBuffer, BitBufferMut}; use vortex_error::VortexExpect as _; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -151,7 +151,8 @@ impl VTable for ByteBool { } fn execute(array: Array, _ctx: &mut ExecutionCtx) -> VortexResult { - let boolean_buffer = BitBuffer::from(array.as_slice()); + // convert truthy values to set/unset bits + let boolean_buffer = BitBufferMut::from(array.truthy_bytes()).freeze(); let validity = array.validity()?; Ok(ExecutionResult::done( BoolArray::new(boolean_buffer, validity).into_array(), @@ -225,7 +226,7 @@ impl ByteBool { /// # Safety /// /// Every byte of `buffer` must be `0x00` or `0x01`. Any other byte value is - /// Undefined Behavior because [`ByteBoolData::as_slice`] reinterprets the buffer + /// Undefined Behavior because [`ByteBoolData::truthy_bytes`] reinterprets the buffer /// as `&[bool]`, and a `bool` with any bit pattern other than 0 or 1 is UB. /// If `validity` is [`Validity::Array`], its length must equal `buffer.len()`. pub unsafe fn new_unchecked(buffer: BufferHandle, validity: Validity) -> ByteBoolArray { @@ -295,7 +296,7 @@ impl ByteBoolData { /// Validate that every byte of `buffer` is `0x00` or `0x01`. /// - /// [`ByteBoolData::as_slice`] transmutes the buffer's bytes to `&[bool]`; any byte + /// [`ByteBoolData::truthy_bytes`] transmutes the buffer's bytes to `&[bool]`; any byte /// other than `0x00` or `0x01` would produce a `bool` with an invalid bit pattern, /// which is Undefined Behavior per the Rust reference. /// @@ -345,7 +346,7 @@ impl ByteBoolData { /// # Safety /// /// Every byte of `buffer` must be `0x00` or `0x01`. Any other byte value is - /// Undefined Behavior because [`ByteBoolData::as_slice`] reinterprets the buffer + /// Undefined Behavior because [`ByteBoolData::truthy_bytes`] reinterprets the buffer /// as `&[bool]`, and a `bool` with any bit pattern other than 0 or 1 is UB. /// If `validity` is [`Validity::Array`], its length must equal `buffer.len()`. pub unsafe fn new_unchecked(buffer: BufferHandle, validity: Validity) -> Self { @@ -394,9 +395,11 @@ impl ByteBoolData { &self.buffer } - pub fn as_slice(&self) -> &[bool] { - // Safety: The internal buffer contains byte-sized bools - unsafe { std::mem::transmute(self.buffer().as_host().as_slice()) } + /// Get access to the underlying 8-bit truthy values. + /// + /// The zero byte indicates `false`, and any non-zero byte is a `true`. + pub fn truthy_bytes(&self) -> &[u8] { + self.buffer().as_host().as_slice() } } @@ -514,7 +517,7 @@ mod tests { // SAFETY: all bytes are 0 or 1. let arr = unsafe { ByteBool::new_unchecked(handle, Validity::NonNullable) }; assert_eq!(arr.len(), 4); - assert_eq!(arr.as_slice(), &[false, true, true, false]); + assert_eq!(arr.truthy_bytes(), &[false, true, true, false]); } #[test] diff --git a/encodings/bytebool/src/compute.rs b/encodings/bytebool/src/compute.rs index 0fd75cb8123..877529aeb52 100644 --- a/encodings/bytebool/src/compute.rs +++ b/encodings/bytebool/src/compute.rs @@ -8,11 +8,13 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::dict::TakeExecute; +use vortex_array::buffer::BufferHandle; use vortex_array::dtype::DType; use vortex_array::match_each_integer_ptype; use vortex_array::scalar_fn::fns::cast::CastReduce; use vortex_array::scalar_fn::fns::mask::MaskReduce; use vortex_array::validity::Validity; +use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use super::ByteBool; @@ -58,23 +60,28 @@ impl TakeExecute for ByteBool { ctx: &mut ExecutionCtx, ) -> VortexResult> { let indices = indices.clone().execute::(ctx)?; - let bools = array.as_slice(); + let values = array.truthy_bytes(); // This handles combining validity from both source array and nullable indices let validity = array.validity()?.take(&indices.clone().into_array())?; - let taken_bools = match_each_integer_ptype!(indices.ptype(), |I| { + let taken = match_each_integer_ptype!(indices.ptype(), |I| { indices .as_slice::() .iter() .map(|&idx| { let idx: usize = idx.as_(); - bools[idx] + values[idx] }) - .collect::>() + .collect::() }); - Ok(Some(ByteBool::from_vec(taken_bools, validity).into_array())) + // SAFETY: + unsafe { + Ok(Some( + ByteBool::new_unchecked(BufferHandle::new_host(taken), validity).into_array(), + )) + } } } diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs index f46a00a8fe0..bf42e92d571 100644 --- a/vortex-buffer/src/bit/buf_mut.rs +++ b/vortex-buffer/src/bit/buf_mut.rs @@ -573,6 +573,13 @@ impl From<&[bool]> for BitBufferMut { } } +// allow building a buffer from a set of truthy byte values. +impl From<&[u8]> for BitBufferMut { + fn from(value: &[u8]) -> Self { + BitBufferMut::collect_bool(value.len(), |i| value[i] > 0) + } +} + impl From> for BitBufferMut { fn from(value: Vec) -> Self { value.as_slice().into() From d9e61d0cc69274ff7201b99c2bd4edf25914fb44 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Apr 2026 12:37:24 -0400 Subject: [PATCH 3/5] Revert "fix potential UB in ByteBool constructor" This reverts commit 655082cecce00ca33f59bc3ffa37d5ff6208b5b1. Signed-off-by: Andrew Duffy --- encodings/bytebool/public-api.lock | 12 +-- encodings/bytebool/src/array.rs | 160 +++-------------------------- encodings/bytebool/src/compute.rs | 9 +- vortex-buffer/public-api.lock | 4 + 4 files changed, 21 insertions(+), 164 deletions(-) diff --git a/encodings/bytebool/public-api.lock b/encodings/bytebool/public-api.lock index 49d9d35eef0..db95fd287a4 100644 --- a/encodings/bytebool/public-api.lock +++ b/encodings/bytebool/public-api.lock @@ -10,10 +10,6 @@ pub fn vortex_bytebool::ByteBool::from_vec vortex_bytebool::ByteBoolArray -pub unsafe fn vortex_bytebool::ByteBool::new_unchecked(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> vortex_bytebool::ByteBoolArray - -pub fn vortex_bytebool::ByteBool::try_new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> vortex_error::VortexResult - impl core::clone::Clone for vortex_bytebool::ByteBool pub fn vortex_bytebool::ByteBool::clone(&self) -> vortex_bytebool::ByteBool @@ -80,8 +76,6 @@ pub struct vortex_bytebool::ByteBoolData impl vortex_bytebool::ByteBoolData -pub fn vortex_bytebool::ByteBoolData::as_slice(&self) -> &[bool] - pub fn vortex_bytebool::ByteBoolData::buffer(&self) -> &vortex_array::buffer::BufferHandle pub fn vortex_bytebool::ByteBoolData::from_vec>(data: alloc::vec::Vec, validity: V) -> Self @@ -92,14 +86,10 @@ pub fn vortex_bytebool::ByteBoolData::len(&self) -> usize pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> Self -pub unsafe fn vortex_bytebool::ByteBoolData::new_unchecked(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> Self - -pub fn vortex_bytebool::ByteBoolData::try_new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> vortex_error::VortexResult +pub fn vortex_bytebool::ByteBoolData::truthy_bytes(&self) -> &[u8] pub fn vortex_bytebool::ByteBoolData::validate(buffer: &vortex_array::buffer::BufferHandle, validity: &vortex_array::validity::Validity, dtype: &vortex_array::dtype::DType, len: usize) -> vortex_error::VortexResult<()> -pub fn vortex_bytebool::ByteBoolData::validate_bytes(buffer: &vortex_array::buffer::BufferHandle) -> vortex_error::VortexResult<()> - impl core::clone::Clone for vortex_bytebool::ByteBoolData pub fn vortex_bytebool::ByteBoolData::clone(&self) -> vortex_bytebool::ByteBoolData diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs index 47c9b791b64..c9c61ac047e 100644 --- a/encodings/bytebool/src/array.rs +++ b/encodings/bytebool/src/array.rs @@ -29,13 +29,11 @@ use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityVTable; use vortex_array::vtable::child_to_validity; use vortex_array::vtable::validity_to_child; +use vortex_buffer::BitBufferMut; use vortex_buffer::ByteBuffer; -use vortex_buffer::{BitBuffer, BitBufferMut}; -use vortex_error::VortexExpect as _; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; -use vortex_error::vortex_ensure_eq; use vortex_error::vortex_panic; use vortex_session::VortexSession; use vortex_session::registry::CachedId; @@ -133,7 +131,7 @@ impl VTable for ByteBool { } let buffer = buffers[0].clone(); - let data = ByteBoolData::try_new(buffer, validity.clone())?; + let data = ByteBoolData::new(buffer, validity.clone()); let slots = ByteBoolData::make_slots(&validity, len); Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots)) } @@ -200,41 +198,12 @@ impl> ByteBoolArrayExt for T {} pub struct ByteBool; impl ByteBool { - /// Construct a [`ByteBoolArray`] from a raw bytes buffer and validity. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented in - /// [`ByteBool::new_unchecked`]. pub fn new(buffer: BufferHandle, validity: Validity) -> ByteBoolArray { - Self::try_new(buffer, validity).vortex_expect("ByteBoolArray construction failed") - } - - /// Construct a [`ByteBoolArray`] from a raw bytes buffer and validity, returning - /// an error if the provided components do not satisfy the invariants documented - /// in [`ByteBool::new_unchecked`]. - pub fn try_new(buffer: BufferHandle, validity: Validity) -> VortexResult { let dtype = DType::Bool(validity.nullability()); - let len = buffer.len(); - let slots = ByteBoolData::make_slots(&validity, len); - let data = ByteBoolData::try_new(buffer, validity)?; - Array::try_from_parts(ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots)) - } - /// Construct a [`ByteBoolArray`] without validating the buffer contents. - /// - /// # Safety - /// - /// Every byte of `buffer` must be `0x00` or `0x01`. Any other byte value is - /// Undefined Behavior because [`ByteBoolData::truthy_bytes`] reinterprets the buffer - /// as `&[bool]`, and a `bool` with any bit pattern other than 0 or 1 is UB. - /// If `validity` is [`Validity::Array`], its length must equal `buffer.len()`. - pub unsafe fn new_unchecked(buffer: BufferHandle, validity: Validity) -> ByteBoolArray { - let dtype = DType::Bool(validity.nullability()); - let len = buffer.len(); - let slots = ByteBoolData::make_slots(&validity, len); - // SAFETY: caller guarantees every buffer byte is 0 or 1. - let data = unsafe { ByteBoolData::new_unchecked(buffer, validity) }; + let slots = ByteBoolData::make_slots(&validity, buffer.len()); + let data = ByteBoolData::new(buffer, validity); + let len = data.len(); unsafe { Array::from_parts_unchecked( ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots), @@ -294,81 +263,22 @@ impl ByteBoolData { Ok(()) } - /// Validate that every byte of `buffer` is `0x00` or `0x01`. - /// - /// [`ByteBoolData::truthy_bytes`] transmutes the buffer's bytes to `&[bool]`; any byte - /// other than `0x00` or `0x01` would produce a `bool` with an invalid bit pattern, - /// which is Undefined Behavior per the Rust reference. - /// - /// Device-resident buffers are not host-readable and are skipped. - pub fn validate_bytes(buffer: &BufferHandle) -> VortexResult<()> { - let Some(bytes) = buffer.as_host_opt() else { - return Ok(()); - }; - // Count over a flat `&[u8]` vectorizes to pcmpgtb/pmovmskb on x86 and - // cmhi/addv on aarch64, so the fast path runs at ~16 bytes/cycle. - // See https://godbolt.org/z/z797nT1c8 - let invalid = bytes.as_slice().iter().filter(|&&b| b > 1).count(); - vortex_ensure_eq!( - invalid, - 0, - "ByteBoolArray buffer contains {invalid} bytes that are not 0 or 1", - ); - Ok(()) - } - fn make_slots(validity: &Validity, len: usize) -> Vec> { vec![validity_to_child(validity, len)] } - /// Construct [`ByteBoolData`] from a raw bytes buffer and validity. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented in - /// [`ByteBoolData::new_unchecked`]. pub fn new(buffer: BufferHandle, validity: Validity) -> Self { - Self::try_new(buffer, validity).vortex_expect("ByteBoolData construction failed") - } - - /// Construct [`ByteBoolData`] from a raw bytes buffer and validity, returning an - /// error if the provided components do not satisfy the invariants documented in - /// [`ByteBoolData::new_unchecked`]. - pub fn try_new(buffer: BufferHandle, validity: Validity) -> VortexResult { - Self::check_validity_len(&buffer, &validity)?; - Self::validate_bytes(&buffer)?; - // SAFETY: buffer bytes and validity length validated above. - Ok(unsafe { Self::new_unchecked(buffer, validity) }) - } - - /// Construct [`ByteBoolData`] without validating the buffer contents. - /// - /// # Safety - /// - /// Every byte of `buffer` must be `0x00` or `0x01`. Any other byte value is - /// Undefined Behavior because [`ByteBoolData::truthy_bytes`] reinterprets the buffer - /// as `&[bool]`, and a `bool` with any bit pattern other than 0 or 1 is UB. - /// If `validity` is [`Validity::Array`], its length must equal `buffer.len()`. - pub unsafe fn new_unchecked(buffer: BufferHandle, validity: Validity) -> Self { - debug_assert!( - Self::validate_bytes(&buffer).is_ok(), - "ByteBoolData::new_unchecked called with non-boolean bytes", - ); - Self::check_validity_len(&buffer, &validity) - .vortex_expect("ByteBoolData::new_unchecked called with mismatched validity length"); - Self { buffer } - } - - fn check_validity_len(buffer: &BufferHandle, validity: &Validity) -> VortexResult<()> { - if let Some(vlen) = validity.maybe_len() { - vortex_ensure!( - buffer.len() == vlen, + let length = buffer.len(); + if let Some(vlen) = validity.maybe_len() + && length != vlen + { + vortex_panic!( "Buffer length ({}) does not match validity length ({})", - buffer.len(), + length, vlen ); } - Ok(()) + Self { buffer } } /// Returns the number of elements in the array. @@ -386,9 +296,7 @@ impl ByteBoolData { let validity = validity.into(); // SAFETY: we are transmuting a Vec into a Vec let data: Vec = unsafe { std::mem::transmute(data) }; - let buffer = BufferHandle::new_host(ByteBuffer::from(data)); - // SAFETY: bytes came from `Vec`, which guarantees values of 0 or 1. - unsafe { Self::new_unchecked(buffer, validity) } + Self::new(BufferHandle::new_host(ByteBuffer::from(data)), validity) } pub fn buffer(&self) -> &BufferHandle { @@ -488,48 +396,6 @@ mod tests { assert_eq!(arr.len(), 2); } - #[test] - fn try_new_rejects_invalid_byte() { - // `ByteBoolData::as_slice` transmutes the underlying bytes into `&[bool]`. - // A bool with any bit pattern other than 0 or 1 is Undefined Behavior per - // the Rust reference, so `try_new` must reject these buffers. - let raw = ByteBuffer::from(vec![0x02u8, 0x01, 0xFFu8]); - let handle = BufferHandle::new_host(raw); - let err = ByteBool::try_new(handle, Validity::NonNullable).unwrap_err(); - assert!( - err.to_string().contains("bytes that are not 0 or 1"), - "unexpected error: {err}", - ); - } - - #[test] - #[should_panic(expected = "bytes that are not 0 or 1")] - fn new_panics_on_invalid_byte() { - let raw = ByteBuffer::from(vec![0x02u8]); - let handle = BufferHandle::new_host(raw); - drop(ByteBool::new(handle, Validity::NonNullable)); - } - - #[test] - fn new_unchecked_accepts_valid_bytes() { - let raw = ByteBuffer::from(vec![0x00u8, 0x01, 0x01, 0x00]); - let handle = BufferHandle::new_host(raw); - // SAFETY: all bytes are 0 or 1. - let arr = unsafe { ByteBool::new_unchecked(handle, Validity::NonNullable) }; - assert_eq!(arr.len(), 4); - assert_eq!(arr.truthy_bytes(), &[false, true, true, false]); - } - - #[test] - #[cfg(debug_assertions)] - #[should_panic(expected = "non-boolean bytes")] - fn new_unchecked_debug_asserts_invalid_bytes() { - let raw = ByteBuffer::from(vec![0x02u8]); - let handle = BufferHandle::new_host(raw); - // SAFETY: intentionally violated to exercise the debug assertion. - drop(unsafe { ByteBool::new_unchecked(handle, Validity::NonNullable) }); - } - #[test] fn test_nullable_bytebool_serde_roundtrip() { let array = ByteBool::from_option_vec(vec![Some(true), None, Some(false), None]); diff --git a/encodings/bytebool/src/compute.rs b/encodings/bytebool/src/compute.rs index 877529aeb52..609a7bc553f 100644 --- a/encodings/bytebool/src/compute.rs +++ b/encodings/bytebool/src/compute.rs @@ -76,12 +76,9 @@ impl TakeExecute for ByteBool { .collect::() }); - // SAFETY: - unsafe { - Ok(Some( - ByteBool::new_unchecked(BufferHandle::new_host(taken), validity).into_array(), - )) - } + Ok(Some( + ByteBool::new(BufferHandle::new_host(taken), validity).into_array(), + )) } } diff --git a/vortex-buffer/public-api.lock b/vortex-buffer/public-api.lock index c89264da253..cd9e0900247 100644 --- a/vortex-buffer/public-api.lock +++ b/vortex-buffer/public-api.lock @@ -504,6 +504,10 @@ impl core::convert::From<&[bool]> for vortex_buffer::BitBufferMut pub fn vortex_buffer::BitBufferMut::from(value: &[bool]) -> Self +impl core::convert::From<&[u8]> for vortex_buffer::BitBufferMut + +pub fn vortex_buffer::BitBufferMut::from(value: &[u8]) -> Self + impl core::convert::From> for vortex_buffer::BitBufferMut pub fn vortex_buffer::BitBufferMut::from(value: alloc::vec::Vec) -> Self From 932055eb687ae9d6d0b94668ee8cb1896a9bb0dc Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Apr 2026 12:57:15 -0400 Subject: [PATCH 4/5] from_vec Signed-off-by: Andrew Duffy --- encodings/bytebool/src/array.rs | 77 +++++++++------------------------ 1 file changed, 21 insertions(+), 56 deletions(-) diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs index c9c61ac047e..e1604b7642e 100644 --- a/encodings/bytebool/src/array.rs +++ b/encodings/bytebool/src/array.rs @@ -131,7 +131,7 @@ impl VTable for ByteBool { } let buffer = buffers[0].clone(); - let data = ByteBoolData::new(buffer, validity.clone()); + let data = ByteBoolData::new(buffer); let slots = ByteBoolData::make_slots(&validity, len); Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots)) } @@ -199,10 +199,17 @@ pub struct ByteBool; impl ByteBool { pub fn new(buffer: BufferHandle, validity: Validity) -> ByteBoolArray { + if let Some(len) = validity.maybe_len() { + assert_eq!( + buffer.len(), + len, + "ByteBool validity and bytes must have same length" + ); + } let dtype = DType::Bool(validity.nullability()); let slots = ByteBoolData::make_slots(&validity, buffer.len()); - let data = ByteBoolData::new(buffer, validity); + let data = ByteBoolData::new(buffer); let len = data.len(); unsafe { Array::from_parts_unchecked( @@ -214,29 +221,22 @@ impl ByteBool { /// Construct a [`ByteBoolArray`] from a `Vec` and validity. pub fn from_vec>(data: Vec, validity: V) -> ByteBoolArray { let validity = validity.into(); - let data = ByteBoolData::from_vec(data, validity.clone()); - let dtype = DType::Bool(validity.nullability()); - let len = data.len(); - let slots = ByteBoolData::make_slots(&validity, len); - unsafe { - Array::from_parts_unchecked( - ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots), - ) - } + // NOTE: this will not cause allocation on release builds + let bytes: Vec = data.into_iter().map(|b| b as u8).collect(); + let handle = BufferHandle::new_host(ByteBuffer::from(bytes)); + ByteBool::new(handle, validity) } /// Construct a [`ByteBoolArray`] from optional bools. pub fn from_option_vec(data: Vec>) -> ByteBoolArray { let validity = Validity::from_iter(data.iter().map(|v| v.is_some())); - let data = ByteBoolData::from(data); - let dtype = DType::Bool(validity.nullability()); - let len = data.len(); - let slots = ByteBoolData::make_slots(&validity, len); - unsafe { - Array::from_parts_unchecked( - ArrayParts::new(ByteBool, dtype, len, data).with_slots(slots), - ) - } + // NOTE: this will not cause allocation on release builds + let bytes: Vec = data + .into_iter() + .map(|b| b.unwrap_or_default() as u8) + .collect(); + let handle = BufferHandle::new_host(ByteBuffer::from(bytes)); + ByteBool::new(handle, validity) } } @@ -267,17 +267,7 @@ impl ByteBoolData { vec![validity_to_child(validity, len)] } - pub fn new(buffer: BufferHandle, validity: Validity) -> Self { - let length = buffer.len(); - if let Some(vlen) = validity.maybe_len() - && length != vlen - { - vortex_panic!( - "Buffer length ({}) does not match validity length ({})", - length, - vlen - ); - } + pub fn new(buffer: BufferHandle) -> Self { Self { buffer } } @@ -291,14 +281,6 @@ impl ByteBoolData { self.buffer.len() == 0 } - // TODO(ngates): deprecate construction from vec - pub fn from_vec>(data: Vec, validity: V) -> Self { - let validity = validity.into(); - // SAFETY: we are transmuting a Vec into a Vec - let data: Vec = unsafe { std::mem::transmute(data) }; - Self::new(BufferHandle::new_host(ByteBuffer::from(data)), validity) - } - pub fn buffer(&self) -> &BufferHandle { &self.buffer } @@ -330,23 +312,6 @@ impl OperationsVTable for ByteBool { } } -impl From> for ByteBoolData { - fn from(value: Vec) -> Self { - Self::from_vec(value, Validity::AllValid) - } -} - -impl From>> for ByteBoolData { - fn from(value: Vec>) -> Self { - let validity = Validity::from_iter(value.iter().map(|v| v.is_some())); - - // This doesn't reallocate, and the compiler even vectorizes it - let data = value.into_iter().map(Option::unwrap_or_default).collect(); - - Self::from_vec(data, validity) - } -} - #[cfg(test)] mod tests { use vortex_array::ArrayContext; From cd23309ef49a28349d290a0c2450503067a3ed2a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Apr 2026 13:08:05 -0400 Subject: [PATCH 5/5] mod comment Signed-off-by: Andrew Duffy --- encodings/bytebool/public-api.lock | 12 +-------- encodings/bytebool/src/lib.rs | 39 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/encodings/bytebool/public-api.lock b/encodings/bytebool/public-api.lock index db95fd287a4..53c71e3bfb6 100644 --- a/encodings/bytebool/public-api.lock +++ b/encodings/bytebool/public-api.lock @@ -78,13 +78,11 @@ impl vortex_bytebool::ByteBoolData pub fn vortex_bytebool::ByteBoolData::buffer(&self) -> &vortex_array::buffer::BufferHandle -pub fn vortex_bytebool::ByteBoolData::from_vec>(data: alloc::vec::Vec, validity: V) -> Self - pub fn vortex_bytebool::ByteBoolData::is_empty(&self) -> bool pub fn vortex_bytebool::ByteBoolData::len(&self) -> usize -pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle, validity: vortex_array::validity::Validity) -> Self +pub fn vortex_bytebool::ByteBoolData::new(buffer: vortex_array::buffer::BufferHandle) -> Self pub fn vortex_bytebool::ByteBoolData::truthy_bytes(&self) -> &[u8] @@ -94,14 +92,6 @@ impl core::clone::Clone for vortex_bytebool::ByteBoolData pub fn vortex_bytebool::ByteBoolData::clone(&self) -> vortex_bytebool::ByteBoolData -impl core::convert::From> for vortex_bytebool::ByteBoolData - -pub fn vortex_bytebool::ByteBoolData::from(value: alloc::vec::Vec) -> Self - -impl core::convert::From>> for vortex_bytebool::ByteBoolData - -pub fn vortex_bytebool::ByteBoolData::from(value: alloc::vec::Vec>) -> Self - impl core::fmt::Debug for vortex_bytebool::ByteBoolData pub fn vortex_bytebool::ByteBoolData::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/encodings/bytebool/src/lib.rs b/encodings/bytebool/src/lib.rs index 35579a89dc8..3258341cdb9 100644 --- a/encodings/bytebool/src/lib.rs +++ b/encodings/bytebool/src/lib.rs @@ -1,6 +1,45 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +//! A Vortex encoding that mirrors Arrow's [8-bit Boolean canonical extension type][spec]. +//! +//! Each element is stored as a single byte. The zero byte represents `false` and any +//! non-zero byte represents `true`, matching the truthy semantics of the Arrow spec. This +//! trades 8x the storage of the bit-packed `Bool` layout for cheaper per-byte access — +//! useful when data arrives from a C ABI or other source that already emits byte-wide +//! booleans. On execution the array materializes into the standard bit-packed +//! [`BoolArray`][vortex_array::arrays::BoolArray]. +//! +//! # Examples +//! +//! Any non-zero byte in the backing buffer is treated as `true` when the array executes +//! to a canonical [`BoolArray`][vortex_array::arrays::BoolArray]: +//! +//! ``` +//! # use vortex_array::{IntoArray, LEGACY_SESSION, VortexSessionExecute}; +//! # use vortex_array::arrays::BoolArray; +//! # use vortex_array::arrays::bool::BoolArrayExt; +//! # use vortex_array::buffer::BufferHandle; +//! # use vortex_array::validity::Validity; +//! # use vortex_buffer::ByteBuffer; +//! # use vortex_bytebool::ByteBool; +//! # use vortex_error::VortexResult; +//! # fn main() -> VortexResult<()> { +//! # let mut ctx = LEGACY_SESSION.create_execution_ctx(); +//! let handle = BufferHandle::new_host(ByteBuffer::from(vec![0u8, 1, 42, 0])); +//! let array = ByteBool::new(handle, Validity::NonNullable); +//! +//! let bits = array.into_array().execute::(&mut ctx)?.to_bit_buffer(); +//! assert!(!bits.value(0)); +//! assert!(bits.value(1)); +//! assert!(bits.value(2)); // byte 42 is truthy +//! assert!(!bits.value(3)); +//! # Ok(()) +//! # } +//! ``` +//! +//! [spec]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean + pub use array::*; mod array;