vortex-data · a10y · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock
@@ -3368,6 +3368,10 @@ pub struct vortex_array::arrays::patched::Patched
 
 impl vortex_array::arrays::patched::Patched
 
+pub const vortex_array::arrays::patched::Patched::ID: vortex_array::ArrayId
+
+impl vortex_array::arrays::patched::Patched
+
 pub fn vortex_array::arrays::patched::Patched::from_array_and_patches(inner: vortex_array::ArrayRef, patches: &vortex_array::patches::Patches, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::Array<vortex_array::arrays::patched::Patched>>
 
 impl core::clone::Clone for vortex_array::arrays::patched::Patched
@@ -3584,6 +3588,8 @@ pub fn T::patch_values(&self) -> &vortex_array::ArrayRef
 
 pub fn T::slots_view(&self) -> vortex_array::arrays::patched::PatchedSlotsView<'_>
 
+pub fn vortex_array::arrays::patched::apply_patches_primitive<V: vortex_array::dtype::NativePType>(output: &mut [V], offset: usize, len: usize, n_lanes: usize, lane_offsets: &[u32], indices: &[u16], values: &[V])
+
 pub type vortex_array::arrays::patched::PatchedArray = vortex_array::Array<vortex_array::arrays::patched::Patched>
 
 pub mod vortex_array::arrays::primitive
@@ -6218,6 +6224,10 @@ pub struct vortex_array::arrays::Patched
 
 impl vortex_array::arrays::patched::Patched
 
+pub const vortex_array::arrays::patched::Patched::ID: vortex_array::ArrayId
+
+impl vortex_array::arrays::patched::Patched
+
 pub fn vortex_array::arrays::patched::Patched::from_array_and_patches(inner: vortex_array::ArrayRef, patches: &vortex_array::patches::Patches, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::Array<vortex_array::arrays::patched::Patched>>
 
 impl core::clone::Clone for vortex_array::arrays::patched::Patched

diff --git a/vortex-array/src/arrays/patched/vtable/mod.rs b/vortex-array/src/arrays/patched/vtable/mod.rs
@@ -57,6 +57,11 @@ pub type PatchedArray = Array<Patched>;
 #[derive(Clone, Debug)]
 pub struct Patched;
 
+impl Patched {
+    /// Encoding identifier for Patched arrays; `VTable::id` returns this constant.
+    pub const ID: ArrayId = ArrayId::new_ref("vortex.patched");
+}
+
 impl ValidityChild<Patched> for Patched {
     fn validity_child(array: ArrayView<'_, Patched>) -> ArrayRef {
         array.inner().clone()
@@ -99,7 +104,7 @@ impl VTable for Patched {
     type ValidityVTable = ValidityVTableFromChild;
 
     fn id(&self) -> ArrayId {
-        ArrayId::new_ref("vortex.patched")
+        Self::ID
     }
 
     fn validate(
@@ -318,7 +323,12 @@ impl VTable for Patched {
 }
 
 /// Apply patches on top of the existing value types.
-fn apply_patches_primitive<V: NativePType>(
+///
+/// This function is used to overwrite values in the output buffer with patch values
+/// at the specified indices. It handles the chunked layout where patches are organized
+/// by lanes within 1024-element chunks.
+#[allow(clippy::too_many_arguments)]
+pub fn apply_patches_primitive<V: NativePType>(
     output: &mut [V],
     offset: usize,
     len: usize,

diff --git a/vortex-cuda/kernels/src/patched.cu b/vortex-cuda/kernels/src/patched.cu
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#include "types.cuh"
+
+/// Overwrite entries of `output` with patch values stored in the transposed Patched array format.
+///
+/// Thread-per-lane model: each thread owns one (chunk, lane) slot, walks that slot's patches, and
+/// writes every patch falling inside [offset, offset + len) to output[global index - offset].
+template <typename ValueT>
+__device__ void patched(ValueT *const output,
+                        const uint32_t *const lane_offsets,
+                        const uint16_t *const patch_indices,
+                        const ValueT *const patch_values,
+                        uint32_t n_lanes,
+                        uint32_t total_lane_slots,
+                        uint64_t offset,
+                        uint64_t len) {
+    const uint32_t lane_slot = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Surplus threads (slot index at or past total_lane_slots) have no work
+    if (lane_slot >= total_lane_slots) {
+        return;
+    }
+
+    // Slots are laid out chunk-major: n_lanes consecutive slots belong to one chunk
+    const uint32_t chunk = lane_slot / n_lanes;
+
+    // Patches for this slot are [start, stop); lane_offsets must hold total_lane_slots + 1 entries
+    const uint32_t start = lane_offsets[lane_slot];
+    const uint32_t stop = lane_offsets[lane_slot + 1];
+
+    // Apply every patch in this slot
+    for (uint32_t p = start; p < stop; p++) {
+        // Patch index is relative to its 1024-element chunk; widen to 64-bit before scaling
+        const uint16_t within_chunk_idx = patch_indices[p];
+        const uint64_t global_idx = static_cast<uint64_t>(chunk) * 1024 + within_chunk_idx;
+
+        // Clip to the sliced window; the break below assumes ascending indices within a slot
+        if (global_idx < offset) {
+            continue;
+        }
+
+        if (global_idx >= offset + len) {
+            break;
+        }
+
+        output[global_idx - offset] = patch_values[p];
+    }
+}
+
+#define GENERATE_PATCHED_KERNEL(value_suffix, ValueT)                                                        \
+    extern "C" __global__ void patched_##value_suffix(ValueT *const output,                                  \
+                                                      const uint32_t *const lane_offsets,                    \
+                                                      const uint16_t *const patch_indices,                   \
+                                                      const ValueT *const patch_values,                      \
+                                                      uint32_t n_lanes,                                      \
+                                                      uint32_t total_lane_slots,                             \
+                                                      uint64_t offset,                                       \
+                                                      uint64_t len) {                                        \
+        patched(output, lane_offsets, patch_indices, patch_values, n_lanes, total_lane_slots, offset, len);  \
+    }
+
+// Instantiate one extern "C" patched_<suffix> kernel for every native SIMD ptype
+FOR_EACH_NATIVE_SIMD_PTYPE(GENERATE_PATCHED_KERNEL)
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -23,12 +23,14 @@ use vortex::encodings::fastlanes::unpack_iter::BitPacked as BitPackedUnpack;
 use vortex::error::VortexResult;
 use vortex::error::vortex_ensure;
 use vortex::error::vortex_err;
+use vortex_error::vortex_bail;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
 use crate::kernel::patches::gpu::GPUPatches;
+use crate::kernel::patches::types::DevicePatches;
 use crate::kernel::patches::types::transpose_patches;
 
 /// CUDA decoder for bit-packed arrays.
@@ -53,7 +55,7 @@ impl CudaExecute for BitPackedExecutor {
             Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected BitPackedArray"))?;
 
         match_each_integer_ptype!(array.ptype(array.dtype()), |A| {
-            decode_bitpacked::<A>(array, A::default(), ctx).await
+            decode_bitpacked::<A>(array, A::default(), None, ctx).await
         })
     }
 }
@@ -90,6 +92,7 @@ unsafe impl DeviceRepr for GPUPatches {}
 pub(crate) async fn decode_bitpacked<A>(
     array: BitPackedArray,
     reference: A,
+    device_patches: Option<DevicePatches>,
     ctx: &mut CudaExecutionCtx,
 ) -> VortexResult<Canonical>
 where
@@ -101,7 +104,7 @@ where
         bit_width,
         len,
         packed,
-        patches,
+        patches: interior_patches,
         validity,
     } = BitPacked::into_parts(array);
 
@@ -122,11 +125,14 @@ where
     let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, ctx)?;
     let config = bitpacked_cuda_launch_config(output_width, len)?;
 
-    // We hold this here to keep the device buffers alive.
-    let device_patches = if let Some(patches) = patches {
-        Some(transpose_patches(&patches, ctx).await?)
-    } else {
-        None
+    // Resolve the patch source: transposed interior patches or caller-supplied device patches.
+    let device_patches = match (interior_patches, device_patches) {
+        (None, None) => None,
+        (Some(patches), None) => Some(transpose_patches(&patches, ctx).await?),
+        (None, Some(device_patches)) => Some(device_patches),
+        (Some(_), Some(_)) => {
+            vortex_bail!("Cannot execute bitpacked array with interior and exterior patches")
+        }
     };
 
     let patches_arg = if let Some(p) = &device_patches {

diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs
@@ -56,7 +56,7 @@ impl CudaExecute for FoRExecutor {
         if let Some(bitpacked) = array.encoded().as_opt::<BitPacked>() {
             match_each_integer_ptype!(bitpacked.ptype(bitpacked.dtype()), |P| {
                 let reference: P = array.reference_scalar().try_into()?;
-                return decode_bitpacked(bitpacked.into_owned(), reference, ctx).await;
+                return decode_bitpacked(bitpacked.into_owned(), reference, None, ctx).await;
             })
         }
 
@@ -67,7 +67,7 @@ impl CudaExecute for FoRExecutor {
             let slice_range = slice_array.slice_range().clone();
             let unpacked = match_each_integer_ptype!(bitpacked.ptype(bitpacked.dtype()), |P| {
                 let reference: P = array.reference_scalar().try_into()?;
-                decode_bitpacked(bitpacked.into_owned(), reference, ctx).await?
+                decode_bitpacked(bitpacked.into_owned(), reference, None, ctx).await?
             });
 
             return unpacked

diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs
@@ -6,6 +6,7 @@ mod bitpacked;
 mod date_time_parts;
 mod decimal_byte_parts;
 mod for_;
+mod patched;
 mod runend;
 mod sequence;
 mod zigzag;
@@ -18,6 +19,7 @@ pub(crate) use bitpacked::BitPackedExecutor;
 pub(crate) use date_time_parts::DateTimePartsExecutor;
 pub(crate) use decimal_byte_parts::DecimalBytePartsExecutor;
 pub(crate) use for_::FoRExecutor;
+pub(crate) use patched::PatchedExecutor;
 pub(crate) use runend::RunEndExecutor;
 pub(crate) use sequence::SequenceExecutor;
 pub(crate) use zigzag::ZigZagExecutor;