Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions vortex-array/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3368,6 +3368,10 @@ pub struct vortex_array::arrays::patched::Patched

impl vortex_array::arrays::patched::Patched

pub const vortex_array::arrays::patched::Patched::ID: vortex_array::ArrayId

impl vortex_array::arrays::patched::Patched

pub fn vortex_array::arrays::patched::Patched::from_array_and_patches(inner: vortex_array::ArrayRef, patches: &vortex_array::patches::Patches, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::Array<vortex_array::arrays::patched::Patched>>

impl core::clone::Clone for vortex_array::arrays::patched::Patched
Expand Down Expand Up @@ -3584,6 +3588,8 @@ pub fn T::patch_values(&self) -> &vortex_array::ArrayRef

pub fn T::slots_view(&self) -> vortex_array::arrays::patched::PatchedSlotsView<'_>

pub fn vortex_array::arrays::patched::apply_patches_primitive<V: vortex_array::dtype::NativePType>(output: &mut [V], offset: usize, len: usize, n_lanes: usize, lane_offsets: &[u32], indices: &[u16], values: &[V])

pub type vortex_array::arrays::patched::PatchedArray = vortex_array::Array<vortex_array::arrays::patched::Patched>

pub mod vortex_array::arrays::primitive
Expand Down Expand Up @@ -6218,6 +6224,10 @@ pub struct vortex_array::arrays::Patched

impl vortex_array::arrays::patched::Patched

pub const vortex_array::arrays::patched::Patched::ID: vortex_array::ArrayId

impl vortex_array::arrays::patched::Patched

pub fn vortex_array::arrays::patched::Patched::from_array_and_patches(inner: vortex_array::ArrayRef, patches: &vortex_array::patches::Patches, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::Array<vortex_array::arrays::patched::Patched>>

impl core::clone::Clone for vortex_array::arrays::patched::Patched
Expand Down
14 changes: 12 additions & 2 deletions vortex-array/src/arrays/patched/vtable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ pub type PatchedArray = Array<Patched>;
#[derive(Clone, Debug)]
pub struct Patched;

impl Patched {
/// The array ID for Patched arrays.
///
/// Constructed from the string `"vortex.patched"`. Exposing it as an associated constant
/// lets code refer to the identifier as `Patched::ID` without needing a `Patched` value;
/// the `VTable::id` implementation for `Patched` returns this same constant.
pub const ID: ArrayId = ArrayId::new_ref("vortex.patched");
}

impl ValidityChild<Patched> for Patched {
fn validity_child(array: ArrayView<'_, Patched>) -> ArrayRef {
array.inner().clone()
Expand Down Expand Up @@ -99,7 +104,7 @@ impl VTable for Patched {
type ValidityVTable = ValidityVTableFromChild;

fn id(&self) -> ArrayId {
ArrayId::new_ref("vortex.patched")
Self::ID
}

fn validate(
Expand Down Expand Up @@ -318,7 +323,12 @@ impl VTable for Patched {
}

/// Apply patches on top of the existing value types.
fn apply_patches_primitive<V: NativePType>(
///
/// This function is used to overwrite values in the output buffer with patch values
/// at the specified indices. It handles the chunked layout where patches are organized
/// by lanes within 1024-element chunks.
#[allow(clippy::too_many_arguments)]
pub fn apply_patches_primitive<V: NativePType>(
output: &mut [V],
offset: usize,
len: usize,
Expand Down
65 changes: 65 additions & 0 deletions vortex-cuda/kernels/src/patched.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#include "types.cuh"

/// Apply patches to an output array using the transposed Patched array format.
///
/// This kernel uses a thread-per-lane model where each thread is assigned to
/// one (chunk, lane) slot and applies all patches in that slot.
///
/// Parameters:
/// - `output`: destination buffer. A patch whose global index is `g` is written
///   to `output[g - offset]`.
/// - `lane_offsets`: for lane slot `s`, the patches of that slot occupy the
///   half-open range `[lane_offsets[s], lane_offsets[s + 1])` of `patch_indices`
///   and `patch_values`. The kernel reads `lane_offsets[s + 1]` for every
///   `s < total_lane_slots`, so the buffer must hold at least
///   `total_lane_slots + 1` entries.
/// - `patch_indices`: position of each patch within its chunk; combined with the
///   chunk number as `chunk * 1024 + index` to form the global index.
/// - `patch_values`: value to store for each patch, parallel to `patch_indices`.
/// - `n_lanes`: number of lane slots per chunk (`chunk = lane_slot / n_lanes`).
/// - `total_lane_slots`: number of lane slots to process; threads whose flat
///   index is at or beyond this value return without doing any work.
/// - `offset`, `len`: only patches whose global index lies in
///   `[offset, offset + len)` are written.
template <typename ValueT>
__device__ void patched(ValueT *const output,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! Ideally we could move to a world where we drop the other patches.cu eventually and only operate on transposed patches on the GPU, maybe.

const uint32_t *const lane_offsets,
const uint16_t *const patch_indices,
const ValueT *const patch_values,
uint32_t n_lanes,
uint32_t total_lane_slots,
uint64_t offset,
uint64_t len) {
// Flat thread index across the launch; one thread handles one lane slot.
const uint32_t lane_slot = blockIdx.x * blockDim.x + threadIdx.x;

// Early return if this thread is beyond the number of lane slots
if (lane_slot >= total_lane_slots) {
return;
}

// Determine which chunk this lane slot belongs to
const uint32_t chunk = lane_slot / n_lanes;

// Get the range of patches for this lane slot
const uint32_t start = lane_offsets[lane_slot];
const uint32_t stop = lane_offsets[lane_slot + 1];

// Apply all patches in this lane
for (uint32_t p = start; p < stop; p++) {
// Get within-chunk index and compute global position
// (widened to 64 bits before the multiply; each chunk spans 1024 positions)
const uint16_t within_chunk_idx = patch_indices[p];
const uint64_t global_idx = static_cast<uint64_t>(chunk) * 1024 + within_chunk_idx;

// Check bounds (for sliced arrays)
// Patches located before the window are skipped individually.
if (global_idx < offset) {
continue;
}

// The first patch at or past the end of the window ends processing of the
// whole slot, not just this patch.
// NOTE(review): this presumes indices within a lane slot are stored in
// ascending order; otherwise later in-window patches would be dropped.
// Confirm against the code that builds the transposed patches.
if (global_idx >= offset + len) {
break;
}

// Rebase the global index so it is relative to the start of the window.
output[global_idx - offset] = patch_values[p];
}
}

// Defines one `extern "C" __global__` entry point named `patched_<value_suffix>`
// for the element type `ValueT`. The generated kernel takes the same parameter
// list as the `patched` device template and forwards every argument to it
// unchanged; the `extern "C"` linkage gives each instantiation an unmangled name.
#define GENERATE_PATCHED_KERNEL(value_suffix, ValueT) \
extern "C" __global__ void patched_##value_suffix(ValueT *const output, \
const uint32_t *const lane_offsets, \
const uint16_t *const patch_indices, \
const ValueT *const patch_values, \
uint32_t n_lanes, \
uint32_t total_lane_slots, \
uint64_t offset, \
uint64_t len) { \
patched(output, lane_offsets, patch_indices, patch_values, n_lanes, total_lane_slots, offset, len); \
}

// Generate for all native SIMD ptypes
// NOTE(review): FOR_EACH_NATIVE_SIMD_PTYPE is not defined in this file (presumably
// in types.cuh); it is expected to invoke the macro above once per
// (suffix, type) pair — confirm the exact set of types there.
FOR_EACH_NATIVE_SIMD_PTYPE(GENERATE_PATCHED_KERNEL)
20 changes: 13 additions & 7 deletions vortex-cuda/src/kernel/encodings/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ use vortex::encodings::fastlanes::unpack_iter::BitPacked as BitPackedUnpack;
use vortex::error::VortexResult;
use vortex::error::vortex_ensure;
use vortex::error::vortex_err;
use vortex_error::vortex_bail;

use crate::CudaBufferExt;
use crate::CudaDeviceBuffer;
use crate::executor::CudaExecute;
use crate::executor::CudaExecutionCtx;
use crate::kernel::patches::gpu::GPUPatches;
use crate::kernel::patches::types::DevicePatches;
use crate::kernel::patches::types::transpose_patches;

/// CUDA decoder for bit-packed arrays.
Expand All @@ -53,7 +55,7 @@ impl CudaExecute for BitPackedExecutor {
Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected BitPackedArray"))?;

match_each_integer_ptype!(array.ptype(array.dtype()), |A| {
decode_bitpacked::<A>(array, A::default(), ctx).await
decode_bitpacked::<A>(array, A::default(), None, ctx).await
})
}
}
Expand Down Expand Up @@ -90,6 +92,7 @@ unsafe impl DeviceRepr for GPUPatches {}
pub(crate) async fn decode_bitpacked<A>(
array: BitPackedArray,
reference: A,
device_patches: Option<DevicePatches>,
ctx: &mut CudaExecutionCtx,
) -> VortexResult<Canonical>
where
Expand All @@ -101,7 +104,7 @@ where
bit_width,
len,
packed,
patches,
patches: interior_patches,
validity,
} = BitPacked::into_parts(array);

Expand All @@ -122,11 +125,14 @@ where
let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, ctx)?;
let config = bitpacked_cuda_launch_config(output_width, len)?;

// We hold this here to keep the device buffers alive.
let device_patches = if let Some(patches) = patches {
Some(transpose_patches(&patches, ctx).await?)
} else {
None
// Execute the patch kind to get device patches
let device_patches = match (interior_patches, device_patches) {
(None, None) => None,
(Some(patches), None) => Some(transpose_patches(&patches, ctx).await?),
(None, Some(device_patches)) => Some(device_patches),
(Some(_), Some(_)) => {
vortex_bail!("Cannot execute bitpacked array with interior and exterior patches")
}
};

let patches_arg = if let Some(p) = &device_patches {
Expand Down
4 changes: 2 additions & 2 deletions vortex-cuda/src/kernel/encodings/for_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ impl CudaExecute for FoRExecutor {
if let Some(bitpacked) = array.encoded().as_opt::<BitPacked>() {
match_each_integer_ptype!(bitpacked.ptype(bitpacked.dtype()), |P| {
let reference: P = array.reference_scalar().try_into()?;
return decode_bitpacked(bitpacked.into_owned(), reference, ctx).await;
return decode_bitpacked(bitpacked.into_owned(), reference, None, ctx).await;
})
}

Expand All @@ -67,7 +67,7 @@ impl CudaExecute for FoRExecutor {
let slice_range = slice_array.slice_range().clone();
let unpacked = match_each_integer_ptype!(bitpacked.ptype(bitpacked.dtype()), |P| {
let reference: P = array.reference_scalar().try_into()?;
decode_bitpacked(bitpacked.into_owned(), reference, ctx).await?
decode_bitpacked(bitpacked.into_owned(), reference, None, ctx).await?
});

return unpacked
Expand Down
2 changes: 2 additions & 0 deletions vortex-cuda/src/kernel/encodings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod bitpacked;
mod date_time_parts;
mod decimal_byte_parts;
mod for_;
mod patched;
mod runend;
mod sequence;
mod zigzag;
Expand All @@ -18,6 +19,7 @@ pub(crate) use bitpacked::BitPackedExecutor;
pub(crate) use date_time_parts::DateTimePartsExecutor;
pub(crate) use decimal_byte_parts::DecimalBytePartsExecutor;
pub(crate) use for_::FoRExecutor;
pub(crate) use patched::PatchedExecutor;
pub(crate) use runend::RunEndExecutor;
pub(crate) use sequence::SequenceExecutor;
pub(crate) use zigzag::ZigZagExecutor;
Expand Down
Loading
Loading