Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
cd086d9
Add Mojo AOT-compiled SIMD take kernels for primitive arrays
claude Apr 10, 2026
3ad4a64
Add Mojo SIMD filter kernel for primitive arrays
claude Apr 10, 2026
079e0da
Add divan benchmark comparing scalar vs AVX2 vs Mojo take kernel
claude Apr 10, 2026
6bfda92
Install Mojo SDK in codspeed benchmark CI for vortex-array
claude Apr 10, 2026
59bb1ea
Fix SIGILL in CI: pin Mojo target to x86-64-v3 (AVX2)
claude Apr 10, 2026
64fdd36
Fix nightly rustfmt: split grouped imports, reorder super:: imports
claude Apr 10, 2026
457d81a
Deprioritize Mojo below AVX2 on x86_64 in take dispatch
claude Apr 10, 2026
2afb139
Fix Mojo build in Cargo: pass --target-triple from TARGET env var
claude Apr 10, 2026
f14c2f8
Optimize Mojo gather: 4x unroll + skylake target for vpgatherqd
claude Apr 10, 2026
d60f190
Add --mtune to Mojo build for better instruction scheduling
claude Apr 10, 2026
0d0fd77
Promote Mojo to top-priority take kernel when available
claude Apr 10, 2026
f214933
Add Mojo SIMD broadcast decode for run-end primitive arrays
claude Apr 10, 2026
900985a
Merge develop, resolve conflicts in Cargo.toml and take/mod.rs
claude Apr 10, 2026
f2f14b4
Add scalar baseline to runend decode benchmark for codspeed comparison
claude Apr 10, 2026
d51800a
Support u64 ends in Mojo runend decode to hit existing benchmarks
claude Apr 10, 2026
7ec0e46
Clean up PR: split kernels per crate, remove unnecessary benchmarks
claude Apr 10, 2026
4d7e86d
Fix lint: use #[allow(unused)] not #[expect(unused)] for TakeKernelSc…
claude Apr 10, 2026
5f2a781
Fix Mojo build on macOS: skip --mcpu=native and --target-triple on Ap…
joseph-isaacs Apr 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -742,9 +742,15 @@ jobs:
uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995
with:
tool: cargo-codspeed
- name: Install Mojo SDK
if: contains(matrix.packages, 'vortex-array') || contains(matrix.packages, 'vortex-runend')
run: |
pip install --user mojo
echo "$HOME/.local/bin" >> "$GITHUB_PATH"
- name: Build benchmarks
env:
RUSTFLAGS: "-C target-feature=+avx2"
MOJO_MCPU: "skylake"
run: cargo codspeed build ${{ matrix.features }} $(printf -- '-p %s ' ${{ matrix.packages }}) --profile bench
- name: Run benchmarks
uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ unused_lifetimes = "deny"
unused_qualifications = "deny"
unexpected_cfgs = { level = "deny", check-cfg = [
"cfg(codspeed)",
"cfg(vortex_mojo)",
'cfg(target_os, values("unknown"))',
] }
warnings = "warn"
Expand Down
123 changes: 123 additions & 0 deletions encodings/runend/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![allow(clippy::unwrap_used)]
#![allow(clippy::expect_used)]

//! Build script for vortex-runend.
//!
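//! Compiles this crate's Mojo SIMD run-end decode kernel (`kernels/decode.mojo`)
//! and links it as a static library. When that succeeds, the `vortex_mojo` cfg flag
//! is emitted so Rust code can conditionally use the Mojo decode path; when Mojo is
//! missing or any build step fails, the flag is not set and the Rust decoder is used.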

use std::env;
use std::path::Path;
use std::path::PathBuf;
use std::process::Command;

/// Build-script entry point.
///
/// Looks for a Mojo toolchain, AOT-compiles `kernels/decode.mojo` to an object file,
/// archives it with `ar` into `libvortex_mojo_runend.a`, then tells Cargo to link that
/// archive and to set the `vortex_mojo` cfg.
///
/// Every failure along the way is deliberately non-fatal: the script returns early
/// (emitting a Cargo warning where something actually went wrong) without setting
/// `vortex_mojo`, so the crate builds with the pure-Rust decoder instead.
///
/// # Panics
///
/// Panics if Cargo did not provide `CARGO_MANIFEST_DIR` or `OUT_DIR`, which would mean
/// the script is not being run by Cargo at all.
fn main() {
    // The Mojo kernel lives alongside this crate.
    let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");
    let kernel_src = Path::new(&manifest_dir).join("kernels/decode.mojo");

    println!("cargo:rerun-if-changed={}", kernel_src.display());
    // Once any `rerun-if-*` directive is emitted, Cargo re-runs the script only for the
    // listed inputs. MOJO_MCPU changes the generated object code, so it must be listed too.
    println!("cargo:rerun-if-env-changed=MOJO_MCPU");

    // No Mojo toolchain is the normal case for most builds: stay silent and fall back.
    let Some(mojo_bin) = find_mojo() else {
        return;
    };

    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set"));
    let obj_path = out_dir.join("vortex_mojo_runend.o");

    // Use MOJO_MCPU to control target CPU (defaults to "native").
    // CI sets this to "skylake" for vpgatherqd and SIMD broadcast.
    let mcpu = env::var("MOJO_MCPU").unwrap_or_else(|_| "native".to_owned());

    // On Apple targets Mojo auto-detects the host correctly on its own, but it rejects
    // Cargo's target-triple format, and `--mcpu native` also triggers its broken
    // host-triple detection. So the triple is never forwarded there, and the CPU flags
    // are only passed when MOJO_MCPU names an explicit CPU.
    let target = env::var("TARGET").ok();
    let is_apple = target.as_deref().is_some_and(|t| t.contains("apple"));
    let target_triple = target.filter(|_| !is_apple);

    let mut cmd = Command::new(&mojo_bin);
    cmd.arg("build").arg("--emit").arg("object");

    if !is_apple || mcpu != "native" {
        cmd.arg("--mcpu").arg(&mcpu).arg("--mtune").arg(&mcpu);
    }

    if let Some(triple) = &target_triple {
        cmd.arg("--target-triple").arg(triple);
    }

    cmd.arg("-o").arg(&obj_path).arg(&kernel_src);

    match cmd.status() {
        Ok(status) if status.success() => {}
        Ok(status) => {
            // `ExitStatus` already renders as e.g. "exit status: 1".
            println!(
                "cargo:warning=Mojo AOT compilation failed ({status}), falling back to Rust decode"
            );
            return;
        }
        Err(e) => {
            println!("cargo:warning=Mojo compilation failed to launch: {e}");
            return;
        }
    }

    // Wrap the single object file in a static archive so rustc can link it by name.
    let lib_path = out_dir.join("libvortex_mojo_runend.a");
    let ar_status = Command::new("ar")
        .arg("rcs")
        .arg(&lib_path)
        .arg(&obj_path)
        .status();

    match ar_status {
        Ok(status) if status.success() => {}
        Ok(status) => {
            println!("cargo:warning=ar failed ({status}), falling back to Rust decode");
            return;
        }
        Err(e) => {
            println!("cargo:warning=failed to launch ar: {e}, falling back to Rust decode");
            return;
        }
    }

    // Only reached when the archive exists, so `vortex_mojo` always implies the
    // `vortex_runend_decode_*` symbols are available at link time.
    println!("cargo:rustc-link-search=native={}", out_dir.display());
    println!("cargo:rustc-link-lib=static=vortex_mojo_runend");
    println!("cargo:rustc-cfg=vortex_mojo");
}

/// Locates a working `mojo` executable.
///
/// Tries `mojo` as resolved through `PATH` first, then `$HOME/.local/bin/mojo` (where
/// `pip install --user` places it). A candidate counts as working only if running it
/// with `--version` launches and exits successfully. Returns `None` if neither does.
fn find_mojo() -> Option<PathBuf> {
    /// Returns `true` if `<candidate> --version` can be spawned and exits with success.
    fn responds(candidate: &Path) -> bool {
        Command::new(candidate)
            .arg("--version")
            .output()
            .is_ok_and(|out| out.status.success())
    }

    let on_path = PathBuf::from("mojo");
    if responds(&on_path) {
        return Some(on_path);
    }

    // Fall back to the per-user pip install location; skip the spawn entirely when the
    // file is not there.
    env::var("HOME")
        .ok()
        .map(|home| PathBuf::from(home).join(".local/bin/mojo"))
        .filter(|pip_mojo| pip_mojo.exists() && responds(pip_mojo))
}
97 changes: 97 additions & 0 deletions encodings/runend/kernels/decode.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

# Mojo AOT-compiled SIMD run-end decode kernels for Vortex.
#
# Decodes run-end encoded primitive arrays using SIMD broadcast + store.
# For each run, the value is broadcast to a SIMD register and written
# in chunks (vpbroadcastd + vmovdqu on AVX2). 2-4x faster than scalar
# fill for run_length >= 8.

from std.memory import UnsafePointer

# SIMD lane counts matching 256-bit registers (AVX2 baseline).
comptime W1: Int = 32 # 1-byte values
comptime W2: Int = 16 # 2-byte values
comptime W4: Int = 8 # 4-byte values
comptime W8: Int = 4 # 8-byte values


# Generic run-end decode body shared by every exported kernel below.
#
# Compile-time parameters:
#   VT - element type of the values and of the destination buffer
#   ET - element type of the run ends
#   W  - number of VT lanes written per SIMD store
#
# Runtime arguments are raw addresses passed as plain integers:
#   ends_addr   - start of `num_runs` run ends of type ET
#   values_addr - start of `num_runs` values of type VT
#   dst_addr    - start of the output buffer of VT elements
#   num_runs    - number of (end, value) pairs to read
#
# For each run the value is written to dst[pos .. end), where `pos` is the previous
# run's end (0 for the first run). Nothing here is bounds-checked and there is no
# output-length argument: the caller must guarantee that dst has room for every
# index below the largest end that is read.
@always_inline
fn _runend_decode[VT: DType, ET: DType, W: Int](
ends_addr: Int,
values_addr: Int,
dst_addr: Int,
num_runs: Int,
):
"""Decode run-end encoded data using SIMD broadcast fill."""
# _e and _v are dummies: they exist only so the pointer types EP / VP can be
# derived from them with type_of.
var _e: Scalar[ET] = 0
var _v: Scalar[VT] = 0
comptime EP = type_of(UnsafePointer(to=_e))
comptime VP = type_of(UnsafePointer(to=_v))

# Reinterpret the integer addresses as typed pointers.
var ends = EP(unsafe_from_address=ends_addr)
var values = VP(unsafe_from_address=values_addr)
var dst = VP(unsafe_from_address=dst_addr)

# pos is the output index where the current run starts.
var pos = 0
for run in range(num_runs):
var end = Int(ends[run])
var val = values[run]
# If end < pos this is negative and none of the fill loops below execute.
var run_len = end - pos

# 4x unrolled SIMD broadcast fill
var vec = SIMD[VT, W](val)
var i = 0
while i + W * 4 <= run_len:
dst.store[width=W](pos + i, vec)
dst.store[width=W](pos + i + W, vec)
dst.store[width=W](pos + i + W * 2, vec)
dst.store[width=W](pos + i + W * 3, vec)
i += W * 4

# Single-vector stores for what the unrolled loop left over.
while i + W <= run_len:
dst.store[width=W](pos + i, vec)
i += W

# Scalar remainder
while i < run_len:
dst[pos + i] = val
i += 1

pos = end


# u32 ends variants
#
# Exported entry points, one per value byte width. Each forwards to _runend_decode
# with uint32 ends, an unsigned integer value type of the stated width, and the
# matching lane count (W1/W2/W4/W8). `ends`, `values` and `dst` are raw addresses;
# `n` is the number of runs.
@export("vortex_runend_decode_1byte")
fn runend_decode_1byte(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint8, DType.uint32, W1](ends, values, dst, n)

@export("vortex_runend_decode_2byte")
fn runend_decode_2byte(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint16, DType.uint32, W2](ends, values, dst, n)

@export("vortex_runend_decode_4byte")
fn runend_decode_4byte(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint32, DType.uint32, W4](ends, values, dst, n)

@export("vortex_runend_decode_8byte")
fn runend_decode_8byte(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint64, DType.uint32, W8](ends, values, dst, n)

# u64 ends variants
#
# Same entry points as the u32 group, but the run ends are read as uint64.
# `ends`, `values` and `dst` are raw addresses; `n` is the number of runs.
@export("vortex_runend_decode_1byte_u64ends")
fn runend_decode_1byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint8, DType.uint64, W1](ends, values, dst, n)

@export("vortex_runend_decode_2byte_u64ends")
fn runend_decode_2byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint16, DType.uint64, W2](ends, values, dst, n)

@export("vortex_runend_decode_4byte_u64ends")
fn runend_decode_4byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint32, DType.uint64, W4](ends, values, dst, n)

@export("vortex_runend_decode_8byte_u64ends")
fn runend_decode_8byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
_runend_decode[DType.uint64, DType.uint64, W8](ends, values, dst, n)
92 changes: 92 additions & 0 deletions encodings/runend/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,15 @@ pub fn runend_decode_primitive(
offset: usize,
length: usize,
) -> VortexResult<PrimitiveArray> {
// Fast path: Mojo SIMD broadcast decode for non-nullable values with u32 or u64 ends
// and no offset (the common case for full-array canonicalization).
#[cfg(vortex_mojo)]
{
if let Some(result) = mojo_decode::try_mojo_decode(&ends, &values, offset, length)? {
return Ok(result);
}
}

let validity_mask = values.validity_mask()?;
Ok(match_each_native_ptype!(values.ptype(), |P| {
match_each_unsigned_integer_ptype!(ends.ptype(), |E| {
Expand Down Expand Up @@ -369,3 +378,86 @@ mod test {
Ok(())
}
}

// ---------------------------------------------------------------------------
// Mojo SIMD broadcast decode — used when the Mojo SDK was available at build time.
// ---------------------------------------------------------------------------

#[cfg(vortex_mojo)]
mod mojo_decode {
    use vortex_array::arrays::PrimitiveArray;
    use vortex_array::arrays::primitive::PrimitiveArrayExt;
    use vortex_array::dtype::PType;
    use vortex_array::match_each_native_ptype;
    use vortex_array::match_each_unsigned_integer_ptype;
    use vortex_buffer::BufferMut;
    use vortex_error::VortexResult;

    /// Signature shared by every exported Mojo kernel: `(ends, values, dst, num_runs)`,
    /// with the three pointers passed as plain addresses.
    type DecodeKernel = unsafe extern "C" fn(usize, usize, usize, usize);

    // Kernels compiled from `kernels/decode.mojo`. Each one writes `value[i]` to
    // `dst[end[i - 1]..end[i])` for every run and performs no bounds checking; in
    // particular it is NOT told how large `dst` is.
    unsafe extern "C" {
        // u32 ends
        fn vortex_runend_decode_1byte(ends: usize, vals: usize, dst: usize, n: usize);
        fn vortex_runend_decode_2byte(ends: usize, vals: usize, dst: usize, n: usize);
        fn vortex_runend_decode_4byte(ends: usize, vals: usize, dst: usize, n: usize);
        fn vortex_runend_decode_8byte(ends: usize, vals: usize, dst: usize, n: usize);
        // u64 ends
        fn vortex_runend_decode_1byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
        fn vortex_runend_decode_2byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
        fn vortex_runend_decode_4byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
        fn vortex_runend_decode_8byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
    }

    /// Returns the final run end widened to `u64`, or `None` if `ends` is empty or is not
    /// one of the end types the Mojo kernels support (`u32`, `u64`).
    fn last_end(ends: &PrimitiveArray) -> Option<u64> {
        match ends.ptype() {
            PType::U32 => ends.as_slice::<u32>().last().map(|&end| u64::from(end)),
            PType::U64 => ends.as_slice::<u64>().last().copied(),
            _ => None,
        }
    }

    /// Try the Mojo SIMD decode path.
    ///
    /// Returns `Ok(Some(array))` when the kernel was used and `Ok(None)` to fall through
    /// to the generic Rust path. The fast path is taken only when all of the following
    /// hold:
    ///
    /// * `offset == 0` and the values are non-nullable,
    /// * the ends are `u32` or `u64` and the values are 1, 2, 4 or 8 bytes wide,
    /// * the last run end equals `length`, i.e. the runs cover the requested output
    ///   exactly. The kernel has no output-length parameter, so a shorter `length`
    ///   (a truncated array) would overflow the buffer and a longer one would leave
    ///   uninitialized elements behind.
    pub(super) fn try_mojo_decode(
        ends: &PrimitiveArray,
        values: &PrimitiveArray,
        offset: usize,
        length: usize,
    ) -> VortexResult<Option<PrimitiveArray>> {
        // Only handle non-nullable, no offset.
        if offset != 0 || values.dtype().is_nullable() {
            return Ok(None);
        }

        // The kernels copy bit patterns, so they are selected by value width alone.
        let kernel: DecodeKernel = match (ends.ptype(), values.ptype().byte_width()) {
            (PType::U32, 1) => vortex_runend_decode_1byte,
            (PType::U32, 2) => vortex_runend_decode_2byte,
            (PType::U32, 4) => vortex_runend_decode_4byte,
            (PType::U32, 8) => vortex_runend_decode_8byte,
            (PType::U64, 1) => vortex_runend_decode_1byte_u64ends,
            (PType::U64, 2) => vortex_runend_decode_2byte_u64ends,
            (PType::U64, 4) => vortex_runend_decode_4byte_u64ends,
            (PType::U64, 8) => vortex_runend_decode_8byte_u64ends,
            _ => return Ok(None),
        };

        // The kernel writes exactly `last end` elements. Anything other than an exact
        // match with `length` must go through the Rust path, which clamps the ends.
        let covers_exactly =
            u64::try_from(length).is_ok_and(|len| last_end(ends) == Some(len));
        if !covers_exactly {
            return Ok(None);
        }

        match_each_unsigned_integer_ptype!(ends.ptype(), |E| {
            match_each_native_ptype!(values.ptype(), |T| {
                let ends_slice = ends.as_slice::<E>();
                let values_slice: &[T] = values.as_slice();
                let num_runs = ends_slice.len();

                if values_slice.len() < num_runs {
                    // The kernel reads one value per end; never let it read past the
                    // values buffer if the two children disagree in length.
                    Ok(None)
                } else {
                    let mut buffer = BufferMut::<T>::with_capacity(length);

                    // SAFETY:
                    // * `ends_slice` and `values_slice` each hold at least `num_runs`
                    //   elements (checked above), which is all the kernel reads.
                    // * `buffer` has capacity for `length` elements and the last end
                    //   equals `length` (checked above). This relies on the run-end
                    //   invariant that ends are non-decreasing, so no earlier end
                    //   exceeds the last one and runs tile `[0, length)` without gaps.
                    // * Hence every element in `[0, length)` is initialized before
                    //   `set_len(length)`.
                    unsafe {
                        kernel(
                            ends_slice.as_ptr() as usize,
                            values_slice.as_ptr() as usize,
                            buffer.spare_capacity_mut().as_mut_ptr() as usize,
                            num_runs,
                        );
                        buffer.set_len(length);
                    }

                    Ok(Some(PrimitiveArray::new(
                        buffer.freeze(),
                        values.dtype().nullability().into(),
                    )))
                }
            })
        })
    }
}
Loading
Loading