diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fb7b86d204c..44119b2f751 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -742,9 +742,15 @@ jobs:
         uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995
         with:
           tool: cargo-codspeed
+      - name: Install Mojo SDK
+        if: contains(matrix.packages, 'vortex-array') || contains(matrix.packages, 'vortex-runend')
+        run: |
+          pip install --user mojo
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
       - name: Build benchmarks
         env:
           RUSTFLAGS: "-C target-feature=+avx2"
+          MOJO_MCPU: "skylake"
         run: cargo codspeed build ${{ matrix.features }} $(printf -- '-p %s ' ${{ matrix.packages }}) --profile bench
       - name: Run benchmarks
         uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
diff --git a/Cargo.toml b/Cargo.toml
index 9853cf94ed9..0cf26602bfc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -316,6 +316,7 @@ unused_lifetimes = "deny"
 unused_qualifications = "deny"
 unexpected_cfgs = { level = "deny", check-cfg = [
     "cfg(codspeed)",
+    "cfg(vortex_mojo)",
     'cfg(target_os, values("unknown"))',
 ] }
 warnings = "warn"
diff --git a/encodings/runend/build.rs b/encodings/runend/build.rs
new file mode 100644
index 00000000000..356bc1d258c
--- /dev/null
+++ b/encodings/runend/build.rs
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(clippy::unwrap_used)]
+#![allow(clippy::expect_used)]
+
+//! Build script for vortex-runend.
+//!
+//! Compiles the shared Mojo SIMD kernel (which includes run-end decode functions)
+//! and links it as a static library. The `vortex_mojo` cfg flag is emitted so
+//! Rust code can conditionally use the Mojo decode path.
+
+use std::env;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+fn main() {
+    // The Mojo kernel lives alongside this crate.
+    let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");
+    let kernel_src = Path::new(&manifest_dir).join("kernels/decode.mojo");
+
+    println!("cargo:rerun-if-changed={}", kernel_src.display());
+
+    let mojo_bin = find_mojo();
+    let mojo_bin = match mojo_bin {
+        Some(p) => p,
+        None => return,
+    };
+
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set"));
+    let obj_path = out_dir.join("vortex_mojo_runend.o");
+
+    // Use MOJO_MCPU to control target CPU (defaults to "native").
+    // CI sets this to "skylake" for vpgatherqd and SIMD broadcast.
+    let mcpu = env::var("MOJO_MCPU").unwrap_or_else(|_| "native".to_owned());
+
+    // On macOS, Mojo's host detection works correctly but it rejects the Cargo
+    // triple format and "native" CPU also triggers the broken host triple detection.
+    // Skip both flags on Apple targets; Mojo auto-detects correctly without them.
+    let is_apple = env::var("TARGET")
+        .map(|t| t.contains("apple"))
+        .unwrap_or(false);
+    let target_triple = env::var("TARGET")
+        .ok()
+        .filter(|_| !is_apple);
+
+    let mut cmd = Command::new(&mojo_bin);
+    cmd.arg("build").arg("--emit").arg("object");
+
+    if !is_apple || mcpu != "native" {
+        cmd.arg("--mcpu").arg(&mcpu).arg("--mtune").arg(&mcpu);
+    }
+
+    if let Some(triple) = &target_triple {
+        cmd.arg("--target-triple").arg(triple);
+    }
+
+    let status = cmd.arg("-o").arg(&obj_path).arg(&kernel_src).status();
+
+    let status = match status {
+        Ok(s) => s,
+        Err(e) => {
+            println!("cargo:warning=Mojo compilation failed to launch: {e}");
+            return;
+        }
+    };
+
+    if !status.success() {
+        println!(
+            "cargo:warning=Mojo AOT compilation failed (exit {}), falling back to Rust decode",
+            status
+        );
+        return;
+    }
+
+    let lib_path = out_dir.join("libvortex_mojo_runend.a");
+    let ar_status = Command::new("ar")
+        .args(["rcs"])
+        .arg(&lib_path)
+        .arg(&obj_path)
+        .status();
+
+    match ar_status {
+        Ok(s) if s.success() => {}
+        Ok(s) => {
+            println!("cargo:warning=ar failed (exit {s}), falling back to Rust decode");
+            return;
+        }
+        Err(e) => {
+            println!("cargo:warning=ar not found: {e}, falling back to Rust decode");
+            return;
+        }
+    }
+
+    println!("cargo:rustc-link-search=native={}", out_dir.display());
+    println!("cargo:rustc-link-lib=static=vortex_mojo_runend");
+    println!("cargo:rustc-cfg=vortex_mojo");
+}
+
+fn find_mojo() -> Option<PathBuf> {
+    if Command::new("mojo")
+        .arg("--version")
+        .output()
+        .is_ok_and(|o| o.status.success())
+    {
+        return Some(PathBuf::from("mojo"));
+    }
+
+    if let Ok(home) = env::var("HOME") {
+        let pip_mojo = PathBuf::from(home).join(".local/bin/mojo");
+        if pip_mojo.exists()
+            && Command::new(&pip_mojo)
+                .arg("--version")
+                .output()
+                .is_ok_and(|o| o.status.success())
+        {
+            return Some(pip_mojo);
+        }
+    }
+
+    None
+}
diff --git a/encodings/runend/kernels/decode.mojo b/encodings/runend/kernels/decode.mojo
new file mode 100644
index 00000000000..76855dd0a1f
--- /dev/null
+++ b/encodings/runend/kernels/decode.mojo
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+# Mojo AOT-compiled SIMD run-end decode kernels for Vortex.
+#
+# Decodes run-end encoded primitive arrays using SIMD broadcast + store.
+# For each run, the value is broadcast to a SIMD register and written
+# in chunks (vpbroadcastd + vmovdqu on AVX2). 2-4x faster than scalar
+# fill for run_length >= 8.
+
+from std.memory import UnsafePointer
+
+# SIMD lane counts matching 256-bit registers (AVX2 baseline).
+comptime W1: Int = 32  # 1-byte values
+comptime W2: Int = 16  # 2-byte values
+comptime W4: Int = 8   # 4-byte values
+comptime W8: Int = 4   # 8-byte values
+
+
+@always_inline
+fn _runend_decode[VT: DType, ET: DType, W: Int](
+    ends_addr: Int,
+    values_addr: Int,
+    dst_addr: Int,
+    num_runs: Int,
+):
+    """Decode run-end encoded data using SIMD broadcast fill."""
+    var _e: Scalar[ET] = 0
+    var _v: Scalar[VT] = 0
+    comptime EP = type_of(UnsafePointer(to=_e))
+    comptime VP = type_of(UnsafePointer(to=_v))
+
+    var ends = EP(unsafe_from_address=ends_addr)
+    var values = VP(unsafe_from_address=values_addr)
+    var dst = VP(unsafe_from_address=dst_addr)
+
+    var pos = 0
+    for run in range(num_runs):
+        var end = Int(ends[run])
+        var val = values[run]
+        var run_len = end - pos
+
+        # 4x unrolled SIMD broadcast fill
+        var vec = SIMD[VT, W](val)
+        var i = 0
+        while i + W * 4 <= run_len:
+            dst.store[width=W](pos + i, vec)
+            dst.store[width=W](pos + i + W, vec)
+            dst.store[width=W](pos + i + W * 2, vec)
+            dst.store[width=W](pos + i + W * 3, vec)
+            i += W * 4
+
+        while i + W <= run_len:
+            dst.store[width=W](pos + i, vec)
+            i += W
+
+        # Scalar remainder
+        while i < run_len:
+            dst[pos + i] = val
+            i += 1
+
+        pos = end
+
+
+# u32 ends variants
+@export("vortex_runend_decode_1byte")
+fn runend_decode_1byte(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint8, DType.uint32, W1](ends, values, dst, n)
+
+@export("vortex_runend_decode_2byte")
+fn runend_decode_2byte(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint16, DType.uint32, W2](ends, values, dst, n)
+
+@export("vortex_runend_decode_4byte")
+fn runend_decode_4byte(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint32, DType.uint32, W4](ends, values, dst, n)
+
+@export("vortex_runend_decode_8byte")
+fn runend_decode_8byte(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint64, DType.uint32, W8](ends, values, dst, n)
+
+# u64 ends variants
+@export("vortex_runend_decode_1byte_u64ends")
+fn runend_decode_1byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint8, DType.uint64, W1](ends, values, dst, n)
+
+@export("vortex_runend_decode_2byte_u64ends")
+fn runend_decode_2byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint16, DType.uint64, W2](ends, values, dst, n)
+
+@export("vortex_runend_decode_4byte_u64ends")
+fn runend_decode_4byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint32, DType.uint64, W4](ends, values, dst, n)
+
+@export("vortex_runend_decode_8byte_u64ends")
+fn runend_decode_8byte_u64ends(ends: Int, values: Int, dst: Int, n: Int):
+    _runend_decode[DType.uint64, DType.uint64, W8](ends, values, dst, n)
diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs
index df86d6cc5fb..8179c49b840 100644
--- a/encodings/runend/src/compress.rs
+++ b/encodings/runend/src/compress.rs
@@ -177,6 +177,15 @@ pub fn runend_decode_primitive(
     offset: usize,
     length: usize,
 ) -> VortexResult<PrimitiveArray> {
+    // Fast path: Mojo SIMD broadcast decode for non-nullable u32-ended arrays
+    // with no offset (the common case for full-array canonicalization).
+    #[cfg(vortex_mojo)]
+    {
+        if let Some(result) = mojo_decode::try_mojo_decode(&ends, &values, offset, length)? {
+            return Ok(result);
+        }
+    }
+
     let validity_mask = values.validity_mask()?;
     Ok(match_each_native_ptype!(values.ptype(), |P| {
         match_each_unsigned_integer_ptype!(ends.ptype(), |E| {
@@ -369,3 +378,86 @@ mod test {
         Ok(())
     }
 }
+
+// ---------------------------------------------------------------------------
+// Mojo SIMD broadcast decode — used when the Mojo SDK was available at build time.
+// ---------------------------------------------------------------------------
+
+#[cfg(vortex_mojo)]
+mod mojo_decode {
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::primitive::PrimitiveArrayExt;
+    use vortex_array::dtype::PType;
+    use vortex_array::match_each_native_ptype;
+    use vortex_array::match_each_unsigned_integer_ptype;
+    use vortex_buffer::BufferMut;
+    use vortex_error::VortexResult;
+
+    unsafe extern "C" {
+        // u32 ends
+        fn vortex_runend_decode_1byte(ends: usize, vals: usize, dst: usize, n: usize);
+        fn vortex_runend_decode_2byte(ends: usize, vals: usize, dst: usize, n: usize);
+        fn vortex_runend_decode_4byte(ends: usize, vals: usize, dst: usize, n: usize);
+        fn vortex_runend_decode_8byte(ends: usize, vals: usize, dst: usize, n: usize);
+        // u64 ends
+        fn vortex_runend_decode_1byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
+        fn vortex_runend_decode_2byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
+        fn vortex_runend_decode_4byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
+        fn vortex_runend_decode_8byte_u64ends(ends: usize, vals: usize, dst: usize, n: usize);
+    }
+
+    /// Try the Mojo SIMD decode path. Returns `Some` on success, `None` to fall through
+    /// to the generic Rust path (e.g. for nullable values or with offset).
+    pub(super) fn try_mojo_decode(
+        ends: &PrimitiveArray,
+        values: &PrimitiveArray,
+        offset: usize,
+        length: usize,
+    ) -> VortexResult<Option<PrimitiveArray>> {
+        // Only handle non-nullable, no offset.
+        if offset != 0 || values.dtype().is_nullable() {
+            return Ok(None);
+        }
+
+        let val_width = values.ptype().byte_width();
+
+        let kernel: unsafe extern "C" fn(usize, usize, usize, usize) =
+            match (ends.ptype(), val_width) {
+                (PType::U32, 1) => vortex_runend_decode_1byte,
+                (PType::U32, 2) => vortex_runend_decode_2byte,
+                (PType::U32, 4) => vortex_runend_decode_4byte,
+                (PType::U32, 8) => vortex_runend_decode_8byte,
+                (PType::U64, 1) => vortex_runend_decode_1byte_u64ends,
+                (PType::U64, 2) => vortex_runend_decode_2byte_u64ends,
+                (PType::U64, 4) => vortex_runend_decode_4byte_u64ends,
+                (PType::U64, 8) => vortex_runend_decode_8byte_u64ends,
+                _ => return Ok(None),
+            };
+
+        match_each_unsigned_integer_ptype!(ends.ptype(), |E| {
+            match_each_native_ptype!(values.ptype(), |T| {
+                let ends_slice = ends.as_slice::<E>();
+                let values_slice: &[T] = values.as_slice();
+                let num_runs = ends_slice.len();
+                let mut buffer = BufferMut::<T>::with_capacity(length);
+
+                // SAFETY: The Mojo kernel reads `num_runs` ends and values, writes up to
+                // `length` elements to dst. All buffers are pre-allocated.
+                unsafe {
+                    kernel(
+                        ends_slice.as_ptr() as usize,
+                        values_slice.as_ptr() as usize,
+                        buffer.spare_capacity_mut().as_mut_ptr() as usize,
+                        num_runs,
+                    );
+                    buffer.set_len(length);
+                }
+
+                Ok(Some(PrimitiveArray::new(
+                    buffer.freeze(),
+                    values.dtype().nullability().into(),
+                )))
+            })
+        })
+    }
+}
diff --git a/vortex-array/build.rs b/vortex-array/build.rs
new file mode 100644
index 00000000000..cdd5466dd4c
--- /dev/null
+++ b/vortex-array/build.rs
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(clippy::unwrap_used)]
+#![allow(clippy::expect_used)]
+
+//! Build script for vortex-array.
+//!
+//! When the Mojo SDK is installed, this compiles the SIMD take kernels in `kernels/take.mojo`
+//! ahead-of-time into a static library and links it into the crate. The `vortex_mojo` cfg flag
+//! is emitted so that Rust code can conditionally enable the Mojo take path.
+//!
+//! When Mojo is **not** available the build script is a no-op and the existing Rust SIMD kernels
+//! are used instead.
+
+use std::env;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Command;
+
+fn main() {
+    println!("cargo:rerun-if-changed=kernels/");
+
+    let mojo_bin = find_mojo();
+    let mojo_bin = match mojo_bin {
+        Some(p) => p,
+        None => return,
+    };
+
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set"));
+    let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");
+    let kernel_src = Path::new(&manifest_dir).join("kernels/take.mojo");
+
+    let obj_path = out_dir.join("vortex_mojo_take.o");
+
+    // AOT compile the Mojo kernel to a native object file.
+    //
+    // Use MOJO_MCPU to override the target CPU (defaults to "native"). In CI we pin to
+    // "skylake" which enables hardware gather instructions (vpgatherqd) that are critical
+    // for performance — "x86-64-v3" lacks them and LLVM scalarizes the gather.
+    let mcpu = env::var("MOJO_MCPU").unwrap_or_else(|_| "native".to_owned());
+
+    // Cargo sets TARGET to e.g. "x86_64-unknown-linux-gnu". Pass it through so Mojo
+    // doesn't fail with "unknown target triple" when the build env differs from the host.
+    // On macOS, Mojo's host detection works correctly but it rejects the Cargo triple format
+    // and --mcpu=native also triggers the broken host triple detection, so skip both there.
+    let is_apple = env::var("TARGET")
+        .map(|t| t.contains("apple"))
+        .unwrap_or(false);
+    let target_triple = env::var("TARGET")
+        .ok()
+        .filter(|_| !is_apple);
+
+    let mut cmd = Command::new(&mojo_bin);
+    cmd.arg("build").arg("--emit").arg("object");
+
+    if !is_apple || mcpu != "native" {
+        cmd.arg("--mcpu").arg(&mcpu).arg("--mtune").arg(&mcpu);
+    }
+
+    if let Some(triple) = &target_triple {
+        cmd.arg("--target-triple").arg(triple);
+    }
+
+    let status = cmd.arg("-o").arg(&obj_path).arg(&kernel_src).status();
+
+    let status = match status {
+        Ok(s) => s,
+        Err(e) => {
+            println!("cargo:warning=Mojo compilation failed to launch: {e}");
+            return;
+        }
+    };
+
+    if !status.success() {
+        println!(
+            "cargo:warning=Mojo AOT compilation failed (exit {}), falling back to Rust SIMD kernels",
+            status
+        );
+        return;
+    }
+
+    // Archive the object file into a static library that Cargo can link.
+    let lib_path = out_dir.join("libvortex_mojo_take.a");
+    let ar_status = Command::new("ar")
+        .args(["rcs"])
+        .arg(&lib_path)
+        .arg(&obj_path)
+        .status();
+
+    match ar_status {
+        Ok(s) if s.success() => {}
+        Ok(s) => {
+            println!("cargo:warning=ar failed (exit {s}), falling back to Rust SIMD kernels");
+            return;
+        }
+        Err(e) => {
+            println!("cargo:warning=ar not found: {e}, falling back to Rust SIMD kernels");
+            return;
+        }
+    }
+
+    // Tell Cargo to link the static library.
+    println!("cargo:rustc-link-search=native={}", out_dir.display());
+    println!("cargo:rustc-link-lib=static=vortex_mojo_take");
+
+    // Enable the cfg flag so Rust code can use the Mojo kernels.
+    println!("cargo:rustc-cfg=vortex_mojo");
+}
+
+/// Searches for the Mojo compiler binary. Checks `PATH` first, then the common
+/// pip-installed location (`~/.local/bin/mojo`).
+fn find_mojo() -> Option<PathBuf> {
+    // Check PATH first.
+    if Command::new("mojo")
+        .arg("--version")
+        .output()
+        .is_ok_and(|o| o.status.success())
+    {
+        return Some(PathBuf::from("mojo"));
+    }
+
+    // Pip installs mojo to ~/.local/bin on Linux.
+    if let Ok(home) = env::var("HOME") {
+        let pip_mojo = PathBuf::from(home).join(".local/bin/mojo");
+        if pip_mojo.exists()
+            && Command::new(&pip_mojo)
+                .arg("--version")
+                .output()
+                .is_ok_and(|o| o.status.success())
+        {
+            return Some(pip_mojo);
+        }
+    }
+
+    None
+}
diff --git a/vortex-array/kernels/take.mojo b/vortex-array/kernels/take.mojo
new file mode 100644
index 00000000000..3e20635964a
--- /dev/null
+++ b/vortex-array/kernels/take.mojo
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+# Mojo AOT-compiled SIMD take (gather) kernels for Vortex.
+#
+# Each exported function gathers values from `src` at positions given by `indices`
+# and writes them into `dst`. The caller (Rust) owns all three buffers — the Mojo
+# side performs zero allocation.
+#
+# Pointers are passed as `Int` (pointer-width integer) because Mojo 0.26's
+# `UnsafePointer` carries origin/mutability parameters that make it incompatible
+# with `@export`. Inside each function we reconstruct typed `UnsafePointer`s via
+# the `type_of` anchor pattern.
+#
+# SIMD width is hardcoded to 8 lanes for 4-byte types and 4 lanes for 8-byte
+# types (matching AVX2 register width). The compiler will use the best available
+# ISA (AVX-512, AVX2, NEON) for the gather instructions.
+
+from std.memory import UnsafePointer
+
+# SIMD lane counts matching 256-bit registers (AVX2 baseline).
+comptime W1: Int = 32  # 1-byte values: 32 lanes
+comptime W2: Int = 16  # 2-byte values: 16 lanes
+comptime W4: Int = 8   # 4-byte values: 8 lanes
+comptime W8: Int = 4   # 8-byte values: 4 lanes
+
+
+# ---------------------------------------------------------------------------
+# Generic gather implementation
+# ---------------------------------------------------------------------------
+
+@always_inline
+fn _take[VT: DType, IT: DType, W: Int](
+    src_addr: Int,
+    idx_addr: Int,
+    dst_addr: Int,
+    count: Int,
+):
+    """Gather `count` elements: dst[i] = src[indices[i]].
+
+    The inner loop is 4x unrolled to keep the CPU's gather pipeline fed with
+    independent loads (critical for throughput on Intel Skylake+ and AMD Zen3+).
+    """
+    var _v_anchor: Scalar[VT] = 0
+    var _i_anchor: Scalar[IT] = 0
+    comptime VP = type_of(UnsafePointer(to=_v_anchor))
+    comptime IP = type_of(UnsafePointer(to=_i_anchor))
+
+    var src = VP(unsafe_from_address=src_addr)
+    var idx = IP(unsafe_from_address=idx_addr)
+    var dst = VP(unsafe_from_address=dst_addr)
+
+    var i = 0
+
+    # 4x unrolled SIMD gather — keeps gather units saturated with independent
+    # loads for maximum instruction-level parallelism.
+    while i + W * 4 <= count:
+        var g0 = src.gather(idx.load[width=W](i))
+        var g1 = src.gather(idx.load[width=W](i + W))
+        var g2 = src.gather(idx.load[width=W](i + W * 2))
+        var g3 = src.gather(idx.load[width=W](i + W * 3))
+
+        dst.store[width=W](i, g0)
+        dst.store[width=W](i + W, g1)
+        dst.store[width=W](i + W * 2, g2)
+        dst.store[width=W](i + W * 3, g3)
+        i += W * 4
+
+    # Single-vector remainder.
+    while i + W <= count:
+        dst.store[width=W](i, src.gather(idx.load[width=W](i)))
+        i += W
+
+    # Scalar remainder.
+    while i < count:
+        dst[i] = src[Int(idx[i])]
+        i += 1
+
+
+# ---------------------------------------------------------------------------
+# 4-byte value types (i32 / u32 / f32)
+# ---------------------------------------------------------------------------
+
+@export("vortex_take_4byte_u8idx")
+fn take_4byte_u8idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint32, DType.uint8, W4](src, idx, dst, n)
+
+@export("vortex_take_4byte_u16idx")
+fn take_4byte_u16idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint32, DType.uint16, W4](src, idx, dst, n)
+
+@export("vortex_take_4byte_u32idx")
+fn take_4byte_u32idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint32, DType.uint32, W4](src, idx, dst, n)
+
+@export("vortex_take_4byte_u64idx")
+fn take_4byte_u64idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint32, DType.uint64, W4](src, idx, dst, n)
+
+
+# ---------------------------------------------------------------------------
+# 8-byte value types (i64 / u64 / f64)
+# ---------------------------------------------------------------------------
+
+@export("vortex_take_8byte_u8idx")
+fn take_8byte_u8idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint64, DType.uint8, W8](src, idx, dst, n)
+
+@export("vortex_take_8byte_u16idx")
+fn take_8byte_u16idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint64, DType.uint16, W8](src, idx, dst, n)
+
+@export("vortex_take_8byte_u32idx")
+fn take_8byte_u32idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint64, DType.uint32, W8](src, idx, dst, n)
+
+@export("vortex_take_8byte_u64idx")
+fn take_8byte_u64idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint64, DType.uint64, W8](src, idx, dst, n)
+
+
+# ---------------------------------------------------------------------------
+# 2-byte value types (i16 / u16 / f16)
+# ---------------------------------------------------------------------------
+
+@export("vortex_take_2byte_u8idx")
+fn take_2byte_u8idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint16, DType.uint8, W2](src, idx, dst, n)
+
+@export("vortex_take_2byte_u16idx")
+fn take_2byte_u16idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint16, DType.uint16, W2](src, idx, dst, n)
+
+@export("vortex_take_2byte_u32idx")
+fn take_2byte_u32idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint16, DType.uint32, W2](src, idx, dst, n)
+
+@export("vortex_take_2byte_u64idx")
+fn take_2byte_u64idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint16, DType.uint64, W2](src, idx, dst, n)
+
+
+# ---------------------------------------------------------------------------
+# 1-byte value types (i8 / u8)
+# ---------------------------------------------------------------------------
+
+@export("vortex_take_1byte_u8idx")
+fn take_1byte_u8idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint8, DType.uint8, W1](src, idx, dst, n)
+
+@export("vortex_take_1byte_u16idx")
+fn take_1byte_u16idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint8, DType.uint16, W1](src, idx, dst, n)
+
+@export("vortex_take_1byte_u32idx")
+fn take_1byte_u32idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint8, DType.uint32, W1](src, idx, dst, n)
+
+@export("vortex_take_1byte_u64idx")
+fn take_1byte_u64idx(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint8, DType.uint64, W1](src, idx, dst, n)
+
+
+# ---------------------------------------------------------------------------
+# Filter kernels (gather by usize indices from mask)
+#
+# These are used by the primitive filter path when the mask is sparse (<80%
+# selectivity). The Rust side converts the bitmap to a &[usize] index array
+# and passes it here. On x86_64 usize = u64, so these are gathers with
+# u64 element indices.
+# ---------------------------------------------------------------------------
+
+@export("vortex_filter_1byte")
+fn filter_1byte(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint8, DType.uint64, W1](src, idx, dst, n)
+
+@export("vortex_filter_2byte")
+fn filter_2byte(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint16, DType.uint64, W2](src, idx, dst, n)
+
+@export("vortex_filter_4byte")
+fn filter_4byte(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint32, DType.uint64, W4](src, idx, dst, n)
+
+@export("vortex_filter_8byte")
+fn filter_8byte(src: Int, idx: Int, dst: Int, n: Int):
+    _take[DType.uint64, DType.uint64, W8](src, idx, dst, n)
diff --git a/vortex-array/src/arrays/filter/execute/slice.rs b/vortex-array/src/arrays/filter/execute/slice.rs
index 1528d272b28..1b13922a7f2 100644
--- a/vortex-array/src/arrays/filter/execute/slice.rs
+++ b/vortex-array/src/arrays/filter/execute/slice.rs
@@ -37,9 +37,64 @@ pub(super) fn filter_slice_by_mask_values<T: Copy>(slice: &[T], mask: &MaskValue
 
 /// Filter a slice by a set of strictly increasing indices.
 fn filter_slice_by_indices<T: Copy>(slice: &[T], indices: &[usize]) -> Buffer<T> {
+    #[cfg(vortex_mojo)]
+    {
+        if let Some(buf) = mojo::filter_by_indices_mojo(slice, indices) {
+            return buf;
+        }
+    }
+
     Buffer::<T>::from_trusted_len_iter(indices.iter().map(|&idx| slice[idx]))
 }
 
+#[cfg(vortex_mojo)]
+mod mojo {
+    use std::mem::size_of;
+
+    use vortex_buffer::Buffer;
+    use vortex_buffer::BufferMut;
+
+    unsafe extern "C" {
+        fn vortex_filter_1byte(src: usize, idx: usize, dst: usize, n: usize);
+        fn vortex_filter_2byte(src: usize, idx: usize, dst: usize, n: usize);
+        fn vortex_filter_4byte(src: usize, idx: usize, dst: usize, n: usize);
+        fn vortex_filter_8byte(src: usize, idx: usize, dst: usize, n: usize);
+    }
+
+    /// SIMD gather for the filter-by-indices path. Returns `None` for unsupported
+    /// element sizes so the caller falls back to scalar.
+    pub(super) fn filter_by_indices_mojo<T: Copy>(
+        slice: &[T],
+        indices: &[usize],
+    ) -> Option<Buffer<T>> {
+        let kernel: unsafe extern "C" fn(usize, usize, usize, usize) = match size_of::<T>() {
+            1 => vortex_filter_1byte,
+            2 => vortex_filter_2byte,
+            4 => vortex_filter_4byte,
+            8 => vortex_filter_8byte,
+            _ => return None,
+        };
+
+        let len = indices.len();
+        let mut buffer = BufferMut::<T>::with_capacity(len);
+        let dst = buffer.spare_capacity_mut().as_mut_ptr().cast::<T>();
+
+        // SAFETY: The Mojo kernel reads `len` indices from `indices`, gathers from
+        // `slice`, and writes `len` elements to `dst`. All pointers are valid.
+        unsafe {
+            kernel(
+                slice.as_ptr() as usize,
+                indices.as_ptr() as usize,
+                dst as usize,
+                len,
+            );
+            buffer.set_len(len);
+        }
+
+        Some(buffer.freeze())
+    }
+}
+
 /// Filter a slice by a set of strictly increasing `(start, end)` ranges.
 fn filter_slice_by_slices<T: Copy>(slice: &[T], slices: &[(usize, usize)]) -> Buffer<T> {
     let output_len: usize = slices.iter().map(|(start, end)| end - start).sum();
diff --git a/vortex-array/src/arrays/primitive/compute/take/mod.rs b/vortex-array/src/arrays/primitive/compute/take/mod.rs
index aedda05b6b6..3d76f63332a 100644
--- a/vortex-array/src/arrays/primitive/compute/take/mod.rs
+++ b/vortex-array/src/arrays/primitive/compute/take/mod.rs
@@ -4,6 +4,9 @@
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 mod avx2;
 
+#[cfg(vortex_mojo)]
+mod mojo;
+
 use std::sync::LazyLock;
 
 use vortex_buffer::Buffer;
@@ -29,19 +32,22 @@ use crate::validity::Validity;
 // Kernel selection happens on the first call to `take` and uses a combination of compile-time
 // and runtime feature detection to infer the best kernel for the platform.
 static PRIMITIVE_TAKE_KERNEL: LazyLock<&'static dyn TakeImpl> = LazyLock::new(|| {
-    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-    {
-        if is_x86_feature_detected!("avx2") {
-            &avx2::TakeKernelAVX2
+    cfg_if::cfg_if! {
+        if #[cfg(vortex_mojo)] {
+            // Mojo AOT path: 4x-unrolled SIMD gather compiled for the target CPU.
+            // With --mcpu=skylake this generates vpgatherqd and matches hand-written
+            // AVX2 while also working on ARM (NEON) and other platforms.
+            &mojo::TakeKernelMojo
+        } else if #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] {
+            if is_x86_feature_detected!("avx2") {
+                &avx2::TakeKernelAVX2
+            } else {
+                &TakeKernelScalar
+            }
         } else {
             &TakeKernelScalar
         }
     }
-
-    #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
-    {
-        &TakeKernelScalar
-    }
 });
 
 trait TakeImpl: Send + Sync {
@@ -53,6 +59,7 @@ trait TakeImpl: Send + Sync {
     ) -> VortexResult<ArrayRef>;
 }
 
+#[allow(unused)]
 struct TakeKernelScalar;
 
 impl TakeImpl for TakeKernelScalar {
diff --git a/vortex-array/src/arrays/primitive/compute/take/mojo.rs b/vortex-array/src/arrays/primitive/compute/take/mojo.rs
new file mode 100644
index 00000000000..96407447bc9
--- /dev/null
+++ b/vortex-array/src/arrays/primitive/compute/take/mojo.rs
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! FFI bridge to AOT-compiled Mojo SIMD take kernels.
+//!
+//! The Mojo kernels are compiled during `build.rs` and statically linked. Each exported
+//! symbol operates on raw pointer-width integers (`usize`) — Rust allocates the output
+//! buffer, passes addresses as `usize`, and Mojo writes directly into the buffer.
+//!
+//! Value types are dispatched by byte width (1/2/4/8) since the gather operation is
+//! agnostic to signedness. Rust reinterprets the slice pointers accordingly.
+
+use std::mem::size_of;
+
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_error::VortexResult;
+
+use super::TakeImpl;
+use crate::ArrayRef;
+use crate::IntoArray;
+use crate::array::ArrayView;
+use crate::arrays::PrimitiveArray;
+use crate::arrays::primitive::vtable::Primitive;
+use crate::dtype::NativePType;
+use crate::dtype::PType;
+use crate::dtype::UnsignedPType;
+use crate::match_each_native_ptype;
+use crate::match_each_unsigned_integer_ptype;
+use crate::validity::Validity;
+
+// ---------------------------------------------------------------------------
+// Mojo extern declarations — pointers passed as usize (Mojo `Int`).
+// One symbol per (value_byte_width, index_type) pair.
+// ---------------------------------------------------------------------------
+
+unsafe extern "C" {
+    // 1-byte values
+    fn vortex_take_1byte_u8idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_1byte_u16idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_1byte_u32idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_1byte_u64idx(src: usize, idx: usize, dst: usize, len: usize);
+
+    // 2-byte values
+    fn vortex_take_2byte_u8idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_2byte_u16idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_2byte_u32idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_2byte_u64idx(src: usize, idx: usize, dst: usize, len: usize);
+
+    // 4-byte values
+    fn vortex_take_4byte_u8idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_4byte_u16idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_4byte_u32idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_4byte_u64idx(src: usize, idx: usize, dst: usize, len: usize);
+
+    // 8-byte values
+    fn vortex_take_8byte_u8idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_8byte_u16idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_8byte_u32idx(src: usize, idx: usize, dst: usize, len: usize);
+    fn vortex_take_8byte_u64idx(src: usize, idx: usize, dst: usize, len: usize);
+}
+
+pub(super) struct TakeKernelMojo;
+
+impl TakeImpl for TakeKernelMojo {
+    fn take(
+        &self,
+        array: ArrayView<'_, Primitive>,
+        indices: ArrayView<'_, Primitive>,
+        validity: Validity,
+    ) -> VortexResult<ArrayRef> {
+        match_each_native_ptype!(array.ptype(), |V| {
+            match_each_unsigned_integer_ptype!(indices.ptype(), |I| {
+                let buffer = take_mojo::<V, I>(array.as_slice(), indices.as_slice());
+                Ok(PrimitiveArray::new(buffer, validity).into_array())
+            })
+        })
+    }
+}
+
+/// Dispatch to the appropriate Mojo kernel based on value byte width and index type.
+fn take_mojo<V: NativePType, I: UnsignedPType>(values: &[V], indices: &[I]) -> Buffer<V> {
+    let len = indices.len();
+    let mut buffer = BufferMut::<V>::with_capacity(len);
+
+    let dst = buffer.spare_capacity_mut().as_mut_ptr().cast::<V>();
+    let src = values.as_ptr();
+    let idx = indices.as_ptr();
+
+    // SAFETY: All three pointers are valid for their respective lengths. The Mojo kernel
+    // writes exactly `len` elements to `dst`, which has capacity for `len` elements.
+    // We dispatch by value byte-width since the gather is signedness-agnostic.
+    unsafe {
+        match (size_of::<V>(), I::PTYPE) {
+            (1, PType::U8) => {
+                vortex_take_1byte_u8idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (1, PType::U16) => {
+                vortex_take_1byte_u16idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (1, PType::U32) => {
+                vortex_take_1byte_u32idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (1, PType::U64) => {
+                vortex_take_1byte_u64idx(src as usize, idx as usize, dst as usize, len)
+            }
+
+            (2, PType::U8) => {
+                vortex_take_2byte_u8idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (2, PType::U16) => {
+                vortex_take_2byte_u16idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (2, PType::U32) => {
+                vortex_take_2byte_u32idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (2, PType::U64) => {
+                vortex_take_2byte_u64idx(src as usize, idx as usize, dst as usize, len)
+            }
+
+            (4, PType::U8) => {
+                vortex_take_4byte_u8idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (4, PType::U16) => {
+                vortex_take_4byte_u16idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (4, PType::U32) => {
+                vortex_take_4byte_u32idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (4, PType::U64) => {
+                vortex_take_4byte_u64idx(src as usize, idx as usize, dst as usize, len)
+            }
+
+            (8, PType::U8) => {
+                vortex_take_8byte_u8idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (8, PType::U16) => {
+                vortex_take_8byte_u16idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (8, PType::U32) => {
+                vortex_take_8byte_u32idx(src as usize, idx as usize, dst as usize, len)
+            }
+            (8, PType::U64) => {
+                vortex_take_8byte_u64idx(src as usize, idx as usize, dst as usize, len)
+            }
+
+            _ => unreachable!("unsupported value size / index type combination"),
+        }
+
+        buffer.set_len(len);
+    }
+
+    buffer.freeze()
+}
diff --git a/vortex-array/src/arrays/primitive/mod.rs b/vortex-array/src/arrays/primitive/mod.rs
index 4d62f1da517..e75e6cf47ca 100644
--- a/vortex-array/src/arrays/primitive/mod.rs
+++ b/vortex-array/src/arrays/primitive/mod.rs
@@ -9,7 +9,7 @@ pub use array::chunk_range;
 pub use array::patch_chunk;
 pub use vtable::PrimitiveArray;
 
-pub(crate) mod compute;
+mod compute;
 
 mod vtable;
 pub use compute::rules::PrimitiveMaskedValidityRule;