diff --git a/vortex-tensor/Cargo.toml b/vortex-tensor/Cargo.toml index b14e82287bb..2f92ce5a107 100644 --- a/vortex-tensor/Cargo.toml +++ b/vortex-tensor/Cargo.toml @@ -37,8 +37,3 @@ rand = { workspace = true } rand_distr = { workspace = true } rstest = { workspace = true } vortex-btrblocks = { path = "../vortex-btrblocks" } - -[[bench]] -name = "similarity_search" -harness = false -test = false diff --git a/vortex-tensor/benches/similarity_search.rs b/vortex-tensor/benches/similarity_search.rs deleted file mode 100644 index 1112eda11b2..00000000000 --- a/vortex-tensor/benches/similarity_search.rs +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! End-to-end similarity-search execution benchmark. -//! -//! For each of three compression strategies (uncompressed, default BtrBlocks, TurboQuant), this -//! bench: -//! -//! 1. Generates a deterministic random `Vector` batch. -//! 2. Applies the compression strategy *outside* the timed region. -//! 3. Builds the lazy -//! `Binary(Gt, [CosineSimilarity(data, query), threshold])` -//! tree *outside* the timed region. -//! 4. Times *only* `tree.execute::(&mut ctx)`. -//! -//! Run with: `cargo bench -p vortex-tensor --bench similarity_search` - -use divan::Bencher; -use mimalloc::MiMalloc; -use vortex_array::VortexSessionExecute; -use vortex_array::arrays::BoolArray; -use vortex_error::VortexExpect; - -#[path = "similarity_search_common/mod.rs"] -mod common; - -use common::Variant; -use common::build_similarity_search_tree; -use common::extract_row_as_query; -use common::generate_random_vectors; - -#[global_allocator] -static GLOBAL: MiMalloc = MiMalloc; - -/// Number of vectors in the benchmark dataset. -const NUM_ROWS: usize = 100_000; - -/// Dimensionality of each vector. Must be `>= vortex_tensor::encodings::turboquant::MIN_DIMENSION` -/// (128) for the TurboQuant variant to work. 
-const DIM: u32 = 768; - -/// Deterministic PRNG seed for the generated dataset. -const SEED: u64 = 0xC0FFEE; - -/// Cosine similarity threshold for the "greater than" filter. Random f32 vectors from N(0, 1) at -/// this dimension have near-zero pairwise similarity, so picking a row of the dataset as the -/// query guarantees at least that row matches. -const THRESHOLD: f32 = 0.8; - -fn main() { - divan::main(); -} - -/// Runs one end-to-end execution of the similarity-search tree for the given variant. All dataset -/// generation and tree construction happens in the bench setup closure so only the execution of -/// the lazy tree is timed. -fn bench_variant(bencher: Bencher<'_, '_>, variant: Variant) { - bencher - .with_inputs(|| { - let mut ctx = common::SESSION.create_execution_ctx(); - - // Use row 0 of the uncompressed data as the query so we always have at least one - // match. Keeping the query extraction separate from the compressed-data build keeps - // the query identical across all three variants. - let raw = generate_random_vectors(NUM_ROWS, DIM, SEED); - let query = extract_row_as_query(&raw, 0, DIM); - let data = match variant { - Variant::Uncompressed => raw, - Variant::DefaultCompression => { - common::compress_default(raw).vortex_expect("default compression succeeds") - } - Variant::TurboQuant => common::compress_turboquant(raw, &mut ctx) - .vortex_expect("turboquant compression succeeds"), - }; - - // println!( - // "\n\n{}: {}\n\n", - // variant, - // data.display_tree_encodings_only() - // ); - - let tree = build_similarity_search_tree(data, &query, THRESHOLD) - .vortex_expect("tree construction succeeds"); - - (tree, ctx) - }) - .bench_values(|(tree, mut ctx)| { - // Hot path: only the .execute() call is timed. The result is a BoolArray of length - // NUM_ROWS with true at positions where cosine_similarity > THRESHOLD. 
- tree.execute::(&mut ctx) - .vortex_expect("similarity search tree executes to a BoolArray") - }); -} - -#[divan::bench] -fn execute_uncompressed(bencher: Bencher) { - bench_variant(bencher, Variant::Uncompressed); -} - -#[divan::bench] -fn execute_default_compression(bencher: Bencher) { - bench_variant(bencher, Variant::DefaultCompression); -} - -#[divan::bench] -fn execute_turboquant(bencher: Bencher) { - bench_variant(bencher, Variant::TurboQuant); -} diff --git a/vortex-tensor/benches/similarity_search_common/mod.rs b/vortex-tensor/benches/similarity_search_common/mod.rs deleted file mode 100644 index c22cb5a9f08..00000000000 --- a/vortex-tensor/benches/similarity_search_common/mod.rs +++ /dev/null @@ -1,256 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Shared helpers for the similarity-search benchmark and example. -//! -//! This module is included from both `vortex-tensor/benches/similarity_search.rs` and -//! `vortex-tensor/examples/similarity_search.rs` via an explicit `#[path = ...]` so both targets -//! use the exact same array-tree builder. -//! -//! The three main entry points are: -//! -//! - [`generate_random_vectors`] to build a deterministic random [`Vector`] extension array. -//! - [`build_variant`] to take a raw vector array and apply the requested compression strategy -//! (uncompressed, default BtrBlocks, or TurboQuant). -//! - [`build_similarity_search_tree`] to wire a cosine-similarity + threshold expression on top of -//! a prepared data array and a single-row query vector. -//! -//! 
[`Vector`]: vortex_tensor::vector::Vector - -#![allow(dead_code)] - -use std::fmt; -use std::sync::LazyLock; - -use rand::SeedableRng; -use rand::rngs::StdRng; -use rand_distr::Distribution; -use rand_distr::Normal; -use vortex_array::ArrayRef; -use vortex_array::ExecutionCtx; -use vortex_array::IntoArray; -use vortex_array::VortexSessionExecute; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::Extension; -use vortex_array::arrays::ExtensionArray; -use vortex_array::arrays::FixedSizeListArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::extension::ExtensionArrayExt; -use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt; -use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; -use vortex_array::builtins::ArrayBuiltins; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::dtype::PType; -use vortex_array::dtype::extension::ExtDType; -use vortex_array::extension::EmptyMetadata; -use vortex_array::scalar::Scalar; -use vortex_array::scalar_fn::fns::operators::Operator; -use vortex_array::session::ArraySession; -use vortex_array::validity::Validity; -use vortex_btrblocks::BtrBlocksCompressor; -use vortex_buffer::BufferMut; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; -use vortex_error::vortex_panic; -use vortex_session::VortexSession; -use vortex_tensor::encodings::turboquant::TurboQuantConfig; -use vortex_tensor::encodings::turboquant::turboquant_encode_unchecked; -use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity; -use vortex_tensor::scalar_fns::l2_denorm::L2Denorm; -use vortex_tensor::scalar_fns::l2_denorm::normalize_as_l2_denorm; -use vortex_tensor::vector::Vector; - -/// A shared [`VortexSession`] pre-loaded with the builtin [`ArraySession`] so both bench and -/// example can create execution contexts cheaply. 
-pub static SESSION: LazyLock = - LazyLock::new(|| VortexSession::empty().with::()); - -/// The three compression strategies the benchmark and example exercise. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Variant { - /// Raw `Vector` with no compression applied. - Uncompressed, - /// `BtrBlocksCompressor::default()` walks into the extension array and compresses the - /// underlying FSL storage child with the default scheme set (no TurboQuant). - DefaultCompression, - /// TurboQuant: normalize, quantize to `FSL(Dict)`, wrap in SORF + `L2Denorm`. - TurboQuant, -} - -impl fmt::Display for Variant { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Uncompressed => f.write_str("Uncompressed"), - Self::DefaultCompression => f.write_str("DefaultCompression"), - Self::TurboQuant => f.write_str("TurboQuant"), - } - } -} - -/// Generate `num_rows` random f32 vectors of dimension `dim`, wrapped in a [`Vector`] extension -/// array. The values are drawn from a standard normal distribution seeded by `seed` so results -/// are reproducible across runs. -/// -/// [`Vector`]: vortex_tensor::vector::Vector -pub fn generate_random_vectors(num_rows: usize, dim: u32, seed: u64) -> ArrayRef { - let mut rng = StdRng::seed_from_u64(seed); - // `Normal::new(0, 1)` is infallible for these parameters. `rand_distr::NormalError` does - // not implement `Into`, so we cannot use `vortex_expect` here; fall back to - // `vortex_panic!` on the (impossible) error path instead. 
- let normal = - Normal::new(0.0f32, 1.0).unwrap_or_else(|_| vortex_panic!("Normal(0, 1) is well-defined")); - - let dim_usize = dim as usize; - let mut buf = BufferMut::::with_capacity(num_rows * dim_usize); - for _ in 0..(num_rows * dim_usize) { - buf.push(normal.sample(&mut rng)); - } - - let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable); - let fsl = - FixedSizeListArray::try_new(elements.into_array(), dim, Validity::NonNullable, num_rows) - .vortex_expect("FSL with valid shape and matching children length"); - - let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone()) - .vortex_expect("Vector extension dtype is valid for an f32 FSL") - .erased(); - ExtensionArray::new(ext_dtype, fsl.into_array()).into_array() -} - -/// Pull the `row`-th vector out of a `Vector` extension array as a plain `Vec`. -/// -/// Used to extract a single query vector from a batch of generated data. The input must already -/// be fully materialized (no lazy scalar-fn wrappers); pass a raw array from -/// [`generate_random_vectors`], not a compressed variant. -pub fn extract_row_as_query(vectors: &ArrayRef, row: usize, dim: u32) -> Vec { - let ext = vectors - .as_opt::() - .vortex_expect("data must be a Vector extension array"); - - let mut ctx = SESSION.create_execution_ctx(); - let fsl: FixedSizeListArray = ext - .storage_array() - .clone() - .execute(&mut ctx) - .vortex_expect("storage array executes to an FSL"); - let elements: PrimitiveArray = fsl - .elements() - .clone() - .execute(&mut ctx) - .vortex_expect("FSL elements execute to a PrimitiveArray"); - - let slice = elements.as_slice::(); - let dim_usize = dim as usize; - let start = row * dim_usize; - slice[start..start + dim_usize].to_vec() -} - -/// Build a `Vector` extension array whose storage is a [`ConstantArray`] broadcasting a -/// single query vector across `num_rows` rows. 
This is how we hand a single query vector to -/// `CosineSimilarity` on the `rhs` side -- `ScalarFnArray` requires both children to have the -/// same length, so we broadcast the query instead of hand-rolling a 1-row input. -fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult { - let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable); - - let children: Vec = query - .iter() - .map(|&v| Scalar::primitive(v, Nullability::NonNullable)) - .collect(); - let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable); - - let storage = ConstantArray::new(storage_scalar, num_rows).into_array(); - - let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage.dtype().clone())?.erased(); - Ok(ExtensionArray::new(ext_dtype, storage).into_array()) -} - -/// Compresses a raw `Vector` array with the default BtrBlocks pipeline. -/// -/// [`BtrBlocksCompressor`] walks into the extension array and recursively compresses the -/// underlying FSL storage child. TurboQuant is *not* exercised by this path -- it is not -/// registered in the default scheme set -- so this measures "generic" lossless compression -/// applied to float vectors. -pub fn compress_default(data: ArrayRef) -> VortexResult { - BtrBlocksCompressor::default().compress(&data) -} - -/// Compresses a raw `Vector` array with the TurboQuant pipeline by hand, producing the -/// same tree shape that -/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] would: -/// -/// ```text -/// L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms) -/// ``` -/// -/// Calling the encode helpers directly (instead of going through -/// `BtrBlocksCompressorBuilder::with_turboquant()`) lets this example avoid depending on the -/// `unstable_encodings` feature flag. -/// -/// See `vortex-tensor/src/encodings/turboquant/tests/mod.rs::normalize_and_encode` for the same -/// canonical recipe. 
-pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { - let l2_denorm = normalize_as_l2_denorm(data, ctx)?; - let normalized = l2_denorm.child_at(0).clone(); - let norms = l2_denorm.child_at(1).clone(); - let num_rows = l2_denorm.len(); - - let normalized_ext = normalized - .as_opt::() - .vortex_expect("normalized child should be an Extension array"); - - let config = TurboQuantConfig::default(); - // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is the - // invariant `turboquant_encode_unchecked` expects. - let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; - - Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) -} - -/// Dispatch helper that builds the data array for the requested [`Variant`], starting from a -/// single random-vector generation. Always returns an `ArrayRef` whose logical dtype is -/// `Vector`. -pub fn build_variant( - variant: Variant, - num_rows: usize, - dim: u32, - seed: u64, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let raw = generate_random_vectors(num_rows, dim, seed); - match variant { - Variant::Uncompressed => Ok(raw), - Variant::DefaultCompression => compress_default(raw), - Variant::TurboQuant => compress_turboquant(raw, ctx), - } -} - -/// Build the lazy similarity-search array tree for a prepared data array and a single query -/// vector. The returned tree is a boolean array of length `data.len()` where position `i` is -/// `true` iff `cosine_similarity(data[i], query) > threshold`. -/// -/// The tree shape is: -/// -/// ```text -/// Binary(Gt, [ -/// CosineSimilarity([data, ConstantArray(query_vec, n)]), -/// ConstantArray(threshold, n), -/// ]) -/// ``` -/// -/// This function does no execution; it is safe to call inside a benchmark setup closure. 
-pub fn build_similarity_search_tree( - data: ArrayRef, - query: &[f32], - threshold: f32, -) -> VortexResult { - let num_rows = data.len(); - let query_vec = build_constant_query_vector(query, num_rows)?; - - let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); - - let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); - let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); - - cosine.binary(threshold_array, Operator::Gt) -} diff --git a/vortex-tensor/public-api.lock b/vortex-tensor/public-api.lock index bec8df1cb29..4f6c2dddbfb 100644 --- a/vortex-tensor/public-api.lock +++ b/vortex-tensor/public-api.lock @@ -550,4 +550,12 @@ impl core::marker::Copy for vortex_tensor::vector::VectorMatcherMetadata impl core::marker::StructuralPartialEq for vortex_tensor::vector::VectorMatcherMetadata +pub mod vortex_tensor::vector_search + +pub fn vortex_tensor::vector_search::build_constant_query_vector>(query: &[T], num_rows: usize) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::build_similarity_search_tree>(data: vortex_array::array::erased::ArrayRef, query: &[T], threshold: T) -> vortex_error::VortexResult + +pub fn vortex_tensor::vector_search::compress_turboquant(data: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + pub fn vortex_tensor::initialize(session: &vortex_session::VortexSession) diff --git a/vortex-tensor/src/lib.rs b/vortex-tensor/src/lib.rs index 3d3563aa8e4..b3cf6c21695 100644 --- a/vortex-tensor/src/lib.rs +++ b/vortex-tensor/src/lib.rs @@ -25,6 +25,8 @@ pub mod vector; pub mod encodings; +pub mod vector_search; + mod utils; /// Initialize the Vortex tensor library with a Vortex session. 
diff --git a/vortex-tensor/src/scalar_fns/inner_product.rs b/vortex-tensor/src/scalar_fns/inner_product.rs index c883cf00982..0d6b2bf76aa 100644 --- a/vortex-tensor/src/scalar_fns/inner_product.rs +++ b/vortex-tensor/src/scalar_fns/inner_product.rs @@ -527,18 +527,9 @@ impl InnerProduct { let values: &[f32] = values_prim.as_slice::(); debug_assert_eq!(codes.len(), len * padded_dim); - // Direct codebook lookup in the hot loop. See the function doc comment for why this - // beats an explicit product table here. - let mut out = BufferMut::::with_capacity(len); - for row in 0..len { - let row_codes = &codes[row * padded_dim..(row + 1) * padded_dim]; - let mut acc = 0.0f32; - for j in 0..padded_dim { - acc += q[j] * values[row_codes[j] as usize]; - } - // SAFETY: we reserved `len` slots above and push exactly once per row. - unsafe { out.push_unchecked(acc) }; - } + // The hot loop is extracted into [`execute_dict_constant_inner_product`] with + // chunked iteration so the compiler can vectorize the inner gather-accumulate. + let out = execute_dict_constant_inner_product(q, values, codes, len, padded_dim); // SAFETY: the buffer length equals `len`, which matches the validity length. let result = unsafe { PrimitiveArray::new_unchecked(out.freeze(), validity) }.into_array(); @@ -556,6 +547,49 @@ fn inner_product_row(a: &[T], b: &[T]) -> T { .fold(T::zero(), |acc, v| acc + v) } +/// Compute inner products between a constant query vector and dictionary-encoded rows. +/// +/// For each row, computes `sum(q[j] * values[codes[row * dim + j]])` using the codebook +/// `values` directly instead of decoding the dictionary into dense vectors. +/// +/// The inner loop uses eight independent accumulators so the CPU can pipeline FP additions +/// instead of waiting for each `fadd` to retire before starting the next. 
+fn execute_dict_constant_inner_product( + q: &[f32], + values: &[f32], + codes: &[u8], + num_rows: usize, + dim: usize, +) -> BufferMut { + let mut out = BufferMut::::with_capacity(num_rows); + + const PARTIAL_SUMS: usize = 8; + + for row_codes in codes.chunks_exact(dim) { + let mut acc = [0.0f32; PARTIAL_SUMS]; + + let code_chunks = row_codes.chunks_exact(PARTIAL_SUMS); + let q_chunks = q.chunks_exact(PARTIAL_SUMS); + let code_rem = code_chunks.remainder(); + let q_rem = q_chunks.remainder(); + + for (cc, qd) in code_chunks.zip(q_chunks) { + for i in 0..PARTIAL_SUMS { + acc[i] = qd[i].mul_add(values[cc[i] as usize], acc[i]); + } + } + + for (&code, &q_val) in code_rem.iter().zip(q_rem.iter()) { + acc[0] = q_val.mul_add(values[code as usize], acc[0]); + } + + // SAFETY: we reserved `num_rows` slots and push exactly once per row. + unsafe { out.push_unchecked(acc.iter().sum::()) }; + } + + out +} + #[cfg(test)] mod tests { use std::sync::LazyLock; diff --git a/vortex-tensor/src/vector_search.rs b/vortex-tensor/src/vector_search.rs new file mode 100644 index 00000000000..81a379683db --- /dev/null +++ b/vortex-tensor/src/vector_search.rs @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Reusable helpers for building brute-force vector similarity search expressions over +//! [`Vector`] extension arrays. +//! +//! This module exposes three small building blocks that together make it straightforward to +//! stand up a cosine-similarity-plus-threshold scan on top of a prepared data array: +//! +//! - [`compress_turboquant`] applies the canonical TurboQuant encoding pipeline +//! (`L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)`) to a raw +//! `Vector` array without requiring the caller to plumb the +//! `unstable_encodings` feature flag on the `vortex` facade. +//! - [`build_constant_query_vector`] wraps a single query vector into a +//! 
[`Vector`] extension array whose storage is a [`ConstantArray`] broadcast +//! across `num_rows` rows. This is the shape expected by +//! [`CosineSimilarity::try_new_array`] for the RHS of a database-vs-query scan. +//! - [`build_similarity_search_tree`] wires everything together into a lazy +//! `Binary(Gt, [CosineSimilarity(data, query), threshold])` expression. +//! +//! Executing the tree from [`build_similarity_search_tree`] into a +//! [`BoolArray`](vortex_array::arrays::BoolArray) yields one boolean per row indicating whether +//! that row's cosine similarity to the query exceeds `threshold`. +//! +//! # Example +//! +//! ```ignore +//! use vortex_array::{ArrayRef, VortexSessionExecute}; +//! use vortex_array::arrays::BoolArray; +//! use vortex_session::VortexSession; +//! use vortex_tensor::vector_search::{build_similarity_search_tree, compress_turboquant}; +//! +//! fn run(session: &VortexSession, data: ArrayRef, query: &[f32]) -> anyhow::Result<()> { +//! let mut ctx = session.create_execution_ctx(); +//! let data = compress_turboquant(data, &mut ctx)?; +//! let tree = build_similarity_search_tree(data, query, 0.8)?; +//! let _matches: BoolArray = tree.execute(&mut ctx)?; +//! Ok(()) +//! } +//! ``` +//! +//! [`Vector`]: crate::vector::Vector +//! 
[`CosineSimilarity::try_new_array`]: crate::scalar_fns::cosine_similarity::CosineSimilarity::try_new_array + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::Extension; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::extension::EmptyMetadata; +use vortex_array::scalar::PValue; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::encodings::turboquant::TurboQuantConfig; +use crate::encodings::turboquant::turboquant_encode_unchecked; +use crate::scalar_fns::cosine_similarity::CosineSimilarity; +use crate::scalar_fns::l2_denorm::L2Denorm; +use crate::scalar_fns::l2_denorm::normalize_as_l2_denorm; +use crate::vector::Vector; + +/// Apply the canonical TurboQuant encoding pipeline to a `Vector` array. +/// +/// The returned array has the shape +/// `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` — exactly what +/// [`crate::encodings::turboquant::TurboQuantScheme`] produces when invoked through +/// `BtrBlocksCompressorBuilder::with_turboquant()`, but without requiring callers to enable +/// the `unstable_encodings` feature on the `vortex` facade. +/// +/// The input `data` must be a [`Vector`] extension array whose element type is `f32` and whose +/// dimensionality is at least +/// [`turboquant::MIN_DIMENSION`](crate::encodings::turboquant::MIN_DIMENSION). The TurboQuant +/// configuration used is [`TurboQuantConfig::default()`] (8-bit codes, 3 SORF rounds, seed 42). 
+/// +/// # Errors +/// +/// Returns an error if `data` is not a [`Vector`] extension array, if normalization fails, or +/// if the underlying TurboQuant encoder rejects the input shape. +pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let l2_denorm = normalize_as_l2_denorm(data, ctx)?; + let normalized = l2_denorm.child_at(0).clone(); + let norms = l2_denorm.child_at(1).clone(); + let num_rows = l2_denorm.len(); + + let Some(normalized_ext) = normalized.as_opt::() else { + vortex_bail!("normalize_as_l2_denorm must produce an Extension array child"); + }; + + let config = TurboQuantConfig::default(); + // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is + // the invariant `turboquant_encode_unchecked` expects. + let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?; + + Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array()) +} + +/// Build a [`Vector`] extension array whose storage is a [`ConstantArray`] broadcasting a single +/// query vector across `num_rows` rows. +/// +/// The element type is inferred from `T` (e.g. `f32` or `f64`). This is the shape expected for +/// the RHS of a database-vs-query [`CosineSimilarity`] scan: the `ScalarFnArray` contract +/// requires both children to have the same length, so rather than hand-rolling a 1-row input we +/// broadcast the query across the whole database. +/// +/// # Errors +/// +/// Returns an error if the [`Vector`] extension dtype rejects the constructed storage dtype. 
+pub fn build_constant_query_vector>( + query: &[T], + num_rows: usize, +) -> VortexResult { + let element_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable); + + let children: Vec = query + .iter() + .map(|&v| Scalar::primitive(v, Nullability::NonNullable)) + .collect(); + let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable); + + let storage = ConstantArray::new(storage_scalar, num_rows).into_array(); + + let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, storage).into_array()) +} + +/// Build the lazy similarity-search expression tree for a prepared database array and a +/// single query vector. +/// +/// The returned array is a lazy boolean expression of length `data.len()` whose position `i` +/// is `true` iff `cosine_similarity(data[i], query) > threshold`. Executing it into a +/// [`BoolArray`](vortex_array::arrays::BoolArray) runs the full scan. +/// +/// The tree shape is: +/// +/// ```text +/// Binary(Gt, [ +/// CosineSimilarity([data, ConstantArray(query_vec, n)]), +/// ConstantArray(threshold, n), +/// ]) +/// ``` +/// +/// The element type is inferred from `T` and must match the element type of `data`'s +/// [`Vector`] extension dtype. +/// +/// This function performs no execution; it is safe to call inside a benchmark setup closure. +/// +/// # Errors +/// +/// Returns an error if `query` has a length incompatible with `data`'s vector dimension, or +/// if any of the intermediate array constructors fails. 
+pub fn build_similarity_search_tree>( + data: ArrayRef, + query: &[T], + threshold: T, +) -> VortexResult { + let num_rows = data.len(); + let query_vec = build_constant_query_vector(query, num_rows)?; + + let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array(); + + let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable); + let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array(); + + cosine.binary(threshold_array, Operator::Gt) +} + +#[cfg(test)] +mod tests { + use vortex_array::ArrayRef; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::Extension; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::FixedSizeListArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::bool::BoolArrayExt; + use vortex_array::dtype::extension::ExtDType; + use vortex_array::extension::EmptyMetadata; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_buffer::BufferMut; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::build_constant_query_vector; + use super::build_similarity_search_tree; + use super::compress_turboquant; + use crate::vector::Vector; + + /// Build a `Vector` extension array from a flat f32 slice. Each contiguous + /// group of `DIM` values becomes one row. 
+ fn vector_array(dim: u32, values: &[f32]) -> VortexResult { + let dim_usize = dim as usize; + assert_eq!(values.len() % dim_usize, 0); + let num_rows = values.len() / dim_usize; + + let mut buf = BufferMut::::with_capacity(values.len()); + for &v in values { + buf.push(v); + } + let elements = PrimitiveArray::new::(buf.freeze(), Validity::NonNullable); + let fsl = FixedSizeListArray::try_new( + elements.into_array(), + dim, + Validity::NonNullable, + num_rows, + )?; + + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) + } + + fn test_session() -> VortexSession { + VortexSession::empty().with::() + } + + #[test] + fn constant_query_vector_has_vector_extension_dtype() -> VortexResult<()> { + let query = vec![1.0f32, 0.0, 0.0, 0.0]; + let rhs = build_constant_query_vector(&query, 5)?; + + assert_eq!(rhs.len(), 5); + assert!(rhs.as_opt::().is_some()); + Ok(()) + } + + #[test] + fn similarity_search_tree_executes_to_bool_array() -> VortexResult<()> { + // 4 rows of 3-dim vectors; the first and last match the query [1, 0, 0]. + let data = vector_array( + 3, + &[ + 1.0, 0.0, 0.0, // + 0.0, 1.0, 0.0, // + 0.0, 0.0, 1.0, // + 1.0, 0.0, 0.0, // + ], + )?; + let query = [1.0f32, 0.0, 0.0]; + + let tree = build_similarity_search_tree(data, &query, 0.5)?; + let mut ctx = test_session().create_execution_ctx(); + let result: BoolArray = tree.execute(&mut ctx)?; + + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), 4); + assert!(bits.value(0)); + assert!(!bits.value(1)); + assert!(!bits.value(2)); + assert!(bits.value(3)); + Ok(()) + } + + #[test] + fn turboquant_roundtrip_preserves_ranking() -> VortexResult<()> { + // Build 6 rows of 128-dim vectors where row 0 is highly correlated with the query. + // TurboQuant should preserve the "row 0 is the best match" ordering. 
+ const DIM: u32 = 128; + const NUM_ROWS: usize = 6; + + let mut values = Vec::::with_capacity(NUM_ROWS * DIM as usize); + let query: Vec = (0..DIM as usize) + .map(|i| ((i as f32) * 0.017).sin()) + .collect(); + + // Row 0: identical to query (cosine=1.0) + values.extend_from_slice(&query); + // Row 1: query + noise + for (i, q) in query.iter().enumerate() { + values.push(q + 0.05 * ((i as f32) * 0.03).cos()); + } + // Rows 2..6: unrelated patterns + for row in 2..NUM_ROWS { + for i in 0..DIM as usize { + values.push(((row as f32 * 1.3 + i as f32) * 0.07).sin()); + } + } + + let data = vector_array(DIM, &values)?; + let mut ctx = test_session().create_execution_ctx(); + let compressed = compress_turboquant(data, &mut ctx)?; + assert_eq!(compressed.len(), NUM_ROWS); + + // Build a tree with a low threshold so row 0 (cosine=1.0 exact) matches. + let tree = build_similarity_search_tree(compressed, &query, 0.95)?; + let result: BoolArray = tree.execute(&mut ctx)?; + let bits = result.to_bit_buffer(); + assert_eq!(bits.len(), NUM_ROWS); + assert!( + bits.value(0), + "row 0 (identical to query) must match at threshold 0.95 even after TurboQuant" + ); + Ok(()) + } +} diff --git a/vortex/benches/single_encoding_throughput.rs b/vortex/benches/single_encoding_throughput.rs index e82f5fdd87a..4cedf23c07d 100644 --- a/vortex/benches/single_encoding_throughput.rs +++ b/vortex/benches/single_encoding_throughput.rs @@ -430,6 +430,7 @@ fn bench_zstd_decompress_string(bencher: Bencher) { .bench_refs(|a| a.to_canonical()); } +// TODO(connor): Remove this. // TurboQuant vector quantization benchmarks. #[cfg(feature = "unstable_encodings")] mod turboquant_benches {