diff --git a/.cargo/config.toml b/.cargo/config.toml
index 466422a80b1..e0f96fb234e 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -18,6 +18,10 @@ timeout = 3600
 # Environment variables for PyO3. Ensures reproducible builds and avoids spurious recompilations.
 [env]
 PYO3_ENVIRONMENT_SIGNATURE = { value = "cpython-3.11-64bit", force = true }
+# Default benchmark variant. A plain `cargo bench` runs every benchmark once on
+# the host; the CodSpeed workflow overrides this per leg to gate and name
+# feature-set-specific runs. Not forced, so a workflow-provided value wins.
+BENCH_VARIANT = "local"
 
 [cache.global-clean]
 # Anything older than this duration will be deleted in the source cache.
diff --git a/.github/actions/system-info/action.yml b/.github/actions/system-info/action.yml
index 7ca1d99b7ea..c08abf1e851 100644
--- a/.github/actions/system-info/action.yml
+++ b/.github/actions/system-info/action.yml
@@ -10,10 +10,13 @@ runs:
         echo "=== CPU Model ==="
         lscpu
         echo "=== /proc/cpuinfo summary ==="
-        grep -m1 "model name" /proc/cpuinfo
+        # x86 exposes "model name"; aarch64 has no such line (the model is shown
+        # by lscpu above), so a missing match must not fail the step.
+        grep -m1 "model name" /proc/cpuinfo || true
         nproc
         echo "=== CPU Flags ==="
-        grep -m1 "flags" /proc/cpuinfo
+        # x86 exposes "flags"; aarch64 exposes "Features".
+        grep -m1 -E "^(flags|Features)" /proc/cpuinfo || true
 
         echo "=== Memory ==="
         free -h
diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index 50a5cbaa75a..6815e056a96 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -58,6 +58,10 @@ jobs:
       - name: Build benchmarks
         env:
           RUSTFLAGS: "-C target-feature=+avx2"
+          # Benchmarks gated on a CPU feature set (e.g. bit_transpose) run their
+          # `simulation`-tagged variants here, in simulation mode. Walltime
+          # variants run in the dedicated bench-codspeed-bittranspose job.
+          BENCH_VARIANT: simulation
         run: cargo codspeed build ${{ matrix.features }} $(printf -- '-p %s ' ${{ matrix.packages }}) --profile bench
       - name: Run benchmarks
         uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
@@ -66,6 +70,54 @@ jobs:
           token: ${{ secrets.CODSPEED_TOKEN }}
           mode: "simulation"
 
+  bench-codspeed-bittranspose:
+    # Walltime measurements for the bit_transpose benchmark on real silicon, one
+    # leg per architecture. Each leg builds with its own target features and sets
+    # BENCH_VARIANT, which both gates the benchmarks (via `ignore`) and prefixes
+    # their names so x86_64 and aarch64 results do not collide in CodSpeed. The
+    # simulation variant of these benchmarks runs in the bench-codspeed job above.
+    if: github.repository == 'vortex-data/vortex'
+    strategy:
+      fail-fast: false  # a failure on one architecture must not cancel the other leg
+      matrix:
+        include:
+          - leg: x86_64
+            runner: amd64-medium
+            image: ubuntu24-full-x64-pre-v2
+            rustflags: "-C target-feature=+avx2"
+          - leg: aarch64
+            runner: arm64-medium
+            image: ubuntu24-full-arm64-pre-v2
+            rustflags: "-C target-feature=+neon"
+    name: "Benchmark bit_transpose with Codspeed (${{ matrix.leg }})"
+    timeout-minutes: 30
+    runs-on: >-
+      runs-on=${{ github.run_id }}/runner=${{ matrix.runner }}/image=${{ matrix.image }}/tag=bench-bittranspose-${{ matrix.leg }}
+    steps:
+      - uses: runs-on/action@v2
+        with:
+          sccache: s3
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6
+      - name: Setup benchmark environment
+        run: sudo bash scripts/setup-benchmark.sh
+      - uses: ./.github/actions/setup-prebuild
+      - uses: ./.github/actions/system-info
+      - name: Install Codspeed
+        uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995
+        with:
+          tool: cargo-codspeed
+      - name: Build benchmarks
+        env:  # build step only: the benches read BENCH_VARIANT at compile time via env!()
+          RUSTFLAGS: ${{ matrix.rustflags }}
+          BENCH_VARIANT: ${{ matrix.leg }}
+        run: cargo codspeed build -m walltime -p vortex-fastlanes --features _test-harness --bench bit_transpose --profile bench
+      - name: Run benchmarks
+        uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
+        with:
+          run: bash scripts/bench-taskset.sh cargo codspeed run
+          token: ${{ secrets.CODSPEED_TOKEN }}
+          mode: "walltime"
+
   bench-codspeed-cuda:
     if: github.repository == 'vortex-data/vortex'
     strategy:
diff --git a/encodings/fastlanes/benches/bit_transpose.rs b/encodings/fastlanes/benches/bit_transpose.rs
index 08c3ffb12e5..ac38c2c8b02 100644
--- a/encodings/fastlanes/benches/bit_transpose.rs
+++ b/encodings/fastlanes/benches/bit_transpose.rs
@@ -5,6 +5,8 @@ use divan::Bencher;
 use vortex_fastlanes::bit_transpose::scalar::transpose_bits_scalar;
 use vortex_fastlanes::bit_transpose::scalar::untranspose_bits_scalar;
 
+mod shared;
+
 fn main() {
     divan::main();
 }
@@ -23,9 +25,15 @@ const BATCH_SIZE: usize = 1000;
 
 // ============================================================================
 // Transpose: single array
+//
+// Scalar benchmarks are architecture-neutral, so they run on every leg as a
+// per-architecture baseline.
 // ============================================================================
 
-#[divan::bench]
+#[divan::bench(
+    name = variant!("transpose_scalar"),
+    ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
+)]
 fn transpose_scalar(bencher: Bencher) {
     let input = generate_test_data(42);
 
@@ -40,7 +48,10 @@ fn transpose_scalar(bencher: Bencher) {
 // Transpose: throughput (1000 arrays)
 // ============================================================================
 
-#[divan::bench]
+#[divan::bench(
+    name = variant!("transpose_scalar_throughput"),
+    ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
+)]
 fn transpose_scalar_throughput(bencher: Bencher) {
     let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
 
@@ -57,7 +68,10 @@ fn transpose_scalar_throughput(bencher: Bencher) {
 // Untranspose: single array
 // ============================================================================
 
-#[divan::bench]
+#[divan::bench(
+    name = variant!("untranspose_scalar"),
+    ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
+)]
 fn untranspose_scalar(bencher: Bencher) {
     let input = generate_test_data(42);
 
@@ -72,7 +86,10 @@ fn untranspose_scalar(bencher: Bencher) {
 // Untranspose: throughput (1000 arrays)
 // ============================================================================
 
-#[divan::bench]
+#[divan::bench(
+    name = variant!("untranspose_scalar_throughput"),
+    ignore = ignore_unless_variant!(simulation, x86_64, aarch64),
+)]
 fn untranspose_scalar_throughput(bencher: Bencher) {
     let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
 
@@ -87,6 +104,10 @@ fn untranspose_scalar_throughput(bencher: Bencher) {
 
 // ============================================================================
 // x86_64 benchmarks
+//
+// BMI2 and VBMI share the `x86_64` walltime leg (and the `simulation` leg); the
+// `#[target_feature]` intrinsics are selected at runtime via `has_bmi2` /
+// `has_vbmi`, so a single x86 build covers both.
 // ============================================================================
 
 #[cfg(target_arch = "x86_64")]
@@ -104,7 +125,10 @@ mod x86 {
 
     // --- Transpose: single array ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("transpose_bmi2"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn transpose_bmi2(bencher: Bencher) {
         if !has_bmi2() {
             return;
@@ -119,7 +143,10 @@ mod x86 {
             });
     }
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("transpose_vbmi"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn transpose_vbmi(bencher: Bencher) {
         if !has_vbmi() {
             return;
@@ -136,7 +163,10 @@ mod x86 {
 
     // --- Untranspose: single array ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("untranspose_bmi2"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn untranspose_bmi2(bencher: Bencher) {
         if !has_bmi2() {
             return;
@@ -151,7 +181,10 @@ mod x86 {
             });
     }
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("untranspose_vbmi"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn untranspose_vbmi(bencher: Bencher) {
         if !has_vbmi() {
             return;
@@ -168,7 +201,10 @@ mod x86 {
 
     // --- Transpose: throughput (1000 arrays) ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("transpose_bmi2_throughput"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn transpose_bmi2_throughput(bencher: Bencher) {
         if !has_bmi2() {
             return;
@@ -185,7 +221,10 @@ mod x86 {
             });
     }
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("transpose_vbmi_throughput"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn transpose_vbmi_throughput(bencher: Bencher) {
         if !has_vbmi() {
             return;
@@ -204,7 +243,10 @@ mod x86 {
 
     // --- Untranspose: throughput (1000 arrays) ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("untranspose_bmi2_throughput"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn untranspose_bmi2_throughput(bencher: Bencher) {
         if !has_bmi2() {
             return;
@@ -221,7 +263,10 @@ mod x86 {
             });
     }
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("untranspose_vbmi_throughput"),
+        ignore = crate::ignore_unless_variant!(simulation, x86_64),
+    )]
     fn untranspose_vbmi_throughput(bencher: Bencher) {
         if !has_vbmi() {
             return;
@@ -241,6 +286,8 @@ mod x86 {
 
 // ============================================================================
 // aarch64 benchmarks
+//
+// NEON has its own walltime leg; the scalar baselines above also run there.
 // ============================================================================
 
 #[cfg(target_arch = "aarch64")]
@@ -254,7 +301,10 @@ mod aarch64 {
 
     // --- Transpose: single array ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("transpose_neon"),
+        ignore = crate::ignore_unless_variant!(aarch64),
+    )]
     fn transpose_neon(bencher: Bencher) {
         let input = generate_test_data(42);
 
@@ -267,7 +317,10 @@ mod aarch64 {
 
     // --- Untranspose: single array ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("untranspose_neon"),
+        ignore = crate::ignore_unless_variant!(aarch64),
+    )]
     fn untranspose_neon(bencher: Bencher) {
         let input = generate_test_data(42);
 
@@ -280,7 +333,10 @@ mod aarch64 {
 
     // --- Transpose: throughput (1000 arrays) ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("transpose_neon_throughput"),
+        ignore = crate::ignore_unless_variant!(aarch64),
+    )]
     fn transpose_neon_throughput(bencher: Bencher) {
         let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
 
@@ -295,7 +351,10 @@ mod aarch64 {
 
     // --- Untranspose: throughput (1000 arrays) ---
 
-    #[divan::bench]
+    #[divan::bench(
+        name = crate::variant!("untranspose_neon_throughput"),
+        ignore = crate::ignore_unless_variant!(aarch64),
+    )]
     fn untranspose_neon_throughput(bencher: Bencher) {
         let inputs: Vec<[u8; 128]> = (0..BATCH_SIZE).map(generate_test_data).collect();
 
diff --git a/encodings/fastlanes/benches/shared/mod.rs b/encodings/fastlanes/benches/shared/mod.rs
new file mode 100644
index 00000000000..d7cef76b2ef
--- /dev/null
+++ b/encodings/fastlanes/benches/shared/mod.rs
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Shared helpers for the fastlanes benchmark binaries.
+//!
+//! Pulled into a bench via `mod shared;`. The macros here give every benchmark
+//! a single source of truth for *which* CPU feature set / architecture it is
+//! measured under, driven by the compile-time `BENCH_VARIANT` environment
+//! variable:
+//!
+//! * A plain `cargo bench` leaves `BENCH_VARIANT` at its `local` default (set in
+//!   `.cargo/config.toml`), so every benchmark runs once on the host.
+//! * The CodSpeed workflow sets `BENCH_VARIANT` to exactly one variant per CI
+//!   leg (e.g. `simulation`, `x86_64`, `aarch64`), which both prefixes the
+//!   benchmark name and gates whether the benchmark runs.
+
+/// Prefix a benchmark name with the active build variant.
+///
+/// `BENCH_VARIANT` is read at compile time. The prefix keeps measurements taken
+/// under different builds or architectures from colliding in CodSpeed (notably
+/// the scalar benchmarks, which run on every leg). If the variable is unset the
+/// build fails with a hint naming `.cargo/config.toml`, where the default lives.
+#[macro_export]
+macro_rules! variant {
+    ($name:literal) => {
+        concat!(env!("BENCH_VARIANT", "BENCH_VARIANT unset; see .cargo/config.toml"), "::", $name)
+    };
+}
+
+/// Map a known variant identifier to the string `BENCH_VARIANT` is compared with.
+///
+/// Adding a variant? Add an arm here *and* a matching CI leg. Unknown identifiers
+/// fail to compile, so a typo can't create a tag that never runs. `local` needs
+/// no arm: [`ignore_unless_variant!`] checks for it directly.
+#[macro_export]
+macro_rules! variant_tag {
+    (simulation) => {
+        "simulation"
+    };
+    (x86_64) => {
+        "x86_64"
+    };
+    (aarch64) => {
+        "aarch64"
+    };
+}
+
+/// divan `ignore` expression: skip this benchmark *unless* we are running
+/// locally (`BENCH_VARIANT=local`, the default) or the active variant is one of
+/// the listed feature sets. CI sets `BENCH_VARIANT` to exactly one variant per
+/// leg; locally it defaults to `local`, so every benchmark runs.
+///
+/// The gate is an OR-chain of `==` against [`variant_tag!`], so an unknown tag is
+/// a compile error. An unset `BENCH_VARIANT` also fails the build, with a hint
+/// naming `.cargo/config.toml`, where the `local` default is defined.
+#[macro_export]
+macro_rules! ignore_unless_variant {
+    ($($v:ident),+ $(,)?) => {{
+        let active = env!("BENCH_VARIANT", "BENCH_VARIANT unset; see .cargo/config.toml");
+        !(active == "local" $(|| active == $crate::variant_tag!($v))+)
+    }};
+}