diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index e0acc0bb9..85c9c9245 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -65,7 +65,9 @@ struct lazy_discard { __device__ void device_dispatch(index_type index, value_type const& value) const { // pick some predicate that is always false, but depends on the runtime value - if (threadIdx.x > 2025 + *reinterpret_cast(&value)) { *(it + index) = value; } + if (threadIdx.x > 2025 + *reinterpret_cast(&value) + static_cast(index)) { + *(it + index) = value; + } } __host__ __device__ void operator()(index_type index, value_type const& value) const { diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index b07c285d4..e539fa1d8 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -26,6 +26,9 @@ #include #include +#include +#include +#include #include #include @@ -35,132 +38,235 @@ using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t, using namespace cuco::utility; // key_generator, distribution /** - * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance + * @brief Implementation of `cuco::bloom_filter::add_async` with + * `parametric_filter_policy` */ -template -void bloom_filter_add(nvbench::state& state, - nvbench::type_list, Dist>) -{ - using size_type = std::uint32_t; - using policy_type = cuco::default_filter_policy, +template +void pfp_bloom_filter_add_impl(nvbench::state& state, + nvbench::type_list(WordsPerBlock)>; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - constexpr auto filter_block_size = - sizeof(typename filter_type::word_type) * filter_type::words_per_block; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - auto const pattern_bits = WordsPerBlock; - - try { - [[maybe_unused]] auto const policy = 
policy_type{static_cast(pattern_bits)}; - } catch (std::exception const& e) { - state.skip(e.what()); // skip invalid configurations - } - - std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size; - - if (num_sub_filters > std::numeric_limits::max()) { - state.skip("num_sub_filters too large for size_type"); // skip invalid configurations + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) +{ + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr pattern_bits_per_word = PatternBits / words_per_block; + + // Check for a valid configuration + if constexpr ((not cuda::std::has_single_bit(static_cast(BlockBits))) or + (words_per_block == 0)) { + state.skip("Invalid filter block size"); + } else if constexpr (HorizontalLayout * VerticalLayout != words_per_block) { + state.skip("Invalid vectorization layout"); + } else if constexpr ((pattern_bits_per_word <= 0) or + (pattern_bits_per_word > cuda::std::numeric_limits::digits) or + (pattern_bits_per_word * words_per_block > 64)) { + state.skip("Invalid pattern bits per word"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr contains_vertical_layout = words_per_block; + auto constexpr contains_horizontal_layout = 1; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + 
if constexpr (ExcludeIO) { + state.add_global_memory_writes(num_keys * words_per_block); + + cuda::counting_iterator keys(0); + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + filter.add_async(keys, keys + num_keys, {launch.get_stream()}); + timer.stop(); + filter.clear_async({launch.get_stream()}); + }); + } else { + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + timer.stop(); + filter.clear_async({launch.get_stream()}); + }); + } } - - cuda::counting_iterator keys(0); - - state.add_element_count(num_keys); - - filter_type filter{ - static_cast(num_sub_filters), {}, {static_cast(pattern_bits)}}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - state.exec([&](nvbench::launch& launch) { - filter.add_async(keys, keys + num_keys, {launch.get_stream()}); - }); } /** * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with - * `arrow_filter_policy` + * `parametric_filter_policy` with IO */ -template -void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list) +template +void pfp_bloom_filter_add(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) { - using size_type = std::uint32_t; - using policy_type = cuco::arrow_filter_policy; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); + constexpr bool exclude_io = false; + pfp_bloom_filter_add_impl(state, type_list); +} - std::size_t const num_sub_filters = - (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) 
* filter_type::words_per_block); +/** + * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with + * `parametric_filter_policy` without IO + */ +template +void pfp_bloom_filter_add_exclude_io( + nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) +{ + constexpr bool exclude_io = true; + pfp_bloom_filter_add_impl(state, type_list); +} - if (num_sub_filters > policy_type::max_filter_blocks) { - state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid - // configurations +/** + * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with + * `parametric_filter_policy` with cache sectorization + */ +template +void pfp_bloom_filter_add_csbf(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) +{ + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr words_per_group = words_per_block / GroupsPerBlock; + auto constexpr VerticalLayout = words_per_block / HorizontalLayout; + + if constexpr (words_per_group == 0) { + state.skip("Invalid GroupsPerBlock"); + } else if constexpr ((HorizontalLayout * VerticalLayout != words_per_block) or + (VerticalLayout < words_per_group)) { + state.skip("Invalid vectorization layout"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr contains_horizontal_layout = GroupsPerBlock; + auto constexpr contains_vertical_layout = words_per_group; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if 
(num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + timer.stop(); + filter.clear_async({launch.get_stream()}); + }); } - - cuda::counting_iterator keys(0); - - state.add_element_count(num_keys); - - filter_type filter{static_cast(num_sub_filters)}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - state.exec([&](nvbench::launch& launch) { - filter.add_async(keys, keys + num_keys, {launch.get_stream()}); - }); } -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_add_unique_size") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); - -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - defaults::HASH_RANGE, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_add_unique_hash") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list<1, 2, 4, 8>, - nvbench::type_list)) - 
.set_name("bloom_filter_add_unique_block_dim") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) +NVBENCH_BENCH_TYPES( + pfp_bloom_filter_add, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, ///< Word + nvbench::enum_type_list<64, 128, 256, 512, 1024>, ///< BlockBits + nvbench::enum_type_list<16>, ///< PatternBits + nvbench::enum_type_list<1, 2, 4, 8, 16>, ///< HorizontalLayout + nvbench::enum_type_list<1, 2, 4> ///< VerticalLayout + )) + .set_name("pfp_bloom_filter_add_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); -NVBENCH_BENCH_TYPES(arrow_bloom_filter_add, +NVBENCH_BENCH_TYPES(pfp_bloom_filter_add_csbf, NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list)) - .set_name("arrow_bloom_filter_add_unique_size") - .set_type_axes_names({"Key", "Distribution"}) + nvbench::type_list, + nvbench::enum_type_list<128, 256, 512, 1024>, + nvbench::enum_type_list<16>, + nvbench::enum_type_list<2, 4, 8>, + nvbench::enum_type_list<1, 2, 4, 8>)) + .set_name("pfp_bloom_filter_add_csbf_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "GroupsPerBlock", "HorizontalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); \ No newline at end of file diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu index 7a4549524..ef6594ad5 100644 --- a/benchmarks/bloom_filter/contains_bench.cu +++ b/benchmarks/bloom_filter/contains_bench.cu @@ -15,6 +15,7 @@ */ #include "defaults.hpp" +#include "nvbench/state.cuh" #include "utils.hpp" #include @@ -26,7 +27,10 
@@ #include #include +#include #include +#include +#include #include #include @@ -35,146 +39,279 @@ using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t, using namespace cuco::utility; // key_generator, distribution /** - * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance + * @brief Implementation of `cuco::bloom_filter::contains_async` with + * `parametric_filter_policy` */ -template -void bloom_filter_contains( - nvbench::state& state, - nvbench::type_list, Dist>) +template +void pfp_bloom_filter_contains_impl(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) { - using size_type = std::uint32_t; - using policy_type = cuco::default_filter_policy, - Word, - static_cast(WordsPerBlock)>; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - constexpr auto filter_block_size = - sizeof(typename filter_type::word_type) * filter_type::words_per_block; - - // if (filter_block_size <= 32) { - // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if - // filter block fits into a 32B sector - // } - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - auto const pattern_bits = WordsPerBlock; - - try { - [[maybe_unused]] auto const policy = policy_type{static_cast(pattern_bits)}; - } catch (std::exception const& e) { - state.skip(e.what()); // skip invalid configurations - } - - std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size; - - if (num_sub_filters > std::numeric_limits::max()) { - state.skip("num_sub_filters too large for size_type"); // skip invalid configurations + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr pattern_bits_per_word = PatternBits / words_per_block; + + // Check for a valid configuration + if constexpr ((not 
cuda::std::has_single_bit(static_cast(BlockBits))) or + (words_per_block == 0)) { + state.skip("Invalid filter block size"); + } else if constexpr (HorizontalLayout * VerticalLayout > words_per_block) { + state.skip("Invalid vectorization layout"); // TODO check if this is correct + } else if constexpr ((pattern_bits_per_word <= 0) or + (pattern_bits_per_word > cuda::std::numeric_limits::digits) or + (pattern_bits_per_word * words_per_block > 64)) { + state.skip("Invalid pattern bits per word"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr add_vertical_layout = 1; + auto constexpr add_horizontal_layout = words_per_block; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::counting_iterator key_it(0); + + // insert FPR-optimal number of keys + auto const num_build_keys = (filter_size_mb * 1024 * 1024 * 8) / (2 * PatternBits); + filter.add(key_it, key_it + num_build_keys); + + // FPR summary + thrust::device_vector result(num_keys, false); + filter.contains(key_it + num_build_keys, key_it + num_build_keys + num_keys, result.begin()); + + double const fp = thrust::count(thrust::device, result.begin(), result.end(), true); + + auto& summ_fpr = state.add_summary("FalsePositiveRate"); + summ_fpr.set_string("hint", "FPR"); + summ_fpr.set_string("short_name", "FPR"); + summ_fpr.set_string("description", 
"False-positive rate of the bloom filter."); + summ_fpr.set_float64("value", fp / static_cast(num_keys)); + + state.collect_dram_throughput(); + state.collect_l2_hit_rates(); + + if constexpr (ExcludeIO) { + state.add_global_memory_reads(num_keys * words_per_block); + // state.collect_dram_throughput(); + + auto result_it = make_lazy_discard_iterator(result.begin()); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(key_it, key_it + num_keys, result_it, {launch.get_stream()}); + }); + } else { + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.add_global_memory_reads(num_keys * + ((words_per_block * sizeof(Word)) + sizeof(Key))); + state.add_global_memory_writes(num_keys * sizeof(bool)); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); + } } +} - cuda::counting_iterator keys(0); - thrust::device_vector result(num_keys, false); - - state.add_element_count(num_keys); - - filter_type filter{ - static_cast(num_sub_filters), {}, {static_cast(pattern_bits)}}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - filter.add(keys, keys + num_keys); +/** + * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with + * `parametric_filter_policy` with IO + */ +template +void pfp_bloom_filter_contains(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) +{ + constexpr bool exclude_io = false; + pfp_bloom_filter_contains_impl(state, type_list); +} - state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); - }); +/** + * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with + * `parametric_filter_policy` without IO + */ +template +void 
pfp_bloom_filter_contains_exclude_io( + nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) +{ + constexpr bool exclude_io = true; + pfp_bloom_filter_contains_impl(state, type_list); } /** * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with - * `arrow_filter_policy` + * `parametric_filter_policy` with cache sectorization */ -template -void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list) +template +void pfp_bloom_filter_contains_csbf(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) { - // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if - // filter block fits into a 32B sector - using size_type = std::uint32_t; - using policy_type = cuco::arrow_filter_policy; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - - std::size_t const num_sub_filters = - (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - - if (num_sub_filters > policy_type::max_filter_blocks) { - state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid - // configurations + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr words_per_group = words_per_block / GroupsPerBlock; + auto constexpr VerticalLayout = words_per_block / HorizontalLayout; + + if constexpr (words_per_group == 0) { + state.skip("Invalid GroupsPerBlock"); + } else if constexpr ((HorizontalLayout * VerticalLayout != words_per_block) or + (VerticalLayout < words_per_group)) { + state.skip("Invalid vectorization layout"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr 
add_horizontal_layout = GroupsPerBlock; + auto constexpr add_vertical_layout = words_per_group; + + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::counting_iterator key_it(0); + + // insert FPR-optimal number of keys + auto const num_build_keys = (filter_size_mb * 1024 * 1024 * 8) / (2 * PatternBits); + filter.add(key_it, key_it + num_build_keys); + + // FPR summary + thrust::device_vector result(num_keys, false); + filter.contains(key_it + num_build_keys, key_it + num_build_keys + num_keys, result.begin()); + + double const fp = thrust::count(thrust::device, result.begin(), result.end(), true); + + auto& summ_fpr = state.add_summary("FalsePositiveRate"); + summ_fpr.set_string("hint", "FPR"); + summ_fpr.set_string("short_name", "FPR"); + summ_fpr.set_string("description", "False-positive rate of the bloom filter."); + summ_fpr.set_float64("value", fp / static_cast(num_keys)); + + state.collect_dram_throughput(); + state.collect_l2_hit_rates(); + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.add_global_memory_reads(num_keys * + ((words_per_block * sizeof(Word)) + sizeof(Key))); + state.add_global_memory_writes(num_keys * sizeof(bool)); + // state.collect_dram_throughput(); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(keys.begin(), keys.end(), result.begin(), 
{launch.get_stream()}); + }); } - - cuda::counting_iterator keys(0); - thrust::device_vector result(num_keys, false); - - state.add_element_count(num_keys); - - filter_type filter{static_cast(num_sub_filters)}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - filter.add(keys, keys + num_keys); - - state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); - }); } -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_size") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); - -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - defaults::HASH_RANGE, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_hash") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list<1, 2, 4, 8>, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_block_dim") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) +NVBENCH_BENCH_TYPES( + pfp_bloom_filter_contains, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, ///< Word + nvbench::enum_type_list<64, 128, 256, 512, 1024>, ///< BlockBits + nvbench::enum_type_list<16>, ///< PatternBits + nvbench::enum_type_list<1, 2, 4, 8, 16>, /// ///< VerticalLayout + )) + 
.set_name("pfp_bloom_filter_contains_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); -NVBENCH_BENCH_TYPES(arrow_bloom_filter_contains, +NVBENCH_BENCH_TYPES(pfp_bloom_filter_contains_csbf, NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list)) - .set_name("arrow_bloom_filter_contains_unique_size") - .set_type_axes_names({"Key", "Distribution"}) + nvbench::type_list, + nvbench::enum_type_list<128, 256, 512, 1024>, + nvbench::enum_type_list<16>, + nvbench::enum_type_list<2, 4, 8>, + nvbench::enum_type_list<1, 2, 4, 8>)) + .set_name("pfp_bloom_filter_contains_csbf_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "GroupsPerBlock", "HorizontalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); \ No newline at end of file diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp index f1b192aa3..8b7d45f6f 100644 --- a/benchmarks/bloom_filter/defaults.hpp +++ b/benchmarks/bloom_filter/defaults.hpp @@ -33,9 +33,11 @@ using BF_WORD = nvbench::uint32_t; static constexpr auto BF_N = 1'000'000'000; static constexpr auto BF_SIZE_MB = 2'000; static constexpr auto BF_WORDS_PER_BLOCK = 8; +static constexpr auto BF_PATTERN_BITS = 8; auto const BF_SIZE_MB_RANGE_CACHE = std::vector{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; -auto const BF_PATTERN_BITS_RANGE = std::vector{1, 2, 4, 6, 8, 16}; +auto const BF_SIZE_MB_RANGE_FRONTIER_CACHE = std::vector{32, 1024}; +auto const BF_PATTERN_BITS_RANGE = std::vector{1, 2, 4, 6, 8, 16}; } // namespace cuco::benchmark::defaults diff --git 
a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh index 3f2e69c7c..b50d3753f 100644 --- a/include/cuco/bloom_filter.cuh +++ b/include/cuco/bloom_filter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,7 +36,13 @@ namespace cuco { /** - * @brief A GPU-accelerated Blocked Bloom Filter. + * @brief A GPU-accelerated Bloom filter. + * + * Implements the Sectorized Bloom Filter (SBF) and Cache-Sectorized Bloom Filter (CSBF) variants + * from "Optimizing Bloom Filters for Modern GPU Architectures" (arXiv:2512.15595). Fingerprint + * generation is parameterized by `parametric_filter_policy` (see `cuco/bloom_filter_policies.cuh`), + * which exposes the paper's Theta/Phi vectorization layout and optional `GroupsPerBlock` + * cache-sectorization parameters as compile-time template parameters. * * The `bloom_filter` supports two types of operations: * - Host-side "bulk" operations @@ -51,6 +57,12 @@ namespace cuco { * independent add or lookup operations from device code. These operations are accessed through * non-owning, trivially copyable reference types (or "ref"). * + * @note The default `Policy` (`cuco::default_filter_policy`) is an alias for a + * `parametric_filter_policy` instantiation with paper-recommended layouts on a 256-bit block. Users + * who need different layouts (smaller/larger blocks, different fingerprint bit counts, CSBF mode) + * can instantiate `cuco::parametric_filter_policy<...>` directly and pass it as the `Policy` + * template argument. 
+ * * @tparam Key Key type * @tparam Extent Size type that is used to determine the number of blocks in the filter * @tparam Scope The scope in which operations will be performed by individual threads @@ -61,8 +73,8 @@ namespace cuco { template , cuda::thread_scope Scope = cuda::thread_scope_device, - class Policy = cuco::default_filter_policy, std::uint32_t, 8>, - class Allocator = cuco::cuda_allocator> + class Policy = cuco::default_filter_policy, + class Allocator = cuco::cuda_allocator> class bloom_filter { public: /** @@ -136,7 +148,7 @@ class bloom_filter { * @param stream CUDA stream used for device memory operations and kernel launches */ __host__ constexpr void clear_async(cuda::stream_ref stream = cuda::stream_ref{ - cudaStream_t{nullptr}}); + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds all keys in the range `[first, last)` to the filter. @@ -164,8 +176,10 @@ class bloom_filter { * @param stream CUDA stream used for device memory operations and kernel launches */ template - __host__ constexpr void add_async( - InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + __host__ constexpr void add_async(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{ + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns diff --git a/include/cuco/bloom_filter_policies.cuh b/include/cuco/bloom_filter_policies.cuh index 0d28b166a..1aa116c14 100644 --- a/include/cuco/bloom_filter_policies.cuh +++ b/include/cuco/bloom_filter_policies.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,8 +16,7 @@ #pragma once -#include -#include +#include #include #include @@ -25,110 +24,60 @@ namespace cuco { /** - * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's - * fingerprint. + * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation. * - * @tparam Key The type of the values to generate a fingerprint for. - * @tparam XXHash64 Custom (64 bit) XXHash hasher to generate a key's fingerprint. - * By default, cuco::xxhash_64 hasher will be used. + * Implements the Sectorized Bloom Filter (SBF) and Cache-Sectorized Bloom Filter (CSBF) variants + * from "Optimizing Bloom Filters for Modern GPU Architectures" (arXiv:2512.15595). Distributes + * `PatternBits` set bits across `WordsPerBlock` words via compile-time salt-based multiplicative + * hashing. * + * Requires a 64-bit hash function: the result is split into upper 32 bits (block selection via + * multiply-shift) and lower 32 bits (pattern generation). This is a permanent design requirement. + * + * @note Constructor signature: `parametric_filter_policy(Hash hash = {})`. + * + * @tparam Hash 64-bit hash functor. + * @tparam Word Underlying word type of a filter block. + * @tparam WordsPerBlock Words per filter block. + * @tparam PatternBits Fingerprint bits per key (paper's k). + * @tparam AddHorizontalLayout CG size for add (paper's Theta). + * @tparam AddVerticalLayout Words per thread per add step (paper's Phi). + * @tparam ContainsHorizontalLayout CG size for contains. + * @tparam ContainsVerticalLayout Words per thread per contains step. + * @tparam GroupsPerBlock Cache-sectorization groups (paper's z). Defaults to `WordsPerBlock` + * (non-CSBF). 
*/ -template class XXHash64 = cuco::xxhash_64> -using arrow_filter_policy = detail::arrow_filter_policy; +template +using parametric_filter_policy = detail::parametric_filter_policy; /** - * @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's - * fingerprint. + * @brief Default Bloom filter policy used by `cuco::bloom_filter` when no policy is specified. * - * @note `Word` type must be an atomically updatable integral type. `WordsPerBlock` must - * be a power-of-two. + * Alias for a `parametric_filter_policy` instantiation with paper-recommended layouts on a 256-bit + * block: 8 x `uint32_t` words, 8 fingerprint bits per key, fully horizontal add (Theta=8) and fully + * vertical contains (Phi=8). * - * @tparam Hash Hash function used to generate a key's fingerprint - * @tparam Word Underlying word/segment type of a filter block - * @tparam WordsPerBlock Number of words/segments in each block + * @tparam Key The key type to generate a fingerprint for. + * @tparam XXHash64 64-bit XXHash functor template. Defaults to `cuco::xxhash_64`. */ -template -class default_filter_policy { - using impl_type = cuco::detail::default_filter_policy_impl; - - public: - using hasher = typename impl_type::hasher; ///< Type of the hash function - using hash_argument_type = typename impl_type::hash_argument_type; ///< Hash function input type - using hash_result_type = typename impl_type::hash_result_type; ///< hash function output type - using word_type = - typename impl_type::word_type; ///< Underlying word/segment type of a filter block - - static constexpr std::uint32_t words_per_block = - impl_type::words_per_block; ///< Number of words/segments in each filter block - - public: - /** - * @brief Constructs the `default_filter_policy` object. - * - * @throws Compile-time error if the specified number of words in a filter block is not a - * power-of-two or is larger than 32. 
If called from host: throws exception; If called from - * device: Traps the kernel. - * - * @throws If the `hash_result_type` is too narrow to generate the requested number of - * `pattern_bits`. If called from host: throws exception; If called from device: Traps the kernel. - * - * @throws If `pattern_bits` is smaller than the number of words in a filter block or larger than - * the total number of bits in a filter block. If called from host: throws exception; If called - * from device: Traps the kernel. - * - * @param pattern_bits Number of bits in a key's fingerprint - * @param hash Hash function used to generate a key's fingerprint - */ - __host__ __device__ constexpr default_filter_policy(std::uint32_t pattern_bits = words_per_block, - Hash hash = {}); - - /** - * @brief Generates the hash value for a given key. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. - * - * @param key The key to hash - * - * @return The hash value of the key - */ - __device__ constexpr hash_result_type hash(hash_argument_type const& key) const; - - /** - * @brief Determines the filter block a key is added into. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. - * - * @tparam Extent Size type that is used to determine the number of blocks in the filter - * - * @param hash Hash value of the key - * @param num_blocks Number of block in the filter - * - * @return The block index for the given key's hash value - */ - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const; - - /** - * @brief Determines the fingerprint pattern for a word/segment within the filter block for a - * given key's hash value. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. 
- * - * @param hash Hash value of the key - * @param word_index Target word/segment within the filter block - * - * @return The bit pattern for the word/segment in the filter block - */ - __device__ constexpr word_type word_pattern(hash_result_type hash, - std::uint32_t word_index) const; - - private: - impl_type impl_; ///< Policy implementation -}; +template class XXHash64 = cuco::xxhash_64> +using default_filter_policy = + parametric_filter_policy, std::uint32_t, 8, 8, 8, 1, 1, 8>; } // namespace cuco - -#include \ No newline at end of file diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index f58910afc..ad4ee0cb3 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -109,7 +109,7 @@ class bloom_filter_ref { * @param stream CUDA stream used for device memory operations and kernel launches */ __host__ constexpr void clear_async(cuda::stream_ref stream = cuda::stream_ref{ - cudaStream_t{nullptr}}); + cudaStream_t{nullptr}}) noexcept; /** * @brief Device function that adds a key to the filter. @@ -121,6 +121,17 @@ class bloom_filter_ref { template __device__ void add(ProbeKey const& key); + /** + * @brief Device function that adds all keys in the range `[first, last)` to the filter. + * + * @tparam InputIt Device-accessible random access input key iterator + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + */ + template + __device__ void add(InputIt first, InputIt last); + /** * @brief Device function that cooperatively adds a key to the filter. 
* @@ -178,8 +189,10 @@ class bloom_filter_ref { * @param stream CUDA stream used for device memory operations and kernel launches */ template - __host__ constexpr void add_async( - InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + __host__ constexpr void add_async(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{ + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns @@ -248,6 +261,20 @@ class bloom_filter_ref { template [[nodiscard]] __device__ bool contains(ProbeKey const& key) const; + /** + * @brief Device function that tests if all keys in the range `[first, last)` are present in the + * filter. + * + * @tparam InputIt Device-accessible random access input key iterator + * @tparam OutputIt Device-accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + */ + template + __device__ void contains(InputIt first, InputIt last, OutputIt output_begin) const; + /** * @brief Device function that tests if a key's fingerprint is present in the filter. * @@ -265,10 +292,21 @@ class bloom_filter_ref { template [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const; - // TODO - // template - // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) - // const; + /** + * @brief Device function that tests if all keys in the range `[first, last)` are present in the + * filter. 
+ * + * @tparam CG Cooperative Group type + * @tparam InputIt Device-accessible random access input key iterator + * @tparam OutputIt Device-accessible output iterator assignable from `bool` + * + * @param group The Cooperative Group this operation is executed with + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + */ + template + __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const; /** * @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh deleted file mode 100644 index 481293335..000000000 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -#include -#include - -namespace cuco::detail { - -/** - * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's - * fingerprint. 
- * - * Reference: - * https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230 - * - * Example: - * @code{.cpp} - * template - * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector const& positive_keys, - * device_vector const& negative_keys) - * { - * using policy_type = cuco::arrow_filter_policy; - * - * // Warn or throw if the number of filter blocks is greater than maximum used by Arrow policy. - * static_assert(NUM_FILTER_BLOCKS <= policy_type::max_filter_blocks, "NUM_FILTER_BLOCKS must be - * in range: [1, 4194304]"); - * - * // Create a bloom filter with Arrow policy - * cuco::bloom_filter, - * cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS}; - * - * // Add positive keys to the bloom filter - * filter.add(positive_keys.begin(), positive_keys.end()); - * - * auto const num_tp = positive_keys.size(); - * auto const num_tn = negative_keys.size(); - * - * // Vectors to store query results. - * thrust::device_vector true_positive_result(num_tp, false); - * thrust::device_vector true_negative_result(num_tn, false); - * - * // Query the bloom filter for the inserted keys. - * filter.contains(positive_keys.begin(), positive_keys.end(), true_positive_result.begin()); - * - * // We should see a true-positive rate of 1. - * float true_positive_rate = float(thrust::count(thrust::device, - * true_positive_result.begin(), true_positive_result.end(), true)) / float(num_tp); - * - * // Query the bloom filter for the non-inserted keys. - * filter.contains(negative_keys.begin(), negative_keys.end(), true_negative_result.begin()); - * - * // We may see a false-positive rate > 0 depending on the number of bits in the - * // filter and the number of hashes used per key. 
- * float false_positive_rate = float(thrust::count(thrust::device, - * true_negative_result.begin(), true_negative_result.end(), true)) / float(num_tn); - * } - * @endcode - * - * @tparam Key The type of the values to generate a fingerprint for. - * @tparam XXHash64 64-bit XXHash hasher implementation for fingerprint generation. - */ -template class XXHash64> -class arrow_filter_policy { - public: - using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy - using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy - using key_type = Key; ///< Hash function input type - using hash_result_type = std::uint64_t; ///< hash function output type - - static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block - static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block - - static constexpr std::uint32_t bytes_per_filter_block = - 32; ///< Number of bytes in one Arrow filter block - static constexpr std::uint32_t max_arrow_filter_bytes = - 128 * 1024 * 1024; ///< Max bytes in Arrow bloom filter - static constexpr std::uint32_t max_filter_blocks = - (max_arrow_filter_bytes / - bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter - - public: - /** - * @brief Constructs the `arrow_filter_policy` object. - * - * @note The number of filter blocks with Arrow policy must be in the - * range of [1, 4194304]. If the bloom filter is constructed with a larger - * number of blocks, only the first 4194304 (128MB) blocks will be used. - * - * @param hash Hash function used to generate a key's fingerprint - */ - __host__ __device__ constexpr arrow_filter_policy(hasher hash = {}) : hash_{hash} {} - - /** - * @brief Generates the hash value for a given key. 
- * - * @param key The key to hash - * - * @return The hash value of the key - */ - __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); } - - /** - * @brief Determines the filter block a key is added into. - * - * @note The number of filter blocks with Arrow policy must be in the - * range of [1, 4194304]. Passing a larger `num_blocks` will still - * upperbound the number of blocks used to the mentioned range. - * - * @tparam Extent Size type that is used to determine the number of blocks in the filter - * - * @param hash Hash value of the key - * @param num_blocks Number of block in the filter - * - * @return The block index for the given key's hash value - */ - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const - { - constexpr auto hash_bits = cuda::std::numeric_limits::digits; - // TODO: assert if num_blocks > max_filter_blocks - auto const max_blocks = cuda::std::min(num_blocks, max_filter_blocks); - // Make sure we are only contained withing the `max_filter_blocks` blocks - return static_cast(((hash >> hash_bits) * max_blocks) >> hash_bits) % max_blocks; - } - - /** - * @brief Determines the fingerprint pattern for a word/segment within the filter block for a - * given key's hash value. - * - * @param hash Hash value of the key - * @param word_index Target word/segment within the filter block - * - * @return The bit pattern for the word/segment in the filter block - */ - __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const - { - word_type const key = static_cast(hash); - std::uint32_t salt{}; - - // Basically a switch (word_index) { case 0-7 ... } - // First split: 0..3 versus 4..7. - if (word_index < 4) { - // For indices 0..3, further split into 0..1 and 2..3. - if (word_index < 2) { - // word_index is 0 or 1. - salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U; - } else { - // word_index is 2 or 3. 
- salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU; - } - } else { - // For indices 4..7, further split into 4..5 and 6..7. - if (word_index < 6) { - // word_index is 4 or 5. - salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU; - } else { - // word_index is 6 or 7. - salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U; - } - } - return word_type{1} << ((key * salt) >> 27); - } - - private: - hasher hash_; -}; - -} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/bloom_filter.inl b/include/cuco/detail/bloom_filter/bloom_filter.inl index f3306b2fb..eb3bfcc4a 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter.inl @@ -34,8 +34,9 @@ __host__ constexpr bloom_filter::bloom_fi Allocator const& alloc, cuda::stream_ref stream) : allocator_{alloc}, - data_{allocator_.allocate(num_blocks, stream), - detail::custom_deleter{num_blocks, allocator_, stream}}, + data_{allocator_.allocate(static_cast(num_blocks), stream), + detail::custom_deleter{ + static_cast(num_blocks), allocator_, stream}}, ref_{data_.get(), num_blocks, {}, policy} { this->clear_async(stream); @@ -50,7 +51,7 @@ __host__ constexpr void bloom_filter::cle template __host__ constexpr void bloom_filter::clear_async( - cuda::stream_ref stream) + cuda::stream_ref stream) noexcept { ref_.clear_async(stream); } @@ -66,7 +67,7 @@ __host__ constexpr void bloom_filter::add template template __host__ constexpr void bloom_filter::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { ref_.add_async(first, last, stream); } diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index 8e2b49453..f558c5e00 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -44,204 +44,6 @@ namespace 
cuco::detail { -/** - * @brief Device functor for adding a single key to the bloom filter - * - * This functor is used with cuda::static_for to iterate over all words in a filter block - * and set the appropriate bits for a given key's hash value. Each iteration processes - * one word in the block using atomic operations to ensure thread safety. - * - * @tparam HashValue Type of the hash value (typically uint64_t) - * @tparam BlockIndex Type of the block index (typically size_t or uint32_t) - * @tparam Policy Filter policy type that provides word pattern generation - * @tparam WordType Underlying word type of the filter (typically uint64_t) - * @tparam Scope CUDA thread scope for atomic operations - */ -template -struct add_impl_functor { - HashValue hash_value; ///< Hash value of the key being added - BlockIndex block_index; ///< Index of the filter block to modify - Policy policy_; ///< Filter policy for generating bit patterns - WordType* words_; ///< Pointer to the filter's word array - size_t words_per_block; ///< Number of words in each filter block - - /** - * @brief Processes one word in the filter block for key insertion - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - auto const word = policy_.word_pattern(hash_value, i()); - if (word != 0) { - auto atom_word = - cuda::atomic_ref{*(words_ + (block_index * words_per_block + i()))}; - atom_word.fetch_or(word, cuda::memory_order_relaxed); - } - } -}; - -/** - * @brief Device functor for cooperative group-based batch key insertion - * - * This functor is used with cuda::static_for to process multiple keys in parallel - * within a cooperative group. Each thread in the group processes a different key - * using shuffle operations to share hash values and block indices across threads. 
- * - * @tparam CG Cooperative group type (e.g., thread_block_tile) - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam BloomFilterImpl Type of the bloom filter implementation - */ -template -struct add_group_functor { - CG group; ///< Cooperative group for parallel processing - HashValue hash_value; ///< Hash value of the current thread's key - BlockIndex block_index; ///< Block index of the current thread's key - size_t i; ///< Starting index in the key batch - size_t num_keys; ///< Total number of keys to process - size_t num_threads; ///< Number of threads in the group - BloomFilterImpl* self; ///< Pointer to the bloom filter implementation - - /** - * @brief Processes one thread's key in the cooperative group batch insertion - * - * @tparam J Type of the integral constant passed by cuda::static_for - * @param j Integral constant representing the thread index within the group - */ - template - __device__ void operator()(J j) const - { - if ((j() < num_threads) and (i + j() < num_keys)) { - self->add_impl(group, group.shfl(hash_value, j()), group.shfl(block_index, j())); - } - } -}; - -/** - * @brief Device functor for worker group-based batch key insertion - * - * This functor is used with cuda::static_for to process multiple keys in parallel - * within a worker group (subdivision of a larger cooperative group). Similar to - * add_group_functor but operates on a smaller worker group with offset handling - * for processing different portions of the key batch. 
- * - * @tparam WorkerGroup Worker group type (subdivision of cooperative group) - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam BloomFilterImpl Type of the bloom filter implementation - */ -template -struct add_worker_group_functor { - WorkerGroup worker_group; ///< Worker group (subdivision of cooperative group) - HashValue hash_value; ///< Hash value of the current thread's key - BlockIndex block_index; ///< Block index of the current thread's key - size_t i; ///< Starting index in the key batch - size_t worker_offset; ///< Offset for this worker group within the batch - size_t num_keys; ///< Total number of keys to process - size_t worker_num_threads; ///< Number of threads in the worker group - BloomFilterImpl* self; ///< Pointer to the bloom filter implementation - - /** - * @brief Processes one thread's key in the worker group batch insertion - * - * @tparam J Type of the integral constant passed by cuda::static_for - * @param j Integral constant representing the thread index within the worker group - */ - template - __device__ void operator()(J j) const - { - if ((j() < worker_num_threads) and (i + worker_offset + j() < num_keys)) { - self->add_impl( - worker_group, worker_group.shfl(hash_value, j()), worker_group.shfl(block_index, j())); - } - } -}; - -/** - * @brief Device functor for cooperative group-based single key insertion - * - * This functor is used with cuda::static_for to add a single key to the bloom filter - * using a cooperative group. Each thread in the group processes different words in - * the filter block based on thread rank and stride pattern. Used when the group size - * doesn't match the number of words per block. 
- * - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam WordType Underlying word type of the filter - * @tparam Scope CUDA thread scope for atomic operations - * @tparam Policy Filter policy type that provides word pattern generation - */ -template -struct add_impl_group_functor { - HashValue hash_value; ///< Hash value of the key being added - BlockIndex block_index; ///< Index of the filter block to modify - WordType* words_; ///< Pointer to the filter's word array - size_t words_per_block; ///< Number of words in each filter block - size_t rank; ///< Thread rank within the cooperative group - size_t num_threads; ///< Number of threads in the cooperative group - Policy policy_; ///< Filter policy for generating bit patterns - - /** - * @brief Processes one word in the filter block using cooperative group stride pattern - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - if (i() >= rank && (i() - rank) % num_threads == 0) { - auto atom_word = - cuda::atomic_ref{*(words_ + (block_index * words_per_block + i()))}; - atom_word.fetch_or(policy_.word_pattern(hash_value, i()), cuda::memory_order_relaxed); - } - } -}; - -/** - * @brief Device functor for checking if a key exists in the bloom filter - * - * This functor is used with cuda::static_for to iterate over all words in a filter block - * and check if the expected bit patterns for a given key's hash value are present. - * If any expected bit is missing, the result is set to false, indicating the key - * is definitely not in the set. 
- * - * @tparam HashValue Type of the hash value - * @tparam StoredPattern Type of the stored pattern array (typically array of WordType) - * @tparam Policy Filter policy type that provides word pattern generation - */ -template -struct contains_functor { - HashValue hash_value; ///< Hash value of the key being queried - StoredPattern stored_pattern; ///< Array of stored bit patterns from the filter block - Policy policy_; ///< Filter policy for generating expected bit patterns - bool* result; ///< Pointer to result flag (set to false if key not found) - - /** - * @brief Checks one word in the filter block for the expected bit pattern - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - auto const expected_pattern = policy_.word_pattern(hash_value, i()); - if ((stored_pattern[i()] & expected_pattern) != expected_pattern) { *result = false; } - } -}; - template class bloom_filter_impl { public: @@ -250,26 +52,43 @@ class bloom_filter_impl { using size_type = typename extent_type::value_type; using policy_type = Policy; using word_type = typename policy_type::word_type; + // uint64_t may be unsigned long, but atomicOr requires unsigned long long + using atomic_word_type = typename cuda::std:: + conditional_t, unsigned long long, word_type>; + + // Implementation-tuning knobs. Not part of the public API; reached via + // `bloom_filter_impl::tuning::use_*` from internal kernels. Defaults reflect the ablation + // measurements from arXiv:2512.15595; flip in source for tuning experiments. 
+ struct tuning { + static constexpr bool use_invoke_one = true; + static constexpr bool use_early_exit = false; + static constexpr bool use_cub_kernels = true; + static constexpr bool use_warp_cooperative_add_kernel = true; + static constexpr bool use_warp_cooperative_contains_kernel = true; + static constexpr bool use_work_stealing_add_kernel = false; + static constexpr bool use_work_stealing_contains_kernel = false; + static constexpr bool use_cuda_atomic_ref = false; + }; static constexpr auto thread_scope = Scope; static constexpr auto words_per_block = policy_type::words_per_block; - __host__ __device__ static constexpr size_t max_vec_bytes() noexcept - { - constexpr auto word_bytes = sizeof(word_type); - constexpr auto block_bytes = word_bytes * words_per_block; - return cuda::std::min(cuda::std::max(word_bytes, 32ul), - block_bytes); // aiming for 2xLDG128 -> 1 sector per thread - } + static constexpr auto add_vertical_layout = policy_type::add_vertical_layout; + static constexpr auto add_horizontal_layout = policy_type::add_horizontal_layout; + static constexpr auto contains_vertical_layout = policy_type::contains_vertical_layout; + static constexpr auto contains_horizontal_layout = policy_type::contains_horizontal_layout; + static constexpr auto add_loop_count = + words_per_block / (add_vertical_layout * add_horizontal_layout); + static constexpr auto contains_loop_count = + words_per_block / (contains_vertical_layout * contains_horizontal_layout); - struct alignas(max_vec_bytes()) filter_block_type { - private: - word_type data_[words_per_block]; - }; + static constexpr bool is_cache_sectorized = policy_type::is_cache_sectorized; + static_assert((not tuning::use_cuda_atomic_ref) or + (Scope == cuda::thread_scope::thread_scope_device), + "atomicOr requires device scope"); static_assert(cuda::std::has_single_bit(words_per_block) and words_per_block <= 32, "Number of words per block must be a power-of-two and less than or equal to 32"); - static_assert( 
cuda::std::is_constructible_v, word_type&> && cuda::std::is_invocable_r_v, "Invalid word type"); + __host__ __device__ static constexpr size_t alignment() noexcept + { + // Maximum alignment is 32 bytes which is equivalent to one sector + return cuda::std::min( + static_cast(32), + static_cast(cuda::std::max(add_vertical_layout, contains_vertical_layout) * + sizeof(word_type))); + } + + struct filter_block_type { + private: + alignas(alignment()) word_type data_[words_per_block]; + }; + __host__ __device__ explicit constexpr bloom_filter_impl(filter_block_type* filter, Extent num_blocks, cuda_thread_scope, @@ -298,7 +131,9 @@ class bloom_filter_impl { template __device__ constexpr void clear(CG group) { - for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) { + // TODO optimize this + for (int i = group.thread_rank(); i < static_cast(num_blocks_) * words_per_block; + i += group.size()) { words_[i] = 0; } } @@ -313,195 +148,286 @@ class bloom_filter_impl { #endif } - __host__ constexpr void clear_async(cuda::stream_ref stream) + __host__ constexpr void clear_async(cuda::stream_ref stream) noexcept { - CUCO_CUDA_TRY(cub::DeviceFor::ForEachN( + cub::DeviceFor::ForEachN( words_, - num_blocks_ * words_per_block, + static_cast(num_blocks_) * words_per_block, [] __device__(word_type & word) { word = 0; }, + stream.get()); + } + + __host__ constexpr void merge(bloom_filter_impl const& other, + cuda::stream_ref stream) + { + this->merge_async(other, stream); +#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) + stream.sync(); +#else + stream.wait(); +#endif + } + + __host__ constexpr void merge_async(bloom_filter_impl const& other, + cuda::stream_ref stream) + { + CUCO_EXPECTS(this->block_extent() == other.block_extent(), + "mismatching num_blocks in merge_async"); + CUCO_CUDA_TRY(cub::DeviceTransform::Transform( + cuda::std::tuple{this->data(), other.data()}, + this->data(), + this->block_extent() * 
words_per_block, + [] __device__(word_type a, word_type b) { return a | b; }, stream.get())); } - template - __device__ void add(ProbeKey const& key) + __host__ constexpr void intersect(bloom_filter_impl const& other, + cuda::stream_ref stream) { - auto const hash_value = policy_.hash(key); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + this->intersect_async(other, stream); +#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) + stream.sync(); +#else + stream.wait(); +#endif } - template - __device__ void add(InputIt first, InputIt last) + __host__ constexpr void intersect_async( + bloom_filter_impl const& other, cuda::stream_ref stream) { - auto const num_keys = cuco::detail::distance(first, last); - for (decltype(num_keys) i = 0; i < num_keys; ++i) { - auto const hash_value = policy_.hash(*(first + i)); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); - } + CUCO_EXPECTS(this->block_extent() == other.block_extent(), + "mismatching num_blocks in intersect_async"); + CUCO_CUDA_TRY(cub::DeviceTransform::Transform( + cuda::std::tuple{this->data(), other.data()}, + this->data(), + this->block_extent() * words_per_block, + [] __device__(word_type a, word_type b) { return a & b; }, + stream.get())); } - template - __device__ void add_impl(HashValue const& hash_value, BlockIndex block_index) + [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; } + + [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept { - add_impl_functor functor{ - hash_value, block_index, policy_, words_, words_per_block}; - cuda::static_for(functor); + return words_; } - template - __device__ void add(CG group, ProbeKey const& key) + [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept { - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = add_optimal_cg_size(); - constexpr auto 
worker_num_threads = - (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; + return num_blocks_; + } - // If single thread is optimal, use scalar add - if constexpr (worker_num_threads == 1) { - this->add(key); + // Single Thread Add. Layout-agnostic: when `add_horizontal_layout > 1`, runs the per-virtual- + // thread work serially in the calling thread. + template + __device__ void add(BuildKey build_key) + { + auto const [upper_hash, lower_hash] = policy_.split_hash(build_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + + if constexpr (add_horizontal_layout == 1) { + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + add_pattern_cs(block_index, lower_hash, group_hash); + } else { + add_pattern(block_index, lower_hash); + } } else { - auto const hash_value = policy_.hash(key); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll + for (uint32_t thread_index = 0; thread_index < add_horizontal_layout; ++thread_index) { + add_patterns_cs(block_index, lower_hash, group_hash, thread_index); + } + } else { +#pragma unroll + for (uint32_t thread_index = 0; thread_index < add_horizontal_layout; ++thread_index) { + add_patterns(block_index, lower_hash, thread_index); + } + } } } - template - __device__ void add(CG group, InputIt first, InputIt last) + // Multi Thread Add. Layout-flexible: when the CG size matches `add_horizontal_layout`, runs the + // optimal cooperative path with one shared hash evaluation; otherwise has one lane perform the + // layout-agnostic scalar insert and synchronizes the rest. 
+ template + __device__ void add(CG group, BuildKey build_key) { namespace cg = cooperative_groups; - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = add_optimal_cg_size(); - constexpr auto worker_num_threads = - (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; + if constexpr (add_horizontal_layout == 1 || tile_size_v != add_horizontal_layout) { + // Tile size doesn't match the layout. Pick one lane to do the scalar (layout-agnostic) + // insert; the rest wait at the implicit sync. + cg::invoke_one(group, [&] __device__() { this->template add(build_key); }); + return; + } - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + auto const [upper_hash, lower_hash, block_index] = [&] __device__ { + if constexpr (tuning::use_invoke_one) { + return cg::invoke_one_broadcast(group, [&] __device__() { + auto const sh = policy_.split_hash(build_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + }); + } else { + auto const sh = policy_.split_hash(build_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + } + }(); - auto const rank = group.thread_rank(); - - // If single thread is optimal, use scalar add - if constexpr (worker_num_threads == 1) { - for (auto i = rank; i < num_keys; i += num_threads) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + i)}; - this->add(insert_element); - } - } else if constexpr (num_threads == worker_num_threads) { // given CG is optimal CG - typename policy_type::hash_result_type hash_value; - size_type block_index; - - auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); - for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { - if (i + rank < num_keys) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + i + rank)}; - 
hash_value = policy_.hash(insert_element); - block_index = policy_.block_index(hash_value, num_blocks_); - } + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + add_patterns_cs( + block_index, lower_hash, group_hash, group.thread_rank()); + } else { + add_patterns(block_index, lower_hash, group.thread_rank()); + } + } - add_group_functor - functor{group, - hash_value, - block_index, - static_cast(i), - static_cast(num_keys), - static_cast(num_threads), - this}; - cuda::static_for(functor); - } - } else { // subdivide given CG into multiple optimal CGs - typename policy_type::hash_result_type hash_value; - size_type block_index; - - auto const worker_group = cg::tiled_partition(group); - auto const worker_offset = worker_num_threads * worker_group.meta_group_rank(); - - auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); - - for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { - if (i + rank < num_keys) { - typename cuda::std::iterator_traits::value_type const& key{*(first + i + rank)}; - hash_value = policy_.hash(key); - block_index = policy_.block_index(hash_value, num_blocks_); - } + // Warp-cooperative Add + template + __device__ void add_coop(CG group, BuildKey build_key) + { + constexpr auto num_threads = tile_size_v; - add_worker_group_functor - functor{worker_group, - hash_value, - block_index, - static_cast(i), - static_cast(worker_offset), - static_cast(num_keys), - static_cast(worker_num_threads), - this}; - cuda::static_for(functor); + auto const [upper_hash, lower_hash] = policy_.split_hash(build_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + add_patterns_cs(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, 
i), + group.thread_rank()); + } + } else { +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + add_patterns( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()); } } } - template - __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index) + template + __device__ void add_coop(CG group, BuildKey build_key, bool is_valid) { constexpr auto num_threads = tile_size_v; - auto const rank = group.thread_rank(); + // Compute the hash and block index only for lanes whose key is valid; invalid lanes' values + // are never used (the shfl loop below gates per-iteration work on `group.shfl(is_valid, i)`). + // Skipping the hash here saves work on sparse stencil-gated inserts. + uint32_t upper_hash = 0; + uint32_t lower_hash = 0; + size_type block_index = 0; + if (is_valid) { + auto const sh = policy_.split_hash(build_key); + upper_hash = sh.first; + lower_hash = sh.second; + block_index = policy_.block_index(upper_hash, num_blocks_); + } - if constexpr (num_threads == words_per_block) { - auto atom_word = cuda::atomic_ref{ - *(words_ + (block_index * words_per_block + rank))}; - atom_word.fetch_or(policy_.word_pattern(hash_value, rank), cuda::memory_order_relaxed); + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + add_patterns_cs(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, i), + group.thread_rank()); + } + } } else { - add_impl_group_functor functor{ - hash_value, block_index, words_, words_per_block, rank, num_threads, policy_}; - cuda::static_for(functor); +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + add_patterns( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()); + } + } } } + // Device-side range add 
(single-thread): loops over keys. template - __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream) + __device__ void add(InputIt first, InputIt last) { - this->add_async(first, last, stream); -#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) - stream.sync(); -#else - stream.wait(); -#endif + auto const num_keys = cuco::detail::distance(first, last); + for (decltype(num_keys) i = 0; i < num_keys; ++i) { + this->add(*(first + i)); + } + } + + // Device-side range add (cooperative). When the tile size matches `add_horizontal_layout`, + // each batch loads/hashes one key per lane in parallel, then the tile cooperatively processes + // them via `add_coop`. Otherwise the tile parallelizes across the key range with each lane + // scalar-inserting its own keys. + template + __device__ void add(CG group, InputIt first, InputIt last) + { + using key_type = typename cuda::std::iterator_traits::value_type; + auto const num_keys = cuco::detail::distance(first, last); + if constexpr (tile_size_v == add_horizontal_layout && add_horizontal_layout > 1) { + auto constexpr num_threads = static_cast(tile_size_v); + for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) { + auto const idx = batch + static_cast(group.thread_rank()); + auto const is_valid = idx < num_keys; + key_type const key = is_valid ? 
*(first + idx) : key_type{}; + this->template add_coop(group, key, is_valid); + } + } else { + auto const stride = static_cast(tile_size_v); + for (auto i = static_cast(group.thread_rank()); i < num_keys; + i += stride) { + this->add(*(first + i)); + } + } } + // Host-side Add Entry Points template - __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream) + __host__ void add_async(InputIt first, InputIt last, cuda::stream_ref stream) noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - if constexpr (words_per_block == 1) { - CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN( - first, - num_keys, - [*this] __device__(key_type const key) mutable { this->add(key); }, - stream.get())); + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(add_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_add_kernel + ? cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + auto const l2_cache_size = static_cast(cuco::detail::l2_cache_size()); + auto const filter_size = static_cast(static_cast(num_blocks_)) * + words_per_block * sizeof(word_type); + + if (2 * filter_size < l2_cache_size) { + if constexpr (tuning::use_work_stealing_add_kernel) { + detail::bloom_filter_ns::add_work_stealing_n + <<>>(first, num_keys, *this); + } else { + detail::bloom_filter_ns::add_n + <<>>(first, num_keys, *this); + } } else { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr block_size = cuco::detail::default_block_size(); - void const* kernel = reinterpret_cast( - detail::bloom_filter_ns::add); - auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel); - - detail::bloom_filter_ns::add - <<>>(first, num_keys, *this); + if constexpr (tuning::use_work_stealing_add_kernel) { + detail::bloom_filter_ns::add_work_stealing_n + <<>>(first, 
num_keys, *this); + } else { + detail::bloom_filter_ns::add_n + <<>>(first, num_keys, *this); + } } } - template - __host__ constexpr void add_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) + template + __host__ void add(InputIt first, InputIt last, cuda::stream_ref stream) noexcept { - this->add_if_async(first, last, stencil, pred, stream); + this->add_async(first, last, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -509,87 +435,232 @@ class bloom_filter_impl { #endif } - template - __host__ constexpr void add_if_async(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - cuda::stream_ref stream) noexcept + // Single Thread Contains. Layout-agnostic: when `contains_horizontal_layout > 1`, runs the + // per-virtual-thread work serially in the calling thread. + template + __device__ bool contains(ProbeKey probe_key) const { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr cg_size = add_optimal_cg_size(); - auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); - - detail::bloom_filter_ns::add_if_n - <<>>(first, num_keys, stencil, pred, *this); + auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + + if constexpr (contains_horizontal_layout == 1) { + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + return compare_pattern_cs<0>(block_index, lower_hash, group_hash); + } else { + return compare_pattern<0>(block_index, lower_hash); + } + } else { + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + bool result = true; +#pragma unroll + for 
(uint32_t thread_index = 0; thread_index < contains_horizontal_layout; ++thread_index) { + result = + result && compare_patterns_cs<0>(block_index, lower_hash, group_hash, thread_index); + } + return result; + } else { + bool result = true; +#pragma unroll + for (uint32_t thread_index = 0; thread_index < contains_horizontal_layout; ++thread_index) { + result = result && compare_patterns<0>(block_index, lower_hash, thread_index); + } + return result; + } + } } - template - [[nodiscard]] __device__ bool contains(ProbeKey const& key) const + // Multi Thread Contains. Layout-flexible: when the CG size matches `contains_horizontal_layout`, + // runs the optimal cooperative path with one shared hash evaluation and an AND-reduction across + // the group; otherwise has one lane do the layout-agnostic scalar query and broadcasts the + // result so every lane returns the same value. + template + __device__ bool contains(CG group, ProbeKey probe_key) const { - auto const hash_value = policy_.hash(key); + namespace cg = cooperative_groups; - auto const stored_pattern = this->vec_load_words( - policy_.block_index(hash_value, num_blocks_) * words_per_block); + if constexpr (contains_horizontal_layout == 1 || + tile_size_v != contains_horizontal_layout) { + return cg::invoke_one_broadcast( + group, [&] __device__() -> bool { return this->contains(probe_key); }); + } - bool result = true; - contains_functor functor{ - hash_value, stored_pattern, policy_, &result}; - cuda::static_for(functor); - if (!result) { return false; } + auto const [upper_hash, lower_hash, block_index] = [&] __device__ { + if constexpr (tuning::use_invoke_one) { + return cg::invoke_one_broadcast(group, [&] __device__() { + auto const sh = policy_.split_hash(probe_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + }); + } else { + auto const sh = policy_.split_hash(probe_key); + return cuda::std::make_tuple( + sh.first, sh.second, 
policy_.block_index(sh.first, num_blocks_)); + } + }(); - return true; + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + return group.all( + compare_patterns_cs<0>(block_index, lower_hash, group_hash, group.thread_rank())); + } else { + return group.all(compare_patterns<0>(block_index, lower_hash, group.thread_rank())); + } } + // Warp-cooperative Contains template - [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const + __device__ bool contains_coop(CG group, ProbeKey probe_key) const { - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = contains_optimal_cg_size(); - constexpr auto words_per_thread = words_per_block / optimal_num_threads; + constexpr auto num_threads = tile_size_v; - // If single thread is optimal, use scalar contains - if constexpr (num_threads == 1 or optimal_num_threads == 1) { - return this->contains(key); + auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + bool result_out = false; + + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + auto const result = group.all(compare_patterns_cs<0>(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, i), + group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } + } } else { - auto const rank = group.thread_rank(); - auto const hash_value = policy_.hash(key); - bool success = true; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + auto const result = group.all(compare_patterns<0>( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } + } + } + return result_out; + } -// Use pragma unroll instead of cuda::static_for to 
avoid CUDA 12.0 compatibility issues -#pragma unroll - for (size_type i = 0; i < optimal_num_threads; ++i) { - if (i >= rank && (i - rank) % num_threads == 0) { - auto const thread_offset = i * words_per_thread; - auto const stored_pattern = this->vec_load_words( - policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset); + template + __device__ bool contains_coop(CG group, ProbeKey probe_key, bool is_valid) const + { + constexpr auto num_threads = tile_size_v; -#pragma unroll - for (size_type j = 0; j < words_per_thread; ++j) { - auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j); - if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; } - } + // Compute the hash and block index only for lanes whose key is valid; invalid lanes' values + // are never used (the shfl loop below gates per-iteration work on `group.shfl(is_valid, i)`). + // Skipping the hash here saves work on sparse stencil-gated queries. + uint32_t upper_hash = 0; + uint32_t lower_hash = 0; + size_type block_index = 0; + if (is_valid) { + auto const sh = policy_.split_hash(probe_key); + upper_hash = sh.first; + lower_hash = sh.second; + block_index = policy_.block_index(upper_hash, num_blocks_); + } + + bool result_out = false; + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + auto const result = group.all(compare_patterns_cs<0>(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, i), + group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } } } + } else { +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + auto const result = group.all(compare_patterns<0>( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank())); + if (i == 
group.thread_rank()) { result_out = result; } + } + } + } + return result_out; + } + + // Device-side range contains (single-thread): loops over keys. + template + __device__ void contains(InputIt first, InputIt last, OutputIt output_begin) const + { + auto const num_keys = cuco::detail::distance(first, last); + for (decltype(num_keys) i = 0; i < num_keys; ++i) { + *(output_begin + i) = this->contains(*(first + i)); + } + } - return group.all(success); + // Device-side range contains (cooperative). When the tile size matches + // `contains_horizontal_layout`, each batch loads/hashes one key per lane in parallel, then the + // tile cooperatively processes them via `contains_coop` so each lane gets the result for its + // own key. Otherwise the tile parallelizes across the key range with each lane scalar-querying + // its own keys. + template + __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const + { + using key_type = typename cuda::std::iterator_traits::value_type; + auto const num_keys = cuco::detail::distance(first, last); + if constexpr (tile_size_v == contains_horizontal_layout && contains_horizontal_layout > 1) { + auto constexpr num_threads = static_cast(tile_size_v); + for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) { + auto const idx = batch + static_cast(group.thread_rank()); + auto const is_valid = idx < num_keys; + key_type const key = is_valid ? 
*(first + idx) : key_type{}; + auto const result = this->contains_coop(group, key, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + auto const stride = static_cast(tile_size_v); + for (auto i = static_cast(group.thread_rank()); i < num_keys; + i += stride) { + *(output_begin + i) = this->contains(*(first + i)); + } } } - // TODO - // template - // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) - // const; + // Host-side Contains Entry Points + template + __host__ void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream) const noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + if constexpr (tuning::use_cub_kernels and ((words_per_block / contains_vertical_layout) == 1)) { + cub::DeviceTransform::Transform( + first, + output_begin, + num_keys, + [*this] __device__(auto const& key) { return this->contains(key); }, + stream.get()); + } else { + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(contains_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_contains_kernel + ? 
cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + + if constexpr (tuning::use_work_stealing_contains_kernel) { + detail::bloom_filter_ns::contains_work_stealing_n + <<>>(first, num_keys, output_begin, *this); + } else { + detail::bloom_filter_ns::contains_n + <<>>(first, num_keys, output_begin, *this); + } + } + } template __host__ void contains(InputIt first, InputIt last, OutputIt output_begin, - cuda::stream_ref stream) const + cuda::stream_ref stream) const noexcept { this->contains_async(first, last, output_begin, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) @@ -599,25 +670,43 @@ class bloom_filter_impl { #endif } - template - __host__ void contains_async(InputIt first, - InputIt last, - OutputIt output_begin, - cuda::stream_ref stream) const noexcept + // Host-side stencil-gated Add Entry Points + template + __host__ void add_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { - auto const always_true = cuda::constant_iterator{true}; - this->contains_if_async(first, last, always_true, cuda::std::identity{}, output_begin, stream); + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(add_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_add_kernel + ? 
cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + auto const l2_cache_size = static_cast(cuco::detail::l2_cache_size()); + auto const filter_size = static_cast(static_cast(num_blocks_)) * + words_per_block * sizeof(word_type); + + if (2 * filter_size < l2_cache_size) { + detail::bloom_filter_ns::add_if_n + <<>>(first, num_keys, stencil, pred, *this); + } else { + detail::bloom_filter_ns::add_if_n + <<>>(first, num_keys, stencil, pred, *this); + } } - template - __host__ void contains_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - OutputIt output_begin, - cuda::stream_ref stream) const + template + __host__ void add_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { - this->contains_if_async(first, last, stencil, pred, output_begin, stream); + this->add_if_async(first, last, stencil, pred, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -625,6 +714,7 @@ class bloom_filter_impl { #endif } + // Host-side stencil-gated Contains Entry Points template __host__ void contains_if_async(InputIt first, InputIt last, @@ -636,20 +726,26 @@ class bloom_filter_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto constexpr cg_size = contains_optimal_cg_size(); - auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(contains_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_contains_kernel + ? 
cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); detail::bloom_filter_ns::contains_if_n <<>>( first, num_keys, stencil, pred, output_begin, *this); } - __host__ constexpr void merge(bloom_filter_impl const& other, - cuda::stream_ref stream) + template + __host__ void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda::stream_ref stream) const noexcept { - this->merge_async(other, stream); + this->contains_if_async(first, last, stencil, pred, output_begin, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -657,81 +753,332 @@ class bloom_filter_impl { #endif } - __host__ constexpr void merge_async(bloom_filter_impl const& other, - cuda::stream_ref stream) + // TODO + // [[nodiscard]] __host__ double occupancy() const; + // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const + // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks) + // template + // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const + // memory_to_use, cuda_thread_scope scope = {}) const noexcept; + + // private: + template + __device__ constexpr cuda::std::array vec_load_words(size_type index) const { - CUCO_EXPECTS(this->block_extent() == other.block_extent(), - "mismatching num_blocks in merge_async"); - CUCO_CUDA_TRY(cub::DeviceTransform::Transform( - cuda::std::tuple{this->data(), other.data()}, - this->data(), - this->block_extent() * words_per_block, - [] __device__(word_type a, word_type b) { return a | b; }, - stream.get())); + // The block storage is aligned to `alignment()`, but a per-lane load at offset `index` is + // only guaranteed to be aligned to `min(NumWords * sizeof(word_type), alignment())`. Hand the + // compiler the alignment that's actually delivered, not the block-level maximum. 
+ constexpr auto load_alignment = + cuda::std::min(NumWords * sizeof(word_type), alignment()); + return *reinterpret_cast*>( + __builtin_assume_aligned(words_ + index, load_alignment)); } - __host__ constexpr void intersect(bloom_filter_impl const& other, - cuda::stream_ref stream) + //===--------------------------------------------------===// + // Parametric Filter Policy + //===--------------------------------------------------===// + /// Insert the given pattern into the filter + // Precondition: add_horizontal_layout == 1 + template + __device__ constexpr void add_pattern(uint32_t block_index, uint32_t lower_hash) { - this->intersect_async(other, stream); -#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) - stream.sync(); -#else - stream.wait(); -#endif + static_assert(add_horizontal_layout == 1, "add_pattern() requires add_horizontal_layout == 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern(lower_hash); + auto* word_base = words_ + block_index * words_per_block + LoopIndex * add_vertical_layout; + + for (int i = 0; i < add_vertical_layout; ++i) { + atomic_or(word_base + i, pattern[i]); + } + + // Recurse. 
+ add_pattern(block_index, lower_hash); + } } - __host__ constexpr void intersect_async( - bloom_filter_impl const& other, cuda::stream_ref stream) + // Precondition: add_horizontal_layout > 1 + template + __device__ constexpr void add_patterns(uint32_t block_index, + uint32_t lower_hash, + uint32_t thread_index) { - CUCO_EXPECTS(this->block_extent() == other.block_extent(), - "mismatching num_blocks in intersect_async"); - CUCO_CUDA_TRY(cub::DeviceTransform::Transform( - cuda::std::tuple{this->data(), other.data()}, - this->data(), - this->block_extent() * words_per_block, - [] __device__(word_type a, word_type b) { return a & b; }, - stream.get())); + static_assert(add_horizontal_layout > 1, "add_patterns() requires add_horizontal_layout > 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern( + lower_hash, thread_index); + auto* word_base = words_ + block_index * words_per_block + + LoopIndex * add_vertical_layout * add_horizontal_layout + + thread_index * add_vertical_layout; + + for (int i = 0; i < add_vertical_layout; ++i) { + atomic_or(word_base + i, pattern[i]); + } + + // Recurse. 
+ add_patterns(block_index, lower_hash, thread_index); + } } - [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; } + //===----------Cache-Sectorized Add----------===// + template + __device__ constexpr void add_pattern_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash) + { + auto constexpr add_groups_per_vertical_layout = policy_type::add_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(add_horizontal_layout == 1, "add_pattern() requires add_horizontal_layout == 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern(lower_hash); + auto* word_base = words_ + block_index * words_per_block + LoopIndex * add_vertical_layout; + + for (int i = 0; i < add_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> (i + LoopIndex * add_groups_per_vertical_layout) * group_index_width) & + group_index_mask; + atomic_or(word_base + i * words_per_group + group_index, pattern[i]); + } - [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept + // Recurse. 
+ add_pattern_cs(block_index, lower_hash, group_hash); + } + } + + template + __device__ constexpr void add_patterns_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash, + uint32_t thread_index) { - return words_; + auto constexpr add_groups_per_vertical_layout = policy_type::add_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(add_horizontal_layout > 1, "add_patterns() requires add_horizontal_layout > 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern( + lower_hash, thread_index); + auto* word_base = words_ + block_index * words_per_block + + LoopIndex * add_vertical_layout * add_horizontal_layout + + thread_index * add_vertical_layout; + + for (int i = 0; i < add_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> (i + LoopIndex * add_groups_per_vertical_layout * add_horizontal_layout + + thread_index * add_groups_per_vertical_layout) * + group_index_width) & + group_index_mask; + atomic_or(word_base + i * words_per_group + group_index, pattern[i]); + } + + // Recurse. 
+ add_patterns_cs( + block_index, lower_hash, group_hash, thread_index); + } } - [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept + template + __device__ constexpr void atomic_or(word_type* word_ptr, word_type pattern) const { - return num_blocks_; + if constexpr (tuning::use_cuda_atomic_ref) { + if constexpr (ConditionalAtomic) { + if ((*word_ptr & pattern) != pattern) { + auto atom_word = cuda::atomic_ref{*word_ptr}; + atom_word.fetch_or(pattern, cuda::memory_order_relaxed); + } + } else { + auto atom_word = cuda::atomic_ref{*word_ptr}; + atom_word.fetch_or(pattern, cuda::memory_order_relaxed); + } + } else { + if constexpr (ConditionalAtomic) { + if ((*word_ptr & pattern) != pattern) { + atomicOr(reinterpret_cast(word_ptr), + static_cast(pattern)); + } + } else { + atomicOr(reinterpret_cast(word_ptr), + static_cast(pattern)); + } + } } - // TODO - // [[nodiscard]] __host__ double occupancy() const; - // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const - // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks) - // template - // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const - // memory_to_use, cuda_thread_scope scope = {}) const noexcept; + /// Compare the stored pattern against the expected pattern for the given hash value. 
+ // Precondition: contains_horizontal_layout == 1 + template + __device__ constexpr bool compare_pattern(uint32_t block_index, uint32_t lower_hash) const + { + static_assert(contains_horizontal_layout == 1, + "compare_pattern() requires contains_horizontal_layout == 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const stored_pattern = this->vec_load_words( + block_index * words_per_block + LoopIndex * contains_vertical_layout); + auto const expected_pattern = + policy_.template array_pattern(lower_hash); + + bool match = true; + for (int i = 0; i < contains_vertical_layout; ++i) { + match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i]; + } - private: - template - __device__ constexpr cuda::std::array vec_load_words(size_type index) const + // Recurse. + // Early exit in this implementation occurs at the granulairy of contains_vertical_layout + // words. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_pattern(block_index, lower_hash); + } else { + return compare_pattern(block_index, lower_hash) && match; + } + } else { + return true; + } + } + + // Precondition: contains_horizontal_layout > 1. + // Returns the per-thread AND across the loop-iteration slice owned by `thread_index`. Callers + // that need a per-CG result must reduce across the group (e.g. `group.all(...)`). 
+ template + __device__ constexpr bool compare_patterns(uint32_t block_index, + uint32_t lower_hash, + uint32_t thread_index) const { - return *reinterpret_cast*>(__builtin_assume_aligned( - words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes()))); + static_assert(contains_horizontal_layout > 1, + "compare_patterns() requires HorizontalLayout > 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const stored_pattern = this->vec_load_words( + block_index * words_per_block + + LoopIndex * contains_vertical_layout * contains_horizontal_layout + + thread_index * contains_vertical_layout); + auto const expected_pattern = + policy_ + .template array_pattern( + lower_hash, thread_index); + + bool match = true; + for (int i = 0; i < contains_vertical_layout; ++i) { + match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i]; + } + + // Per-thread early exit: short-circuit this thread's recursion if its slice already missed. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_patterns(block_index, lower_hash, thread_index); + } else { + return compare_patterns(block_index, lower_hash, thread_index) && match; + } + } else { + return true; + } } - [[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size() + //===----------Cache-Sectorized Compare----------===// + template + __device__ constexpr bool compare_pattern_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash) const { - return words_per_block; // one thread per word so atomic updates can be coalesced + auto constexpr contains_groups_per_vertical_layout = + policy_type::contains_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(contains_horizontal_layout == 1, + "compare_pattern() requires 
contains_horizontal_layout == 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const* word_base = + words_ + block_index * words_per_block + LoopIndex * contains_vertical_layout; + auto const expected_pattern = + policy_.template array_pattern(lower_hash); + + bool match = true; + for (int i = 0; i < contains_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> + (i + LoopIndex * contains_groups_per_vertical_layout) * group_index_width) & + group_index_mask; + match &= (word_base[i * words_per_group + group_index] & expected_pattern[i]) == + expected_pattern[i]; + } + + // Recurse. + // Early exit in this implementation occurs at the granulairy of contains_vertical_layout + // words. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_pattern_cs(block_index, lower_hash, group_hash); + } else { + return compare_pattern_cs(block_index, lower_hash, group_hash) && match; + } + } else { + return true; + } } - [[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size() + template + __device__ constexpr bool compare_patterns_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash, + uint32_t thread_index) const { - constexpr auto word_bytes = sizeof(word_type); - constexpr auto block_bytes = word_bytes * words_per_block; - return block_bytes / max_vec_bytes(); // one vector load per thread + auto constexpr contains_groups_per_vertical_layout = + policy_type::contains_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(contains_horizontal_layout > 1, + "compare_patterns_cs() requires HorizontalLayout > 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const* word_base = words_ + block_index * words_per_block + + LoopIndex * contains_vertical_layout 
* contains_horizontal_layout + + thread_index * contains_vertical_layout; + auto const expected_pattern = + policy_ + .template array_pattern( + lower_hash, thread_index); + + bool match = true; + for (int i = 0; i < contains_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> + (i + LoopIndex * contains_groups_per_vertical_layout * contains_horizontal_layout + + thread_index * contains_groups_per_vertical_layout) * + group_index_width) & + group_index_mask; + match &= (word_base[i * words_per_group + group_index] & expected_pattern[i]) == + expected_pattern[i]; + } + + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_patterns_cs( + block_index, lower_hash, group_hash, thread_index); + } else { + return compare_patterns_cs( + block_index, lower_hash, group_hash, thread_index) && + match; + } + } else { + return true; + } } word_type* words_; diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl index b251dceee..e6f7d9d89 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -52,7 +52,7 @@ __host__ constexpr void bloom_filter_ref::clear(cuda template __host__ constexpr void bloom_filter_ref::clear_async( - cuda::stream_ref stream) + cuda::stream_ref stream) noexcept { impl_.clear_async(stream); } @@ -64,6 +64,13 @@ __device__ void bloom_filter_ref::add(ProbeKey const impl_.add(key); } +template +template +__device__ void bloom_filter_ref::add(InputIt first, InputIt last) +{ + impl_.add(first, last); +} + template template __device__ void bloom_filter_ref::add(CG group, ProbeKey const& key) @@ -92,7 +99,7 @@ __host__ constexpr void bloom_filter_ref::add(InputI template template __host__ constexpr void bloom_filter_ref::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { impl_.add_async(first, last, stream); } @@ -121,6 +128,15 @@ template return impl_.contains(key); } +template +template +__device__ void bloom_filter_ref::contains(InputIt first, + InputIt last, + OutputIt output_begin) const +{ + impl_.contains(first, last, output_begin); +} + template template [[nodiscard]] __device__ bool bloom_filter_ref::contains( @@ -129,6 +145,16 @@ template return impl_.contains(group, key); } +template +template +__device__ void bloom_filter_ref::contains(CG group, + InputIt first, + InputIt last, + OutputIt output_begin) const +{ + impl_.contains(group, first, last, output_begin); +} + template template __host__ constexpr void bloom_filter_ref::contains( diff --git a/include/cuco/detail/bloom_filter/default_filter_policy.inl b/include/cuco/detail/bloom_filter/default_filter_policy.inl deleted file mode 100644 index eb8dbf703..000000000 --- a/include/cuco/detail/bloom_filter/default_filter_policy.inl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cuco { - -template -__host__ - __device__ constexpr default_filter_policy::default_filter_policy( - uint32_t pattern_bits, Hash hash) - : impl_{pattern_bits, hash} -{ -} - -template -__device__ constexpr typename default_filter_policy::hash_result_type -default_filter_policy::hash( - typename default_filter_policy::hash_argument_type const& key) const -{ - return impl_.hash(key); -} - -template -template -__device__ constexpr auto default_filter_policy::block_index( - typename default_filter_policy::hash_result_type hash, - Extent num_blocks) const -{ - return impl_.block_index(hash, num_blocks); -} - -template -__device__ constexpr typename default_filter_policy::word_type -default_filter_policy::word_pattern( - default_filter_policy::hash_result_type hash, - std::uint32_t word_index) const -{ - return impl_.word_pattern(hash, word_index); -} - -} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh deleted file mode 100644 index 14509b9b0..000000000 --- a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -#include -#include - -namespace cuco::detail { - -template -class default_filter_policy_impl { - public: - using hasher = Hash; - using word_type = Word; - using hash_argument_type = typename hasher::argument_type; - using hash_result_type = decltype(std::declval()(std::declval())); - - static constexpr std::uint32_t words_per_block = WordsPerBlock; - - private: - static constexpr std::uint32_t word_bits = cuda::std::numeric_limits::digits; - static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); - - public: - __host__ __device__ explicit constexpr default_filter_policy_impl(uint32_t pattern_bits, - Hash hash) - : pattern_bits_{pattern_bits}, - min_bits_per_word_{pattern_bits_ / words_per_block}, - remainder_bits_{pattern_bits_ % words_per_block}, - hash_{hash} - { - NV_DISPATCH_TARGET( - NV_IS_HOST, - ( // This ensures each word in the block has at least one bit set; otherwise we would never - // use some of the words - constexpr uint32_t min_pattern_bits = words_per_block; - - // The maximum number of bits to be set for a key is capped by the total number of bits in - // the filter block - constexpr uint32_t max_pattern_bits = word_bits * words_per_block; - - constexpr uint32_t hash_bits = cuda::std::numeric_limits::digits; - constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width; - CUCO_EXPECTS( - pattern_bits <= max_pattern_bits_from_hash, - "`hash_result_type` too narrow to generate the requested number of 
`pattern_bits`"); - CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits, - "`pattern_bits` must be at least `words_per_block`"); - CUCO_EXPECTS(pattern_bits_ <= max_pattern_bits, - "`pattern_bits` must be less than the total number of bits in a filter " - "block");)) - // TODO find a proper way to perform input checks/assertions on device without destroying the - // context (e.g. __trap()) - } - - __device__ constexpr hash_result_type hash(hash_argument_type const& key) const - { - return hash_(key); - } - - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const - { - return hash % num_blocks; - } - - __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const - { - word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1; - - auto const bits_so_far = min_bits_per_word_ * word_index + - (word_index < remainder_bits_ ? word_index : remainder_bits_); - - hash >>= bits_so_far * bit_index_width; - - word_type word = 0; - int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0); - - for (int32_t bit = 0; bit < bits_per_word; ++bit) { - word |= word_type{1} << (hash & bit_index_mask); - hash >>= bit_index_width; - } - - return word; - } - - private: - uint32_t pattern_bits_; - uint32_t min_bits_per_word_; - uint32_t remainder_bits_; - hasher hash_; -}; - -} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index 6c918fb7d..55b1df7fe 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,39 +18,309 @@ #include #include +#include #include #include #include +#include namespace cuco::detail::bloom_filter_ns { CUCO_SUPPRESS_KERNEL_WARNINGS +template +__device__ void add_n_impl(InputIt first, cuco::detail::index_type n, Ref ref) +{ + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + ref.add_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type const& key = is_valid ? *(first + idx) : key_type{}; + ref.add_coop(group, key, is_valid); + } + } else { + auto const idx = cuco::detail::global_thread_id() / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(key); + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(group, key); + } + } + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void add_n(InputIt first, + cuco::detail::index_type n, + Ref ref) +{ + add_n_impl(first, n, ref); +} -template -CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first, - cuco::detail::index_type n, - Ref ref) +template +__device__ void add_work_stealing_n_impl(InputIt first, cuco::detail::index_type n, Ref ref) { - namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + namespace cg = cooperative_groups; + namespace ptx = cuda::ptx; + + // Cluster launch control initialization: + __shared__ uint4 result; + __shared__ uint64_t bar; + int phase = 0; - constexpr auto tile_size = cuco::detail::warp_size(); + auto const block 
= cg::this_thread_block(); - auto const tile_idx = cuco::detail::global_thread_id() / tile_size; - auto const n_tiles = gridDim.x * BlockSize / tile_size; - auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles); + cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); }); - auto const tile_start = tile_idx * items_per_tile; - if (tile_start >= n) { return; } - auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n; + int bx = blockIdx.x; - auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Work-stealing loop: + while (true) { + // Protect result from overwrite in the next iteration, + // (also ensure barrier initialization at 1st iteration): + block.sync(); + + cg::invoke_one(block, [&]() { + // Acquire write of result in the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster); + + cg::invoke_one(cg::coalesced_threads(), + [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); }); + ptx::mbarrier_arrive_expect_tx( + ptx::sem_relaxed, ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4)); + }); + + // Computation: + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x; + auto group = cg::tiled_partition(block); + auto const is_full_tile = (bx + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + ref.add_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type const& key = is_valid ? 
*(first + idx) : key_type{}; + ref.add_coop(group, key, is_valid); + } + } else { + cuco::detail::index_type const idx = + (static_cast(BlockSize) * bx + threadIdx.x) / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(key); + } + } else { + auto group = cg::tiled_partition(block); + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(group, key); + } + } + } - ref.add(tile, first + tile_start, first + tile_stop); + // Cancellation request synchronization: + while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {} + phase ^= 1; + + // Cancellation request decoding: + bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result); + if (!success) break; + + bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x(result); + + // Release read of result to the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_release, ptx::space_shared, ptx::scope_cluster); + } } -template +CUCO_KERNEL __launch_bounds__(BlockSize) void add_work_stealing_n(InputIt first, + cuco::detail::index_type n, + Ref ref) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_100, + (add_work_stealing_n_impl(first, n, ref);), + (add_n_impl(first, n, ref);)) +} + +template +__device__ void contains_n_impl(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type 
const& key = is_valid ? *(first + idx) : key_type{}; + auto const result = ref.contains_coop(group, key, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + auto idx = cuco::detail::global_thread_id() / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). + auto const found = ref.contains(group, key); + if (group.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void contains_n(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + contains_n_impl(first, n, output_begin, ref); +} + +template +__device__ void contains_work_stealing_n_impl(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + using key_type = typename cuda::std::iterator_traits::value_type; + + namespace cg = cooperative_groups; + namespace ptx = cuda::ptx; + + // Cluster launch control initialization: + __shared__ uint4 result; + __shared__ uint64_t bar; + int phase = 0; + + auto const block = cg::this_thread_block(); + + cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); }); + + int bx = blockIdx.x; + + // Work-stealing loop: + while (true) { + // Protect result from overwrite in the next iteration, + // (also ensure barrier initialization at 1st iteration): + block.sync(); + + cg::invoke_one(block, [&]() { + // Acquire write of result in the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster); + + cg::invoke_one(cg::coalesced_threads(), + [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); }); + ptx::mbarrier_arrive_expect_tx( + ptx::sem_relaxed, 
ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4)); + }); + + // Computation: + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x; + auto group = cg::tiled_partition(block); + auto const is_full_tile = (bx + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type const& key = is_valid ? *(first + idx) : key_type{}; + auto const result = ref.contains_coop(group, key, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + cuco::detail::index_type const idx = + (static_cast(BlockSize) * bx + threadIdx.x) / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } + } else { + auto group = cg::tiled_partition(block); + if (idx < n) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). 
+ auto const found = ref.contains(group, key); + if (group.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + } + + // Cancellation request synchronization: + while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {} + phase ^= 1; + + // Cancellation request decoding: + bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result); + if (!success) break; + + bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x(result); + + // Release read of result to the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_release, ptx::space_shared, ptx::scope_cluster); + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void contains_work_stealing_n(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_100, + (contains_work_stealing_n_impl(first, n, output_begin, ref);), + (contains_n_impl(first, n, output_begin, ref);)) +} + +template ::value_type; - [[maybe_unused]] auto const tile = - cg::tiled_partition(cg::this_thread_block()); - - while (idx < n) { - if (pred(*(stencil + idx))) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + idx)}; - ref.add(tile, insert_element); + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const in_range = idx < n; + bool is_valid = false; + key_type key{}; + if (in_range) { + key = *(first + idx); + is_valid = pred(*(stencil + idx)); + } + ref.template add_coop(group, key, is_valid); + } else { + auto const idx = cuco::detail::global_thread_id() / CGSize; + if (idx < n && pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + if constexpr (CGSize == 1) { + ref.template add(key); + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + ref.template add(group, key); + } } - idx 
+= loop_stride; } } @@ -88,30 +368,44 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, - OutputIt out, + OutputIt output_begin, Ref ref) { - namespace cg = cooperative_groups; - - auto const loop_stride = cuco::detail::grid_stride() / CGSize; - auto idx = cuco::detail::global_thread_id() / CGSize; - - [[maybe_unused]] auto const tile = - cg::tiled_partition(cg::this_thread_block()); + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; - if constexpr (CGSize == 1) { - while (idx < n) { - typename cuda::std::iterator_traits::value_type const& key = *(first + idx); - *(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false; - idx += loop_stride; + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const in_range = idx < n; + bool is_valid = false; + key_type key{}; + if (in_range) { + key = *(first + idx); + is_valid = pred(*(stencil + idx)); } + auto const result = ref.contains_coop(group, key, is_valid); + if (in_range) { *(output_begin + idx) = is_valid ? result : false; } } else { - auto const tile = cg::tiled_partition(cg::this_thread_block()); - while (idx < n) { - typename cuda::std::iterator_traits::value_type const& key = *(first + idx); - auto const found = pred(*(stencil + idx)) ? 
ref.contains(tile, key) : false; - if (tile.thread_rank() == 0) { *(out + idx) = found; } - idx += loop_stride; + auto const idx = cuco::detail::global_thread_id() / CGSize; + if (idx < n) { + if constexpr (CGSize == 1) { + if (pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } else { + *(output_begin + idx) = false; + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + bool result = false; + if (pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). + result = ref.contains(group, key); + } + if (group.thread_rank() == 0) { *(output_begin + idx) = result; } + } } } } diff --git a/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh new file mode 100644 index 000000000..341850c6b --- /dev/null +++ b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2025-2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco::detail { + +/** + * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation. 
+ * + * Implements the Sectorized Bloom Filter (SBF) and Cache-Sectorized Bloom Filter (CSBF) variants + * from "Optimizing Bloom Filters for Modern GPU Architectures" (arXiv:2512.15595). Distributes + * `PatternBits` set bits across `WordsPerBlock` words using compile-time salt-based multiplicative + * hashing. The hash result is split into upper 32 bits (block selection via multiply-shift) and + * lower 32 bits (pattern generation), so a 64-bit hash function is required by design. + * + * @tparam Hash 64-bit hash functor whose return type satisfies `is_same_v`. + * @tparam Word Underlying word type of a filter block. Must be an atomically updatable integral. + * @tparam WordsPerBlock Words per filter block. Must be a power of two and <= 32. + * @tparam PatternBits Number of fingerprint bits (k in the paper). + * @tparam AddHorizontalLayout CG size used for `add` (paper's Theta). Must be a power of two and + * `AddHorizontalLayout * AddVerticalLayout <= WordsPerBlock`. + * @tparam AddVerticalLayout Contiguous words processed per thread per `add` step (paper's Phi). + * @tparam ContainsHorizontalLayout CG size used for `contains` (paper's Theta). + * @tparam ContainsVerticalLayout Contiguous words processed per thread per `contains` step (paper's + * Phi). + * @tparam GroupsPerBlock Cache-sectorization groups per block (paper's z). Defaults to + * `WordsPerBlock` for non-CSBF mode; setting `GroupsPerBlock < WordsPerBlock` enables CSBF. 
+ */ +template +class parametric_filter_policy { + public: + using hasher = Hash; ///< 64-bit hash functor type + using word_type = Word; ///< Underlying filter-block word type + using hash_argument_type = typename hasher::argument_type; ///< Hash function input type + using hash_result_type = + decltype(std::declval()(std::declval())); ///< Hash function + ///< output type + + private: + static constexpr uint32_t max_salts = 64; + static constexpr cuda::std::array salts = { + 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0x705495c7U, 0x2df1424bU, 0x9efc4947U, + 0x5c6bfb31U, 0xb24bcdffU, 0xb6843d6dU, 0x6db04543U, 0x3a12efddU, 0xb0ddd463U, 0x8d22f6e7U, + 0xb82f1e53U, 0x7db9f86bU, 0xc7afe639U, 0xfb135cd7U, 0x693256e1U, 0x9466d871U, 0x23d3d02fU, + 0x6461d049U, 0x66a91621U, 0xbaa3006fU, 0x52fb8d99U, 0x3ea88b4fU, 0xf470cfdU, 0xb1db79a5U, + 0x9809fcd1U, 0xbced4445U, 0x2eb7c737U, 0x2cea6803U, 0x156f1955U, 0x8813c027U, 0xa26819f9U, + 0x4c3b57bdU, 0x7df94487U, 0xb975e769U, 0xb8f20cb5U, 0x5c9e2e77U, 0x5fb1735fU, 0x3a6f759bU, + 0x3c090923U, 0xfced424dU, 0xa187a6a9U, 0x6f070a41U, 0x2c85233bU, 0x7e62258bU, 0x2771ef17U, + 0x13bbf093U, 0x4ff059e5U, 0xe3ce3d0fU, 0xf1b4789fU, 0x9fbb6173U, 0x6a320cf5U, 0x1be2c481U, + 0x7ba8222bU, 0x6fd619b3U, 0x7b1bbf0dU, 0x8b8993adU, 0x448eca95U, 0x82ab09d9U, 0x2ce53909U, + 0x4f548685U}; + static constexpr uint32_t word_bits = cuda::std::numeric_limits::digits; + + public: + static constexpr uint32_t words_per_block = WordsPerBlock; ///< Number of words per filter block + static constexpr uint32_t pattern_bits = PatternBits; ///< Fingerprint bits per key + + static constexpr uint32_t add_horizontal_layout = + AddHorizontalLayout; ///< horizontal vectorization layout for add operation + static constexpr uint32_t add_vertical_layout = + AddVerticalLayout; ///< vertical vectorization layout for add operation + static constexpr uint32_t contains_horizontal_layout = + ContainsHorizontalLayout; ///< horizontal vectorization layout for contains 
operation + static constexpr uint32_t contains_vertical_layout = + ContainsVerticalLayout; ///< vertical vectorization layout for contains operation + + static constexpr size_t max_filter_blocks = + cuda::std::numeric_limits::max(); ///< Upper bound on the number of filter blocks + /// Lower bound on `pattern_bits`: at least one bit per word so every word contributes. + static constexpr auto min_pattern_bits = words_per_block; + /// Upper bound on `pattern_bits`: the total number of bits in a filter block, capped by the + /// number of available salts. + static constexpr auto max_pattern_bits = cuda::std::min(word_bits * words_per_block, max_salts); + + //===----------Cache-Sectorized----------===// + static constexpr uint32_t groups_per_block = + GroupsPerBlock; ///< Cache-sectorization groups per block (paper's z) + static constexpr bool is_cache_sectorized = + groups_per_block != words_per_block ? true : false; ///< CSBF mode flag + static constexpr uint32_t words_per_group = + words_per_block / groups_per_block; ///< Words per cache-sectorization group + // TODO: when `pattern_bits % groups_per_block != 0`, using a ceil packs all remainder bits into + // the first `pattern_bits / max_bits_per_group` groups, leaving later groups with a zero + // expected pattern. This wastes block capacity and inflates FPR. Distribute floor bits to every + // group plus one extra bit to the first `pattern_bits % groups_per_block` groups, and update + // the salt-to-group mapping in `set_bits` accordingly. 
+ static constexpr uint32_t max_bits_per_group = cuco::detail::int_div_ceil( + pattern_bits, groups_per_block); ///< CSBF: max fingerprint bits set per group per key + static constexpr uint32_t add_groups_per_vertical_layout = + add_vertical_layout / words_per_group; ///< CSBF: groups touched per add vertical step + static constexpr uint32_t contains_groups_per_vertical_layout = + contains_vertical_layout / + words_per_group; ///< CSBF: groups touched per contains vertical step + static constexpr uint32_t group_index_salt = + 0x5bd1e995U; ///< CSBF: salt for selecting one word per group + static constexpr uint32_t group_index_width = cuda::std::bit_width( + words_per_group - 1); ///< CSBF: bits needed to encode an in-group word index + static constexpr uint32_t group_index_mask = + words_per_group - 1; ///< CSBF: mask for selecting an in-group word index + + private: + static constexpr uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); + // TODO: same problem as `max_bits_per_group`. For non-multiple + // `(pattern_bits, words_per_block)` configs (e.g. PatternBits=12, WordsPerBlock=8), the salt + // walk in `set_bits` advances `PatternArrayIndex` every `max_bits_per_word` salts, packing all + // bits into the first `ceil(pattern_bits / words_per_block)` words and leaving the rest at + // zero. This wastes block capacity and inflates FPR. Distribute floor bits to every word plus + // one extra bit to the first `pattern_bits % words_per_block` words, and update the + // salt-to-word mapping in `set_bits` accordingly. + static constexpr uint32_t max_bits_per_word = + cuco::detail::int_div_ceil(pattern_bits, words_per_block); + + public: + /** + * @brief Constructs a parametric filter policy. + * + * @param hash Hash function used to generate fingerprints. 
+ */ + __host__ __device__ constexpr parametric_filter_policy(Hash hash = {}) : hash_{hash} + { + static_assert(pattern_bits >= min_pattern_bits, + "pattern_bits must be at least words_per_block"); + static_assert(pattern_bits <= max_pattern_bits, + "pattern_bits must be less than the total number of bits in a filter block"); + // Require exact tiling. With `words_per_block` a power of two, this is equivalent to requiring + // both `add_horizontal_layout` and `add_vertical_layout` to be powers of two with product + // <= `words_per_block`. The internal loop count uses integer division on the product; non- + // dividing layouts would leave trailing words uninserted on add while contains still expects + // non-zero patterns there, producing false negatives for every inserted key. + static_assert(words_per_block % (add_horizontal_layout * add_vertical_layout) == 0, + "add_horizontal_layout * add_vertical_layout must evenly divide words_per_block"); + static_assert( + words_per_block % (contains_horizontal_layout * contains_vertical_layout) == 0, + "contains_horizontal_layout * contains_vertical_layout must evenly divide words_per_block"); + // The split_hash() design requires a 64-bit hash split into upper 32 bits (block selection + // via multiply-shift) and lower 32 bits (pattern generation via salt-based multiplicative + // hashing). This is a permanent design requirement, not a temporary limitation. + static_assert(cuda::std::is_same_v, + "parametric_filter_policy requires a 64-bit hash function"); + + //===----------Cache-Sectorized----------===// + static_assert( + is_cache_sectorized == false || (groups_per_block > 0 && groups_per_block < words_per_block && + (words_per_block % groups_per_block == 0)), + "in cache-sectorized filter, the number of groups must be positive, be fewer " + "than words_per_block, and evenly divide words_per_block"); + // Require the vertical layout to be a multiple of `words_per_group`. 
Floor-dividing + // `add_vertical_layout / words_per_group` to derive `add_groups_per_vertical_layout` would + // otherwise drop group-coverage for the trailing partial group, producing the same false- + // negative bug as the non-tiling check above. + static_assert(is_cache_sectorized == false || (add_vertical_layout % words_per_group == 0), + "in cache-sectorized filter, add_vertical_layout must be a multiple of " + "words_per_group"); + static_assert(is_cache_sectorized == false || (contains_vertical_layout % words_per_group == 0), + "in cache-sectorized filter, contains_vertical_layout must be a multiple of " + "words_per_group"); + static_assert(is_cache_sectorized == false || groups_per_block * group_index_width <= 32, + "in cache-sectorized filter, the number of bits needed to index groups must fit " + "within 32 bits"); + } + + /** + * @brief Splits the 64-bit hash of a key into its upper and lower 32 bits. + * + * The upper half is used for block selection (via multiply-shift); the lower half drives the + * per-word fingerprint pattern via salt-based multiplicative hashing. + * + * @param key Key to hash. + * + * @return `{upper 32 bits, lower 32 bits}` of the 64-bit hash. + */ + __device__ constexpr cuda::std::pair split_hash(hash_argument_type key) const + { + auto const hash_value = hash_(key); + return {static_cast(hash_value >> 32), static_cast(hash_value)}; + } + + /** + * @brief Determines the filter block a key maps to via fast multiply-shift modulo. + * + * @tparam Extent Size type used to determine the number of blocks in the filter. + * + * @param upper_hash_value Upper 32 bits of the key's hash. + * @param num_blocks Number of blocks in the filter. + * + * @return Block index in `[0, num_blocks)`. 
+ */ + template + __device__ constexpr auto block_index(uint32_t upper_hash_value, Extent num_blocks) const + { + return static_cast((static_cast(upper_hash_value) * + static_cast(num_blocks)) >> + 32); + } + + /** + * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is 1. + * + * @tparam LoopIndex Outer-loop iteration index when `words_per_block / VerticalLayout > 1`. + * @tparam VerticalLayout Number of contiguous words this call produces. + * + * @param lower_hash_value Lower 32 bits of the key's hash. + * + * @return Array of `VerticalLayout` (or `groups_per_vertical_layout` in CSBF mode) words. + */ + template + __device__ constexpr auto array_pattern(uint32_t lower_hash_value) const + { + return pattern_impl(lower_hash_value); + } + + /** + * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is > 1. + * + * @tparam LoopIndex Outer-loop iteration index. + * @tparam HorizontalLayout Cooperative-group size cooperating on a single key. + * @tparam VerticalLayout Number of contiguous words this call produces. + * + * @param lower_hash_value Lower 32 bits of the key's hash. + * @param thread_index Caller's rank within the cooperative group. + * + * @return Array of `VerticalLayout` (or `groups_per_vertical_layout` in CSBF mode) words owned + * by the calling thread. + */ + template + __device__ constexpr auto array_pattern(uint32_t lower_hash_value, uint32_t thread_index) const + { + return pattern_impl(lower_hash_value, + thread_index); + } + + private: + hasher hash_; + + /** + * @brief pattern_impl - Computes the bit pattern for a vertical layout of words. + * I use the terminology of a `virtual thread` to refer to an ordering of the vertical layouts, + * namely + * virtual_thread_index = LoopIndex * HorizontalLayout + thread_index, + * where LoopIndex is the index of the outermost loop in the range: + * [0, words_per_block / (HorizontalLayout * VerticalLayout)). 
+   * @param hash Lower 32 bits of the key's hash.
+   * @return The bit pattern for the vertical layout
+   * defined by the LoopIndex.
+   */
+
+  // Precondition: _horizontal_layout == 1
+  template <uint32_t LoopIndex, uint32_t VerticalLayout>
+  __device__ constexpr auto pattern_impl(uint32_t hash) const
+  {
+    // For cache-sectorized, we set bits for one word for each group per vertical layout.
+    constexpr uint32_t groups_per_vertical_layout = VerticalLayout / words_per_group;
+    using pattern_array_t =
+      cuda::std::conditional_t<is_cache_sectorized,
+                               cuda::std::array<word_type, groups_per_vertical_layout>,
+                               cuda::std::array<word_type, VerticalLayout>>;
+
+    // Sanity check
+    constexpr uint32_t num_iterations = words_per_block / VerticalLayout;
+    static_assert(LoopIndex < num_iterations,
+                  "the loop index cannot exceed the number of loop iterations");
+
+    pattern_array_t pattern_array{0};
+    constexpr uint32_t salt_start_index =
+      is_cache_sectorized ? max_bits_per_group * groups_per_vertical_layout * LoopIndex
+                          : max_bits_per_word * VerticalLayout * LoopIndex;
+    constexpr uint32_t salt_end_index =
+      is_cache_sectorized
+        ? cuda::std::min(salt_start_index + max_bits_per_group * groups_per_vertical_layout,
+                         pattern_bits)
+        : cuda::std::min(salt_start_index + max_bits_per_word * VerticalLayout, pattern_bits);
+    constexpr uint32_t pattern_array_start_index = 0;
+    set_bits<salt_start_index, salt_end_index, pattern_array_start_index>(hash, pattern_array);
+    return pattern_array;
+  }
+
+  // Precondition: _horizontal_layout > 1
+  template <uint32_t LoopIndex, uint32_t HorizontalLayout, uint32_t VerticalLayout>
+  __device__ constexpr auto pattern_impl(uint32_t hash, uint32_t thread_index) const
+  {
+    // For cache-sectorized, we set bits for one word for each group per vertical layout.
+    constexpr uint32_t groups_per_vertical_layout = VerticalLayout / words_per_group;
+    using pattern_array_t =
+      cuda::std::conditional_t<is_cache_sectorized,
+                               cuda::std::array<word_type, groups_per_vertical_layout>,
+                               cuda::std::array<word_type, VerticalLayout>>;
+
+    // Sanity check
+    constexpr uint32_t num_iterations = words_per_block / (HorizontalLayout * VerticalLayout);
+    static_assert(LoopIndex < num_iterations,
+                  "the loop index cannot exceed the number of loop iterations");
+
+    // [lower_bound, upper_bound) defines the range of virtual thread indices for this loop
+    // iteration.
+    constexpr uint32_t lower_bound = LoopIndex * HorizontalLayout;
+    constexpr uint32_t upper_bound = lower_bound + HorizontalLayout;
+
+    // A virtual thread flips max_bits_per_virtual_thread bits in the pattern array, excepting
+    // potentially some of the last virtual threads (if [sectorized] pattern_bits % words_per_block
+    // != 0 or [cache-sectorized] pattern_bits % groups_per_block != 0).
+    constexpr uint32_t max_bits_per_virtual_thread =
+      is_cache_sectorized ? max_bits_per_group * groups_per_vertical_layout
+                          : max_bits_per_word * VerticalLayout;
+
+    pattern_array_t pattern_array{0};
+    if constexpr (num_iterations == 1) {
+      thread_dispatch<lower_bound, upper_bound, max_bits_per_virtual_thread>(
+        hash, thread_index, pattern_array);
+    } else {
+      const uint32_t virtual_thread_index = LoopIndex * HorizontalLayout + thread_index;
+      thread_dispatch<lower_bound, upper_bound, max_bits_per_virtual_thread>(
+        hash, virtual_thread_index, pattern_array);
+    }
+    return pattern_array;
+  }
+
+  // Dispatches a dynamic virtual thread index to a static virtual thread index by building a
+  // compile-time decision tree over the range [LowerBound, UpperBound) for the virtual thread
+  // index. This method is only used when _horizontal_layout > 1.
+  template <uint32_t LowerBound,
+            uint32_t UpperBound,
+            uint32_t MaxBitsPerVirtualThread,
+            typename PatternArrayT>
+  __device__ constexpr void thread_dispatch(uint32_t hash,
+                                            uint32_t thread_index,
+                                            PatternArrayT& pattern_array) const
+  {
+    // Sanity check
+    static_assert(LowerBound < UpperBound);
+
+    if constexpr (LowerBound + 1 == UpperBound) {
+      // Base case: thread_index == LowerBound
+      constexpr uint32_t salt_start_index = MaxBitsPerVirtualThread * LowerBound;
+      constexpr uint32_t salt_end_index =
+        cuda::std::min(salt_start_index + MaxBitsPerVirtualThread, pattern_bits);
+      constexpr uint32_t pattern_array_start_index = 0;
+      set_bits<salt_start_index, salt_end_index, pattern_array_start_index>(hash, pattern_array);
+    } else {
+      // Recursive case: thread_index > LowerBound
+      constexpr uint32_t mid = (LowerBound + UpperBound) / 2;
+      if (thread_index < mid) {
+        thread_dispatch<LowerBound, mid, MaxBitsPerVirtualThread>(
+          hash, thread_index, pattern_array);
+      } else {
+        thread_dispatch<mid, UpperBound, MaxBitsPerVirtualThread>(
+          hash, thread_index, pattern_array);
+      }
+    }
+  }
+
+  // Set bits in the pattern array using salts starting from SaltIndex.
+  template <uint32_t SaltIndex,
+            uint32_t SaltEndIndex,
+            uint32_t PatternArrayIndex,
+            typename PatternArrayT>
+  __device__ constexpr void set_bits(uint32_t hash, PatternArrayT& pattern_array) const
+  {
+    if constexpr (SaltIndex < SaltEndIndex) {
+      // Select top bit_index_width bits from salted hash to determine the bit index.
+      const uint32_t bit_index =
+        (cuda::std::get<SaltIndex>(salts) * hash) >> (32 - bit_index_width);
+
+      // Set the bit in the pattern array.
+      cuda::std::get<PatternArrayIndex>(pattern_array) |= word_type{1} << bit_index;
+
+      // Recurse.
+      constexpr uint32_t next_salt_index = SaltIndex + 1;
+      constexpr uint32_t next_pattern_array_index =
+        is_cache_sectorized
+          ? PatternArrayIndex + (next_salt_index % max_bits_per_group == 0 ? 1 : 0)
+          : PatternArrayIndex + (next_salt_index % max_bits_per_word == 0 ?
1 : 0); + set_bits(hash, pattern_array); + } + } +}; + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/utility/cuda.hpp b/include/cuco/detail/utility/cuda.hpp index ae2613f21..5d6342427 100644 --- a/include/cuco/detail/utility/cuda.hpp +++ b/include/cuco/detail/utility/cuda.hpp @@ -75,5 +75,16 @@ constexpr auto max_occupancy_grid_size(std::int32_t block_size, return max_active_blocks_per_multiprocessor * num_multiprocessors; } +inline auto l2_cache_size() +{ + int device = 0; + CUCO_CUDA_TRY(cudaGetDevice(&device)); + + int l2_cache_size = -1; + CUCO_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device)); + + return l2_cache_size; +} + } // namespace detail } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 51d4f42c1..08da93a47 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -154,9 +154,9 @@ ConfigureTest(HYPERLOGLOG_TEST # - bloom_filter ---------------------------------------------------------------------------------- ConfigureTest(BLOOM_FILTER_TEST bloom_filter/unique_sequence_test.cu - bloom_filter/arrow_policy_test.cu bloom_filter/variable_cg_test.cu - bloom_filter/merge_intersect_test.cu) + bloom_filter/merge_intersect_test.cu + bloom_filter/arrow_compat_test.cu) ################################################################################################### # - roaring_bitmap --------------------------------------------------------------------------------- diff --git a/tests/bloom_filter/arrow_policy_test.cu b/tests/bloom_filter/arrow_compat_test.cu similarity index 82% rename from tests/bloom_filter/arrow_policy_test.cu rename to tests/bloom_filter/arrow_compat_test.cu index 1ca349162..2e820fde2 100644 --- a/tests/bloom_filter/arrow_policy_test.cu +++ b/tests/bloom_filter/arrow_compat_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,23 @@ * limitations under the License. */ +// Verifies that a `parametric_filter_policy` instantiation with Apache Arrow's Block-Split Bloom +// Filter parameters (256-bit blocks of 8 x uint32_t, 8 fingerprint bits per key, fully horizontal +// add, fully vertical contains) produces byte-identical bitsets to a precomputed Arrow reference. + #include #include #include #include -#include #include -#include +#include #include +#include +#include namespace { @@ -73,11 +78,11 @@ thrust::device_vector get_arrow_filter_reference_bitset() }; if constexpr (std::is_same_v) { - return reference_bitsets[0]; // int32 + return reference_bitsets[0]; } else if constexpr (std::is_same_v) { - return reference_bitsets[1]; // int64 + return reference_bitsets[1]; } else if constexpr (std::is_same_v) { - return reference_bitsets[2]; // float + return reference_bitsets[2]; } else { throw std::invalid_argument("Reference bitsets available for int32, int64, float only.\n\n"); } @@ -93,11 +98,11 @@ std::pair get_arrow_filter_test_settings() }; if constexpr (std::is_same_v) { - return test_settings[0]; // int32 + return test_settings[0]; } else if constexpr (std::is_same_v) { - return test_settings[1]; // int64 + return test_settings[1]; } else if constexpr (std::is_same_v) { - return test_settings[2]; // float + return test_settings[2]; } else { throw std::invalid_argument("Test settings available for int32, int64, float only.\n\n"); } @@ -119,20 +124,14 @@ void test_filter_bitset(Filter& filter, size_t num_keys) using key_type = typename Filter::key_type; using word_type = typename Filter::word_type; - // Generate keys auto const h_keys = sequence_values(num_keys); thrust::device_vector d_keys(h_keys.begin(), h_keys.end()); - // Insert to the bloom filter filter.add(d_keys.begin(), d_keys.begin() + num_keys); - // Get reference words 
device_vector auto const reference_words = get_arrow_filter_reference_bitset(); + auto const num_words = filter.block_extent() * filter.words_per_block; - // Number of words in the filter - auto const num_words = filter.block_extent() * filter.words_per_block; - - // Get the bitset thrust::device_vector filter_words(filter.data(), filter.data() + num_words); REQUIRE(cuco::test::equal( @@ -144,17 +143,19 @@ void test_filter_bitset(Filter& filter, size_t num_keys) }))); } -TEMPLATE_TEST_CASE_SIG("bloom_filter arrow filter policy bitset validation", +TEMPLATE_TEST_CASE_SIG("bloom_filter arrow-compatible parametric policy bitset validation", "", (class Key), (int32_t), (int64_t), (float)) { - // Get test settings auto const [sub_filters, num_keys] = get_arrow_filter_test_settings(); - using policy_type = cuco::arrow_filter_policy; + // Apache Arrow Block-Split Bloom Filter parameters: 256-bit blocks (8 x uint32_t), 8 fingerprint + // bits per key, fully horizontal add (Theta=8) and fully vertical contains (Phi=8). 
+ using policy_type = + cuco::parametric_filter_policy, std::uint32_t, 8, 8, 8, 1, 1, 8>; cuco::bloom_filter, cuda::thread_scope_device, policy_type> filter{ sub_filters}; diff --git a/tests/bloom_filter/merge_intersect_test.cu b/tests/bloom_filter/merge_intersect_test.cu index 5638d58a8..43708c359 100644 --- a/tests/bloom_filter/merge_intersect_test.cu +++ b/tests/bloom_filter/merge_intersect_test.cu @@ -162,42 +162,15 @@ TEMPLATE_TEST_CASE_SIG( "bloom_filter merge and intersect tests", "", ((class Key, class Policy), Key, Policy), - (int32_t, cuco::default_filter_policy, uint32_t, 1>), - (int32_t, cuco::default_filter_policy, uint32_t, 8>), - (int64_t, cuco::default_filter_policy, uint64_t, 1>), - (int64_t, cuco::default_filter_policy, uint64_t, 8>)) + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 1, 8>), + (int64_t, cuco::parametric_filter_policy, uint64_t, 1, 1, 1, 1, 1, 1>), + (int64_t, cuco::parametric_filter_policy, uint64_t, 8, 8, 8, 1, 1, 8>)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; constexpr size_type capacity{1000}; - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1); - - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } - - auto filter_a = filter_type{capacity, {}, {pattern_bits}}; - auto filter_b = filter_type{capacity, {}, {pattern_bits}}; - auto filter_c = filter_type{static_cast(capacity) * 2, {}, {pattern_bits}}; - - test_merge_intersect(filter_a, filter_b, filter_c, capacity); -} - -TEMPLATE_TEST_CASE_SIG("bloom_filter merge and intersect arrow tests", - "", - ((class Key, class Policy), Key, Policy), - (int32_t, cuco::arrow_filter_policy), - (int64_t, cuco::arrow_filter_policy), - (float, cuco::arrow_filter_policy)) -{ - using filter_type = - cuco::bloom_filter, 
cuda::thread_scope_device, Policy>; - constexpr size_type capacity{1000}; // Must match capacity used in helper logic - auto filter_a = filter_type{capacity}; auto filter_b = filter_type{capacity}; auto filter_c = filter_type{static_cast(capacity) * 2}; diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu index 4cf24563c..2215bdbbd 100644 --- a/tests/bloom_filter/unique_sequence_test.cu +++ b/tests/bloom_filter/unique_sequence_test.cu @@ -86,38 +86,14 @@ void test_unique_sequence(Filter& filter, size_type num_keys) } TEMPLATE_TEST_CASE_SIG( - "bloom_filter default policy tests", + "bloom_filter parametric policy tests", "", ((class Key, class Policy), Key, Policy), - (int32_t, cuco::default_filter_policy, uint32_t, 1>), - (int32_t, cuco::default_filter_policy, uint32_t, 8>), - (int32_t, cuco::default_filter_policy, uint64_t, 1>), - (int32_t, cuco::default_filter_policy, uint64_t, 8>)) -{ - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, Policy>; - constexpr size_type num_keys{400}; - - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4); - - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } - - auto filter = filter_type{1000, {}, {pattern_bits}}; - - test_unique_sequence(filter, num_keys); -} - -TEMPLATE_TEST_CASE_SIG("bloom_filter arrow policy tests", - "", - ((class Key, class Policy), Key, Policy), - (int32_t, cuco::arrow_filter_policy), - (uint64_t, cuco::arrow_filter_policy), - (float, cuco::arrow_filter_policy)) + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (uint64_t, + cuco::parametric_filter_policy, uint32_t, 8, 12, 8, 1, 4, 2>), + (float, cuco::parametric_filter_policy, uint64_t, 4, 4, 2, 2, 1, 2>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 2, 2, 1, 8>)) { using filter_type = 
cuco::bloom_filter, cuda::thread_scope_device, Policy>; diff --git a/tests/bloom_filter/variable_cg_test.cu b/tests/bloom_filter/variable_cg_test.cu index adb6b703f..1c152aa44 100644 --- a/tests/bloom_filter/variable_cg_test.cu +++ b/tests/bloom_filter/variable_cg_test.cu @@ -14,100 +14,139 @@ * limitations under the License. */ +// Exercises `cuco::bloom_filter_ref` device-side APIs (`ref.add(key)`, `ref.contains(key)`, +// `ref.contains(group, key)`) directly from custom kernels, varying the cooperative group size. +// The bulk host-side APIs route through the warp-cooperative kernel which already had correct +// CG-reduction semantics; the scalar and CG ref methods are separate code paths that need their +// own coverage. + #include #include -#include -#include +#include #include #include -#include #include +#include + #include -#include #include -#include using size_type = int32_t; -template -void test_variable_cg_size(Filter& filter, size_type num_keys) +namespace cg = cooperative_groups; + +template +__global__ void scalar_add_kernel(Ref ref, Key const* keys, size_type n) { - constexpr int32_t block_size = 128; - constexpr int32_t grid_size = 128; + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { ref.add(keys[i]); } +} - using Key = typename Filter::key_type; +template +__global__ void scalar_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out) +{ + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { out[i] = ref.contains(keys[i]); } +} - auto ref = filter.ref(); +template +__global__ void cg_contains_consistency_kernel(Ref ref, + Key const* keys, + size_type n, + int* mismatches) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const idx = static_cast((blockIdx.x * blockDim.x + threadIdx.x) / CGSize); + if (idx >= n) { return; } + bool const got = ref.contains(tile, keys[idx]); + bool const all_agree = tile.all(got); + bool 
const any_agree = tile.any(got); + if (tile.thread_rank() == 0 && all_agree != any_agree) { atomicAdd(mismatches, 1); } +} + +TEMPLATE_TEST_CASE_SIG( + "bloom_filter device ref scalar add and contains", + "", + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::default_filter_policy), + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 2, 4, 2>)) +{ + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; + constexpr size_type num_keys{400}; + + auto filter = filter_type{1000}; - // Generate keys thrust::device_vector keys(num_keys); thrust::sequence(thrust::device, keys.begin(), keys.end()); - thrust::device_vector contained(num_keys, false); - auto const always_true = cuda::constant_iterator{true}; - - SECTION("Check if fallback kernels work for varying combinations of CG sizes.") - { - cuco::detail::bloom_filter_ns::add_if_n - <<>>(keys.begin(), num_keys, always_true, cuda::std::identity{}, ref); - cuco::detail::bloom_filter_ns::contains_if_n - <<>>( - keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref); - REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - filter.clear(); - thrust::fill(contained.begin(), contained.end(), false); // reset output vector - - SECTION("Check if adaptive add kernel works with fallback contains kernel.") - { - cuco::detail::bloom_filter_ns::add - <<>>(keys.begin(), num_keys, ref); - cuco::detail::bloom_filter_ns::contains_if_n - <<>>( - keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref); - REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - // TODO adaptive vs. adaptive and fallback add vs. 
adaptive contains (requires #673) + auto ref = filter.ref(); + + constexpr int block_size = 128; + int const grid_size = (num_keys + block_size - 1) / block_size; + + scalar_add_kernel<<>>( + ref, thrust::raw_pointer_cast(keys.data()), num_keys); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + scalar_contains_kernel<<>>(ref, + thrust::raw_pointer_cast(keys.data()), + num_keys, + thrust::raw_pointer_cast(contained.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); } TEMPLATE_TEST_CASE_SIG( - "bloom_filter variable CG size tests", + "bloom_filter device ref CG contains is reduced across the group", "", - ((int32_t AddCGSize, int32_t ContainsCGSize, class Key, class Policy), - AddCGSize, - ContainsCGSize, - Key, - Policy), - (1, 4, int32_t, cuco::default_filter_policy, uint32_t, 1>), - (1, 4, int32_t, cuco::default_filter_policy, uint32_t, 8>), - (1, 4, int32_t, cuco::default_filter_policy, uint64_t, 1>), - (1, 4, int32_t, cuco::default_filter_policy, uint64_t, 8>), - (4, 1, int32_t, cuco::default_filter_policy, uint32_t, 1>), - (4, 1, int32_t, cuco::default_filter_policy, uint32_t, 8>), - (4, 1, int32_t, cuco::default_filter_policy, uint64_t, 1>), - (4, 1, int32_t, cuco::default_filter_policy, uint64_t, 8>)) + ((int32_t CGSize, class Key, class Policy), CGSize, Key, Policy), + (4, + int32_t, + cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 2, 4, 2>), + (8, + int32_t, + cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 8, 1>)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; - constexpr size_type num_keys{400}; + // Small filter + many probes drive non-inserted keys into the partial-match regime where + // different lanes in the tile see different per-slice match results. Without a group reduction + // in `ref.contains(group, key)`, each lane returns its slice's partial result and the tile + // disagrees on the answer. 
+ constexpr size_type num_inserted{200}; + constexpr size_type num_probed{2000}; + constexpr size_type num_blocks{16}; + + auto filter = filter_type{num_blocks}; + auto ref = filter.ref(); + + thrust::device_vector insert_keys(num_inserted); + thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end()); + filter.add(insert_keys.begin(), insert_keys.end()); + + thrust::device_vector probe_keys(num_probed); + thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end()); - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4); + thrust::device_vector mismatches(1, 0); - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } + constexpr int block_size = 128; + int const grid_size = (num_probed * CGSize + block_size - 1) / block_size; - auto filter = filter_type{1000, {}, {pattern_bits}}; + cg_contains_consistency_kernel + <<>>(ref, + thrust::raw_pointer_cast(probe_keys.data()), + num_probed, + thrust::raw_pointer_cast(mismatches.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); - test_variable_cg_size(filter, num_keys); + REQUIRE(static_cast(mismatches[0]) == 0); }