diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index e0acc0bb9..85c9c9245 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -65,7 +65,9 @@ struct lazy_discard { __device__ void device_dispatch(index_type index, value_type const& value) const { // pick some predicate that is always false, but depends on the runtime value - if (threadIdx.x > 2025 + *reinterpret_cast(&value)) { *(it + index) = value; } + if (threadIdx.x > 2025 + *reinterpret_cast(&value) + static_cast(index)) { + *(it + index) = value; + } } __host__ __device__ void operator()(index_type index, value_type const& value) const { diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index b07c285d4..e539fa1d8 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -26,6 +26,9 @@ #include #include +#include +#include +#include #include #include @@ -35,132 +38,235 @@ using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t, using namespace cuco::utility; // key_generator, distribution /** - * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance + * @brief Implementation of `cuco::bloom_filter::add_async` with + * `parametric_filter_policy` */ -template -void bloom_filter_add(nvbench::state& state, - nvbench::type_list, Dist>) -{ - using size_type = std::uint32_t; - using policy_type = cuco::default_filter_policy, +template +void pfp_bloom_filter_add_impl(nvbench::state& state, + nvbench::type_list(WordsPerBlock)>; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - constexpr auto filter_block_size = - sizeof(typename filter_type::word_type) * filter_type::words_per_block; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - auto const pattern_bits = WordsPerBlock; - - try { - [[maybe_unused]] auto const policy = 
policy_type{static_cast(pattern_bits)}; - } catch (std::exception const& e) { - state.skip(e.what()); // skip invalid configurations - } - - std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size; - - if (num_sub_filters > std::numeric_limits::max()) { - state.skip("num_sub_filters too large for size_type"); // skip invalid configurations + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) +{ + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr pattern_bits_per_word = PatternBits / words_per_block; + + // Check for a valid configuration + if constexpr ((not cuda::std::has_single_bit(static_cast(BlockBits))) or + (words_per_block == 0)) { + state.skip("Invalid filter block size"); + } else if constexpr (HorizontalLayout * VerticalLayout != words_per_block) { + state.skip("Invalid vectorization layout"); + } else if constexpr ((pattern_bits_per_word <= 0) or + (pattern_bits_per_word > cuda::std::numeric_limits::digits) or + (pattern_bits_per_word * words_per_block > 64)) { + state.skip("Invalid pattern bits per word"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr contains_vertical_layout = words_per_block; + auto constexpr contains_horizontal_layout = 1; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + 
if constexpr (ExcludeIO) { + state.add_global_memory_writes(num_keys * words_per_block); + + cuda::counting_iterator keys(0); + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + filter.add_async(keys, keys + num_keys, {launch.get_stream()}); + timer.stop(); + filter.clear_async({launch.get_stream()}); + }); + } else { + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + timer.stop(); + filter.clear_async({launch.get_stream()}); + }); + } } - - cuda::counting_iterator keys(0); - - state.add_element_count(num_keys); - - filter_type filter{ - static_cast(num_sub_filters), {}, {static_cast(pattern_bits)}}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - state.exec([&](nvbench::launch& launch) { - filter.add_async(keys, keys + num_keys, {launch.get_stream()}); - }); } /** * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with - * `arrow_filter_policy` + * `parametric_filter_policy` with IO */ -template -void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list) +template +void pfp_bloom_filter_add(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) { - using size_type = std::uint32_t; - using policy_type = cuco::arrow_filter_policy; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); + constexpr bool exclude_io = false; + pfp_bloom_filter_add_impl(state, type_list); +} - std::size_t const num_sub_filters = - (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) 
* filter_type::words_per_block); +/** + * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with + * `parametric_filter_policy` without IO + */ +template +void pfp_bloom_filter_add_exclude_io( + nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) +{ + constexpr bool exclude_io = true; + pfp_bloom_filter_add_impl(state, type_list); +} - if (num_sub_filters > policy_type::max_filter_blocks) { - state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid - // configurations +/** + * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with + * `parametric_filter_policy` with cache sectorization + */ +template +void pfp_bloom_filter_add_csbf(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) +{ + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr words_per_group = words_per_block / GroupsPerBlock; + auto constexpr VerticalLayout = words_per_block / HorizontalLayout; + + if constexpr (words_per_group == 0) { + state.skip("Invalid GroupsPerBlock"); + } else if constexpr ((HorizontalLayout * VerticalLayout != words_per_block) or + (VerticalLayout < words_per_group)) { + state.skip("Invalid vectorization layout"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr contains_horizontal_layout = GroupsPerBlock; + auto constexpr contains_vertical_layout = words_per_group; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if 
(num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + timer.stop(); + filter.clear_async({launch.get_stream()}); + }); } - - cuda::counting_iterator keys(0); - - state.add_element_count(num_keys); - - filter_type filter{static_cast(num_sub_filters)}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - state.exec([&](nvbench::launch& launch) { - filter.add_async(keys, keys + num_keys, {launch.get_stream()}); - }); } -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_add_unique_size") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); - -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - defaults::HASH_RANGE, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_add_unique_hash") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list<1, 2, 4, 8>, - nvbench::type_list)) - 
.set_name("bloom_filter_add_unique_block_dim") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) +NVBENCH_BENCH_TYPES( + pfp_bloom_filter_add, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, ///< Word + nvbench::enum_type_list<64, 128, 256, 512, 1024>, ///< BlockBits + nvbench::enum_type_list<16>, ///< PatternBits + nvbench::enum_type_list<1, 2, 4, 8, 16>, ///< HorizontalLayout + nvbench::enum_type_list<1, 2, 4> ///< VerticalLayout + )) + .set_name("pfp_bloom_filter_add_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); -NVBENCH_BENCH_TYPES(arrow_bloom_filter_add, +NVBENCH_BENCH_TYPES(pfp_bloom_filter_add_csbf, NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list)) - .set_name("arrow_bloom_filter_add_unique_size") - .set_type_axes_names({"Key", "Distribution"}) + nvbench::type_list, + nvbench::enum_type_list<128, 256, 512, 1024>, + nvbench::enum_type_list<16>, + nvbench::enum_type_list<2, 4, 8>, + nvbench::enum_type_list<1, 2, 4, 8>)) + .set_name("pfp_bloom_filter_add_csbf_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "GroupsPerBlock", "HorizontalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); \ No newline at end of file diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu index 7a4549524..ef6594ad5 100644 --- a/benchmarks/bloom_filter/contains_bench.cu +++ b/benchmarks/bloom_filter/contains_bench.cu @@ -15,6 +15,7 @@ */ #include "defaults.hpp" +#include "nvbench/state.cuh" #include "utils.hpp" #include @@ -26,7 +27,10 
@@ #include #include +#include #include +#include +#include #include #include @@ -35,146 +39,279 @@ using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t, using namespace cuco::utility; // key_generator, distribution /** - * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance + * @brief Implementation of `cuco::bloom_filter::contains_async` with + * `parametric_filter_policy` */ -template -void bloom_filter_contains( - nvbench::state& state, - nvbench::type_list, Dist>) +template +void pfp_bloom_filter_contains_impl(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) { - using size_type = std::uint32_t; - using policy_type = cuco::default_filter_policy, - Word, - static_cast(WordsPerBlock)>; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - constexpr auto filter_block_size = - sizeof(typename filter_type::word_type) * filter_type::words_per_block; - - // if (filter_block_size <= 32) { - // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if - // filter block fits into a 32B sector - // } - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - auto const pattern_bits = WordsPerBlock; - - try { - [[maybe_unused]] auto const policy = policy_type{static_cast(pattern_bits)}; - } catch (std::exception const& e) { - state.skip(e.what()); // skip invalid configurations - } - - std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size; - - if (num_sub_filters > std::numeric_limits::max()) { - state.skip("num_sub_filters too large for size_type"); // skip invalid configurations + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr pattern_bits_per_word = PatternBits / words_per_block; + + // Check for a valid configuration + if constexpr ((not 
cuda::std::has_single_bit(static_cast(BlockBits))) or + (words_per_block == 0)) { + state.skip("Invalid filter block size"); + } else if constexpr (HorizontalLayout * VerticalLayout > words_per_block) { + state.skip("Invalid vectorization layout"); // TODO check if this is correct + } else if constexpr ((pattern_bits_per_word <= 0) or + (pattern_bits_per_word > cuda::std::numeric_limits::digits) or + (pattern_bits_per_word * words_per_block > 64)) { + state.skip("Invalid pattern bits per word"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr add_vertical_layout = 1; + auto constexpr add_horizontal_layout = words_per_block; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::counting_iterator key_it(0); + + // insert FPR-optimal number of keys + auto const num_build_keys = (filter_size_mb * 1024 * 1024 * 8) / (2 * PatternBits); + filter.add(key_it, key_it + num_build_keys); + + // FPR summary + thrust::device_vector result(num_keys, false); + filter.contains(key_it + num_build_keys, key_it + num_build_keys + num_keys, result.begin()); + + double const fp = thrust::count(thrust::device, result.begin(), result.end(), true); + + auto& summ_fpr = state.add_summary("FalsePositiveRate"); + summ_fpr.set_string("hint", "FPR"); + summ_fpr.set_string("short_name", "FPR"); + summ_fpr.set_string("description", 
"False-positive rate of the bloom filter."); + summ_fpr.set_float64("value", fp / static_cast(num_keys)); + + state.collect_dram_throughput(); + state.collect_l2_hit_rates(); + + if constexpr (ExcludeIO) { + state.add_global_memory_reads(num_keys * words_per_block); + // state.collect_dram_throughput(); + + auto result_it = make_lazy_discard_iterator(result.begin()); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(key_it, key_it + num_keys, result_it, {launch.get_stream()}); + }); + } else { + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.add_global_memory_reads(num_keys * + ((words_per_block * sizeof(Word)) + sizeof(Key))); + state.add_global_memory_writes(num_keys * sizeof(bool)); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); + } } +} - cuda::counting_iterator keys(0); - thrust::device_vector result(num_keys, false); - - state.add_element_count(num_keys); - - filter_type filter{ - static_cast(num_sub_filters), {}, {static_cast(pattern_bits)}}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - filter.add(keys, keys + num_keys); +/** + * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with + * `parametric_filter_policy` with IO + */ +template +void pfp_bloom_filter_contains(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) +{ + constexpr bool exclude_io = false; + pfp_bloom_filter_contains_impl(state, type_list); +} - state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); - }); +/** + * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with + * `parametric_filter_policy` without IO + */ +template +void 
pfp_bloom_filter_contains_exclude_io( + nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type> type_list) +{ + constexpr bool exclude_io = true; + pfp_bloom_filter_contains_impl(state, type_list); } /** * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with - * `arrow_filter_policy` + * `parametric_filter_policy` with cache sectorization */ -template -void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list) +template +void pfp_bloom_filter_contains_csbf(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) { - // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if - // filter block fits into a 32B sector - using size_type = std::uint32_t; - using policy_type = cuco::arrow_filter_policy; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - - std::size_t const num_sub_filters = - (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - - if (num_sub_filters > policy_type::max_filter_blocks) { - state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid - // configurations + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr words_per_group = words_per_block / GroupsPerBlock; + auto constexpr VerticalLayout = words_per_block / HorizontalLayout; + + if constexpr (words_per_group == 0) { + state.skip("Invalid GroupsPerBlock"); + } else if constexpr ((HorizontalLayout * VerticalLayout != words_per_block) or + (VerticalLayout < words_per_group)) { + state.skip("Invalid vectorization layout"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr 
add_horizontal_layout = GroupsPerBlock; + auto constexpr add_vertical_layout = words_per_group; + + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::counting_iterator key_it(0); + + // insert FPR-optimal number of keys + auto const num_build_keys = (filter_size_mb * 1024 * 1024 * 8) / (2 * PatternBits); + filter.add(key_it, key_it + num_build_keys); + + // FPR summary + thrust::device_vector result(num_keys, false); + filter.contains(key_it + num_build_keys, key_it + num_build_keys + num_keys, result.begin()); + + double const fp = thrust::count(thrust::device, result.begin(), result.end(), true); + + auto& summ_fpr = state.add_summary("FalsePositiveRate"); + summ_fpr.set_string("hint", "FPR"); + summ_fpr.set_string("short_name", "FPR"); + summ_fpr.set_string("description", "False-positive rate of the bloom filter."); + summ_fpr.set_float64("value", fp / static_cast(num_keys)); + + state.collect_dram_throughput(); + state.collect_l2_hit_rates(); + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.add_global_memory_reads(num_keys * + ((words_per_block * sizeof(Word)) + sizeof(Key))); + state.add_global_memory_writes(num_keys * sizeof(bool)); + // state.collect_dram_throughput(); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(keys.begin(), keys.end(), result.begin(), 
{launch.get_stream()}); + }); } - - cuda::counting_iterator keys(0); - thrust::device_vector result(num_keys, false); - - state.add_element_count(num_keys); - - filter_type filter{static_cast(num_sub_filters)}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - filter.add(keys, keys + num_keys); - - state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); - }); } -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_size") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); - -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - defaults::HASH_RANGE, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_hash") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list<1, 2, 4, 8>, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_block_dim") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) +NVBENCH_BENCH_TYPES( + pfp_bloom_filter_contains, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, ///< Word + nvbench::enum_type_list<64, 128, 256, 512, 1024>, ///< BlockBits + nvbench::enum_type_list<16>, ///< PatternBits + nvbench::enum_type_list<1, 2, 4, 8, 16>, /// ///< VerticalLayout + )) + 
.set_name("pfp_bloom_filter_contains_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); -NVBENCH_BENCH_TYPES(arrow_bloom_filter_contains, +NVBENCH_BENCH_TYPES(pfp_bloom_filter_contains_csbf, NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list)) - .set_name("arrow_bloom_filter_contains_unique_size") - .set_type_axes_names({"Key", "Distribution"}) + nvbench::type_list, + nvbench::enum_type_list<128, 256, 512, 1024>, + nvbench::enum_type_list<16>, + nvbench::enum_type_list<2, 4, 8>, + nvbench::enum_type_list<1, 2, 4, 8>)) + .set_name("pfp_bloom_filter_contains_csbf_unique_size_u64") + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "GroupsPerBlock", "HorizontalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_FRONTIER_CACHE); \ No newline at end of file diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp index f1b192aa3..8b7d45f6f 100644 --- a/benchmarks/bloom_filter/defaults.hpp +++ b/benchmarks/bloom_filter/defaults.hpp @@ -33,9 +33,11 @@ using BF_WORD = nvbench::uint32_t; static constexpr auto BF_N = 1'000'000'000; static constexpr auto BF_SIZE_MB = 2'000; static constexpr auto BF_WORDS_PER_BLOCK = 8; +static constexpr auto BF_PATTERN_BITS = 8; auto const BF_SIZE_MB_RANGE_CACHE = std::vector{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; -auto const BF_PATTERN_BITS_RANGE = std::vector{1, 2, 4, 6, 8, 16}; +auto const BF_SIZE_MB_RANGE_FRONTIER_CACHE = std::vector{32, 1024}; +auto const BF_PATTERN_BITS_RANGE = std::vector{1, 2, 4, 6, 8, 16}; } // namespace cuco::benchmark::defaults diff --git 
a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh index 3f2e69c7c..b50d3753f 100644 --- a/include/cuco/bloom_filter.cuh +++ b/include/cuco/bloom_filter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,7 +36,13 @@ namespace cuco { /** - * @brief A GPU-accelerated Blocked Bloom Filter. + * @brief A GPU-accelerated Bloom filter. + * + * Implements the Sectorized Bloom Filter (SBF) and Cache-Sectorized Bloom Filter (CSBF) variants + * from "Optimizing Bloom Filters for Modern GPU Architectures" (arXiv:2512.15595). Fingerprint + * generation is parameterized by `parametric_filter_policy` (see `cuco/bloom_filter_policies.cuh`), + * which exposes the paper's Theta/Phi vectorization layout and optional `GroupsPerBlock` + * cache-sectorization parameters as compile-time template parameters. * * The `bloom_filter` supports two types of operations: * - Host-side "bulk" operations @@ -51,6 +57,12 @@ namespace cuco { * independent add or lookup operations from device code. These operations are accessed through * non-owning, trivially copyable reference types (or "ref"). * + * @note The default `Policy` (`cuco::default_filter_policy`) is an alias for a + * `parametric_filter_policy` instantiation with paper-recommended layouts on a 256-bit block. Users + * who need different layouts (smaller/larger blocks, different fingerprint bit counts, CSBF mode) + * can instantiate `cuco::parametric_filter_policy<...>` directly and pass it as the `Policy` + * template argument. 
+ * * @tparam Key Key type * @tparam Extent Size type that is used to determine the number of blocks in the filter * @tparam Scope The scope in which operations will be performed by individual threads @@ -61,8 +73,8 @@ namespace cuco { template , cuda::thread_scope Scope = cuda::thread_scope_device, - class Policy = cuco::default_filter_policy, std::uint32_t, 8>, - class Allocator = cuco::cuda_allocator> + class Policy = cuco::default_filter_policy, + class Allocator = cuco::cuda_allocator> class bloom_filter { public: /** @@ -136,7 +148,7 @@ class bloom_filter { * @param stream CUDA stream used for device memory operations and kernel launches */ __host__ constexpr void clear_async(cuda::stream_ref stream = cuda::stream_ref{ - cudaStream_t{nullptr}}); + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds all keys in the range `[first, last)` to the filter. @@ -164,8 +176,10 @@ class bloom_filter { * @param stream CUDA stream used for device memory operations and kernel launches */ template - __host__ constexpr void add_async( - InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + __host__ constexpr void add_async(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{ + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns diff --git a/include/cuco/bloom_filter_policies.cuh b/include/cuco/bloom_filter_policies.cuh index 0d28b166a..1aa116c14 100644 --- a/include/cuco/bloom_filter_policies.cuh +++ b/include/cuco/bloom_filter_policies.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,8 +16,7 @@ #pragma once -#include -#include +#include #include #include @@ -25,110 +24,60 @@ namespace cuco { /** - * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's - * fingerprint. + * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation. * - * @tparam Key The type of the values to generate a fingerprint for. - * @tparam XXHash64 Custom (64 bit) XXHash hasher to generate a key's fingerprint. - * By default, cuco::xxhash_64 hasher will be used. + * Implements the Sectorized Bloom Filter (SBF) and Cache-Sectorized Bloom Filter (CSBF) variants + * from "Optimizing Bloom Filters for Modern GPU Architectures" (arXiv:2512.15595). Distributes + * `PatternBits` set bits across `WordsPerBlock` words via compile-time salt-based multiplicative + * hashing. * + * Requires a 64-bit hash function: the result is split into upper 32 bits (block selection via + * multiply-shift) and lower 32 bits (pattern generation). This is a permanent design requirement. + * + * @note Constructor signature: `parametric_filter_policy(Hash hash = {})`. + * + * @tparam Hash 64-bit hash functor. + * @tparam Word Underlying word type of a filter block. + * @tparam WordsPerBlock Words per filter block. + * @tparam PatternBits Fingerprint bits per key (paper's k). + * @tparam AddHorizontalLayout CG size for add (paper's Theta). + * @tparam AddVerticalLayout Words per thread per add step (paper's Phi). + * @tparam ContainsHorizontalLayout CG size for contains. + * @tparam ContainsVerticalLayout Words per thread per contains step. + * @tparam GroupsPerBlock Cache-sectorization groups (paper's z). Defaults to `WordsPerBlock` + * (non-CSBF). 
*/ -template class XXHash64 = cuco::xxhash_64> -using arrow_filter_policy = detail::arrow_filter_policy; +template +using parametric_filter_policy = detail::parametric_filter_policy; /** - * @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's - * fingerprint. + * @brief Default Bloom filter policy used by `cuco::bloom_filter` when no policy is specified. * - * @note `Word` type must be an atomically updatable integral type. `WordsPerBlock` must - * be a power-of-two. + * Alias for a `parametric_filter_policy` instantiation with paper-recommended layouts on a 256-bit + * block: 8 x `uint32_t` words, 8 fingerprint bits per key, fully horizontal add (Theta=8) and fully + * vertical contains (Phi=8). * - * @tparam Hash Hash function used to generate a key's fingerprint - * @tparam Word Underlying word/segment type of a filter block - * @tparam WordsPerBlock Number of words/segments in each block + * @tparam Key The key type to generate a fingerprint for. + * @tparam XXHash64 64-bit XXHash functor template. Defaults to `cuco::xxhash_64`. */ -template -class default_filter_policy { - using impl_type = cuco::detail::default_filter_policy_impl; - - public: - using hasher = typename impl_type::hasher; ///< Type of the hash function - using hash_argument_type = typename impl_type::hash_argument_type; ///< Hash function input type - using hash_result_type = typename impl_type::hash_result_type; ///< hash function output type - using word_type = - typename impl_type::word_type; ///< Underlying word/segment type of a filter block - - static constexpr std::uint32_t words_per_block = - impl_type::words_per_block; ///< Number of words/segments in each filter block - - public: - /** - * @brief Constructs the `default_filter_policy` object. - * - * @throws Compile-time error if the specified number of words in a filter block is not a - * power-of-two or is larger than 32. 
If called from host: throws exception; If called from - * device: Traps the kernel. - * - * @throws If the `hash_result_type` is too narrow to generate the requested number of - * `pattern_bits`. If called from host: throws exception; If called from device: Traps the kernel. - * - * @throws If `pattern_bits` is smaller than the number of words in a filter block or larger than - * the total number of bits in a filter block. If called from host: throws exception; If called - * from device: Traps the kernel. - * - * @param pattern_bits Number of bits in a key's fingerprint - * @param hash Hash function used to generate a key's fingerprint - */ - __host__ __device__ constexpr default_filter_policy(std::uint32_t pattern_bits = words_per_block, - Hash hash = {}); - - /** - * @brief Generates the hash value for a given key. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. - * - * @param key The key to hash - * - * @return The hash value of the key - */ - __device__ constexpr hash_result_type hash(hash_argument_type const& key) const; - - /** - * @brief Determines the filter block a key is added into. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. - * - * @tparam Extent Size type that is used to determine the number of blocks in the filter - * - * @param hash Hash value of the key - * @param num_blocks Number of block in the filter - * - * @return The block index for the given key's hash value - */ - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const; - - /** - * @brief Determines the fingerprint pattern for a word/segment within the filter block for a - * given key's hash value. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. 
- * - * @param hash Hash value of the key - * @param word_index Target word/segment within the filter block - * - * @return The bit pattern for the word/segment in the filter block - */ - __device__ constexpr word_type word_pattern(hash_result_type hash, - std::uint32_t word_index) const; - - private: - impl_type impl_; ///< Policy implementation -}; +template class XXHash64 = cuco::xxhash_64> +using default_filter_policy = + parametric_filter_policy, std::uint32_t, 8, 8, 8, 1, 1, 8>; } // namespace cuco - -#include \ No newline at end of file diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index f58910afc..ad4ee0cb3 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -109,7 +109,7 @@ class bloom_filter_ref { * @param stream CUDA stream used for device memory operations and kernel launches */ __host__ constexpr void clear_async(cuda::stream_ref stream = cuda::stream_ref{ - cudaStream_t{nullptr}}); + cudaStream_t{nullptr}}) noexcept; /** * @brief Device function that adds a key to the filter. @@ -121,6 +121,17 @@ class bloom_filter_ref { template __device__ void add(ProbeKey const& key); + /** + * @brief Device function that adds all keys in the range `[first, last)` to the filter. + * + * @tparam InputIt Device-accessible random access input key iterator + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + */ + template + __device__ void add(InputIt first, InputIt last); + /** * @brief Device function that cooperatively adds a key to the filter. 
* @@ -178,8 +189,10 @@ class bloom_filter_ref { * @param stream CUDA stream used for device memory operations and kernel launches */ template - __host__ constexpr void add_async( - InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + __host__ constexpr void add_async(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{ + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns @@ -248,6 +261,20 @@ class bloom_filter_ref { template [[nodiscard]] __device__ bool contains(ProbeKey const& key) const; + /** + * @brief Device function that tests if all keys in the range `[first, last)` are present in the + * filter. + * + * @tparam InputIt Device-accessible random access input key iterator + * @tparam OutputIt Device-accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + */ + template + __device__ void contains(InputIt first, InputIt last, OutputIt output_begin) const; + /** * @brief Device function that tests if a key's fingerprint is present in the filter. * @@ -265,10 +292,21 @@ class bloom_filter_ref { template [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const; - // TODO - // template - // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) - // const; + /** + * @brief Device function that tests if all keys in the range `[first, last)` are present in the + * filter. 
+ * + * @tparam CG Cooperative Group type + * @tparam InputIt Device-accessible random access input key iterator + * @tparam OutputIt Device-accessible output iterator assignable from `bool` + * + * @param group The Cooperative Group this operation is executed with + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + */ + template + __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const; /** * @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh deleted file mode 100644 index 481293335..000000000 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -#include -#include - -namespace cuco::detail { - -/** - * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's - * fingerprint. 
- * - * Reference: - * https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230 - * - * Example: - * @code{.cpp} - * template - * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector const& positive_keys, - * device_vector const& negative_keys) - * { - * using policy_type = cuco::arrow_filter_policy; - * - * // Warn or throw if the number of filter blocks is greater than maximum used by Arrow policy. - * static_assert(NUM_FILTER_BLOCKS <= policy_type::max_filter_blocks, "NUM_FILTER_BLOCKS must be - * in range: [1, 4194304]"); - * - * // Create a bloom filter with Arrow policy - * cuco::bloom_filter, - * cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS}; - * - * // Add positive keys to the bloom filter - * filter.add(positive_keys.begin(), positive_keys.end()); - * - * auto const num_tp = positive_keys.size(); - * auto const num_tn = negative_keys.size(); - * - * // Vectors to store query results. - * thrust::device_vector true_positive_result(num_tp, false); - * thrust::device_vector true_negative_result(num_tn, false); - * - * // Query the bloom filter for the inserted keys. - * filter.contains(positive_keys.begin(), positive_keys.end(), true_positive_result.begin()); - * - * // We should see a true-positive rate of 1. - * float true_positive_rate = float(thrust::count(thrust::device, - * true_positive_result.begin(), true_positive_result.end(), true)) / float(num_tp); - * - * // Query the bloom filter for the non-inserted keys. - * filter.contains(negative_keys.begin(), negative_keys.end(), true_negative_result.begin()); - * - * // We may see a false-positive rate > 0 depending on the number of bits in the - * // filter and the number of hashes used per key. 
- * float false_positive_rate = float(thrust::count(thrust::device, - * true_negative_result.begin(), true_negative_result.end(), true)) / float(num_tn); - * } - * @endcode - * - * @tparam Key The type of the values to generate a fingerprint for. - * @tparam XXHash64 64-bit XXHash hasher implementation for fingerprint generation. - */ -template class XXHash64> -class arrow_filter_policy { - public: - using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy - using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy - using key_type = Key; ///< Hash function input type - using hash_result_type = std::uint64_t; ///< hash function output type - - static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block - static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block - - static constexpr std::uint32_t bytes_per_filter_block = - 32; ///< Number of bytes in one Arrow filter block - static constexpr std::uint32_t max_arrow_filter_bytes = - 128 * 1024 * 1024; ///< Max bytes in Arrow bloom filter - static constexpr std::uint32_t max_filter_blocks = - (max_arrow_filter_bytes / - bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter - - public: - /** - * @brief Constructs the `arrow_filter_policy` object. - * - * @note The number of filter blocks with Arrow policy must be in the - * range of [1, 4194304]. If the bloom filter is constructed with a larger - * number of blocks, only the first 4194304 (128MB) blocks will be used. - * - * @param hash Hash function used to generate a key's fingerprint - */ - __host__ __device__ constexpr arrow_filter_policy(hasher hash = {}) : hash_{hash} {} - - /** - * @brief Generates the hash value for a given key. 
- * - * @param key The key to hash - * - * @return The hash value of the key - */ - __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); } - - /** - * @brief Determines the filter block a key is added into. - * - * @note The number of filter blocks with Arrow policy must be in the - * range of [1, 4194304]. Passing a larger `num_blocks` will still - * upperbound the number of blocks used to the mentioned range. - * - * @tparam Extent Size type that is used to determine the number of blocks in the filter - * - * @param hash Hash value of the key - * @param num_blocks Number of block in the filter - * - * @return The block index for the given key's hash value - */ - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const - { - constexpr auto hash_bits = cuda::std::numeric_limits::digits; - // TODO: assert if num_blocks > max_filter_blocks - auto const max_blocks = cuda::std::min(num_blocks, max_filter_blocks); - // Make sure we are only contained withing the `max_filter_blocks` blocks - return static_cast(((hash >> hash_bits) * max_blocks) >> hash_bits) % max_blocks; - } - - /** - * @brief Determines the fingerprint pattern for a word/segment within the filter block for a - * given key's hash value. - * - * @param hash Hash value of the key - * @param word_index Target word/segment within the filter block - * - * @return The bit pattern for the word/segment in the filter block - */ - __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const - { - word_type const key = static_cast(hash); - std::uint32_t salt{}; - - // Basically a switch (word_index) { case 0-7 ... } - // First split: 0..3 versus 4..7. - if (word_index < 4) { - // For indices 0..3, further split into 0..1 and 2..3. - if (word_index < 2) { - // word_index is 0 or 1. - salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U; - } else { - // word_index is 2 or 3. 
- salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU; - } - } else { - // For indices 4..7, further split into 4..5 and 6..7. - if (word_index < 6) { - // word_index is 4 or 5. - salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU; - } else { - // word_index is 6 or 7. - salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U; - } - } - return word_type{1} << ((key * salt) >> 27); - } - - private: - hasher hash_; -}; - -} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/bloom_filter.inl b/include/cuco/detail/bloom_filter/bloom_filter.inl index f3306b2fb..eb3bfcc4a 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter.inl @@ -34,8 +34,9 @@ __host__ constexpr bloom_filter::bloom_fi Allocator const& alloc, cuda::stream_ref stream) : allocator_{alloc}, - data_{allocator_.allocate(num_blocks, stream), - detail::custom_deleter{num_blocks, allocator_, stream}}, + data_{allocator_.allocate(static_cast(num_blocks), stream), + detail::custom_deleter{ + static_cast(num_blocks), allocator_, stream}}, ref_{data_.get(), num_blocks, {}, policy} { this->clear_async(stream); @@ -50,7 +51,7 @@ __host__ constexpr void bloom_filter::cle template __host__ constexpr void bloom_filter::clear_async( - cuda::stream_ref stream) + cuda::stream_ref stream) noexcept { ref_.clear_async(stream); } @@ -66,7 +67,7 @@ __host__ constexpr void bloom_filter::add template template __host__ constexpr void bloom_filter::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { ref_.add_async(first, last, stream); } diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index 8e2b49453..f558c5e00 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -44,204 +44,6 @@ namespace 
cuco::detail { -/** - * @brief Device functor for adding a single key to the bloom filter - * - * This functor is used with cuda::static_for to iterate over all words in a filter block - * and set the appropriate bits for a given key's hash value. Each iteration processes - * one word in the block using atomic operations to ensure thread safety. - * - * @tparam HashValue Type of the hash value (typically uint64_t) - * @tparam BlockIndex Type of the block index (typically size_t or uint32_t) - * @tparam Policy Filter policy type that provides word pattern generation - * @tparam WordType Underlying word type of the filter (typically uint64_t) - * @tparam Scope CUDA thread scope for atomic operations - */ -template -struct add_impl_functor { - HashValue hash_value; ///< Hash value of the key being added - BlockIndex block_index; ///< Index of the filter block to modify - Policy policy_; ///< Filter policy for generating bit patterns - WordType* words_; ///< Pointer to the filter's word array - size_t words_per_block; ///< Number of words in each filter block - - /** - * @brief Processes one word in the filter block for key insertion - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - auto const word = policy_.word_pattern(hash_value, i()); - if (word != 0) { - auto atom_word = - cuda::atomic_ref{*(words_ + (block_index * words_per_block + i()))}; - atom_word.fetch_or(word, cuda::memory_order_relaxed); - } - } -}; - -/** - * @brief Device functor for cooperative group-based batch key insertion - * - * This functor is used with cuda::static_for to process multiple keys in parallel - * within a cooperative group. Each thread in the group processes a different key - * using shuffle operations to share hash values and block indices across threads. 
- * - * @tparam CG Cooperative group type (e.g., thread_block_tile) - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam BloomFilterImpl Type of the bloom filter implementation - */ -template -struct add_group_functor { - CG group; ///< Cooperative group for parallel processing - HashValue hash_value; ///< Hash value of the current thread's key - BlockIndex block_index; ///< Block index of the current thread's key - size_t i; ///< Starting index in the key batch - size_t num_keys; ///< Total number of keys to process - size_t num_threads; ///< Number of threads in the group - BloomFilterImpl* self; ///< Pointer to the bloom filter implementation - - /** - * @brief Processes one thread's key in the cooperative group batch insertion - * - * @tparam J Type of the integral constant passed by cuda::static_for - * @param j Integral constant representing the thread index within the group - */ - template - __device__ void operator()(J j) const - { - if ((j() < num_threads) and (i + j() < num_keys)) { - self->add_impl(group, group.shfl(hash_value, j()), group.shfl(block_index, j())); - } - } -}; - -/** - * @brief Device functor for worker group-based batch key insertion - * - * This functor is used with cuda::static_for to process multiple keys in parallel - * within a worker group (subdivision of a larger cooperative group). Similar to - * add_group_functor but operates on a smaller worker group with offset handling - * for processing different portions of the key batch. 
- * - * @tparam WorkerGroup Worker group type (subdivision of cooperative group) - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam BloomFilterImpl Type of the bloom filter implementation - */ -template -struct add_worker_group_functor { - WorkerGroup worker_group; ///< Worker group (subdivision of cooperative group) - HashValue hash_value; ///< Hash value of the current thread's key - BlockIndex block_index; ///< Block index of the current thread's key - size_t i; ///< Starting index in the key batch - size_t worker_offset; ///< Offset for this worker group within the batch - size_t num_keys; ///< Total number of keys to process - size_t worker_num_threads; ///< Number of threads in the worker group - BloomFilterImpl* self; ///< Pointer to the bloom filter implementation - - /** - * @brief Processes one thread's key in the worker group batch insertion - * - * @tparam J Type of the integral constant passed by cuda::static_for - * @param j Integral constant representing the thread index within the worker group - */ - template - __device__ void operator()(J j) const - { - if ((j() < worker_num_threads) and (i + worker_offset + j() < num_keys)) { - self->add_impl( - worker_group, worker_group.shfl(hash_value, j()), worker_group.shfl(block_index, j())); - } - } -}; - -/** - * @brief Device functor for cooperative group-based single key insertion - * - * This functor is used with cuda::static_for to add a single key to the bloom filter - * using a cooperative group. Each thread in the group processes different words in - * the filter block based on thread rank and stride pattern. Used when the group size - * doesn't match the number of words per block. 
- * - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam WordType Underlying word type of the filter - * @tparam Scope CUDA thread scope for atomic operations - * @tparam Policy Filter policy type that provides word pattern generation - */ -template -struct add_impl_group_functor { - HashValue hash_value; ///< Hash value of the key being added - BlockIndex block_index; ///< Index of the filter block to modify - WordType* words_; ///< Pointer to the filter's word array - size_t words_per_block; ///< Number of words in each filter block - size_t rank; ///< Thread rank within the cooperative group - size_t num_threads; ///< Number of threads in the cooperative group - Policy policy_; ///< Filter policy for generating bit patterns - - /** - * @brief Processes one word in the filter block using cooperative group stride pattern - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - if (i() >= rank && (i() - rank) % num_threads == 0) { - auto atom_word = - cuda::atomic_ref{*(words_ + (block_index * words_per_block + i()))}; - atom_word.fetch_or(policy_.word_pattern(hash_value, i()), cuda::memory_order_relaxed); - } - } -}; - -/** - * @brief Device functor for checking if a key exists in the bloom filter - * - * This functor is used with cuda::static_for to iterate over all words in a filter block - * and check if the expected bit patterns for a given key's hash value are present. - * If any expected bit is missing, the result is set to false, indicating the key - * is definitely not in the set. 
- * - * @tparam HashValue Type of the hash value - * @tparam StoredPattern Type of the stored pattern array (typically array of WordType) - * @tparam Policy Filter policy type that provides word pattern generation - */ -template -struct contains_functor { - HashValue hash_value; ///< Hash value of the key being queried - StoredPattern stored_pattern; ///< Array of stored bit patterns from the filter block - Policy policy_; ///< Filter policy for generating expected bit patterns - bool* result; ///< Pointer to result flag (set to false if key not found) - - /** - * @brief Checks one word in the filter block for the expected bit pattern - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - auto const expected_pattern = policy_.word_pattern(hash_value, i()); - if ((stored_pattern[i()] & expected_pattern) != expected_pattern) { *result = false; } - } -}; - template class bloom_filter_impl { public: @@ -250,26 +52,43 @@ class bloom_filter_impl { using size_type = typename extent_type::value_type; using policy_type = Policy; using word_type = typename policy_type::word_type; + // uint64_t may be unsigned long, but atomicOr requires unsigned long long + using atomic_word_type = typename cuda::std:: + conditional_t, unsigned long long, word_type>; + + // Implementation-tuning knobs. Not part of the public API; reached via + // `bloom_filter_impl::tuning::use_*` from internal kernels. Defaults reflect the ablation + // measurements from arXiv:2512.15595; flip in source for tuning experiments. 
+ struct tuning { + static constexpr bool use_invoke_one = true; + static constexpr bool use_early_exit = false; + static constexpr bool use_cub_kernels = true; + static constexpr bool use_warp_cooperative_add_kernel = true; + static constexpr bool use_warp_cooperative_contains_kernel = true; + static constexpr bool use_work_stealing_add_kernel = false; + static constexpr bool use_work_stealing_contains_kernel = false; + static constexpr bool use_cuda_atomic_ref = false; + }; static constexpr auto thread_scope = Scope; static constexpr auto words_per_block = policy_type::words_per_block; - __host__ __device__ static constexpr size_t max_vec_bytes() noexcept - { - constexpr auto word_bytes = sizeof(word_type); - constexpr auto block_bytes = word_bytes * words_per_block; - return cuda::std::min(cuda::std::max(word_bytes, 32ul), - block_bytes); // aiming for 2xLDG128 -> 1 sector per thread - } + static constexpr auto add_vertical_layout = policy_type::add_vertical_layout; + static constexpr auto add_horizontal_layout = policy_type::add_horizontal_layout; + static constexpr auto contains_vertical_layout = policy_type::contains_vertical_layout; + static constexpr auto contains_horizontal_layout = policy_type::contains_horizontal_layout; + static constexpr auto add_loop_count = + words_per_block / (add_vertical_layout * add_horizontal_layout); + static constexpr auto contains_loop_count = + words_per_block / (contains_vertical_layout * contains_horizontal_layout); - struct alignas(max_vec_bytes()) filter_block_type { - private: - word_type data_[words_per_block]; - }; + static constexpr bool is_cache_sectorized = policy_type::is_cache_sectorized; + static_assert((not tuning::use_cuda_atomic_ref) or + (Scope == cuda::thread_scope::thread_scope_device), + "atomicOr requires device scope"); static_assert(cuda::std::has_single_bit(words_per_block) and words_per_block <= 32, "Number of words per block must be a power-of-two and less than or equal to 32"); - static_assert( 
cuda::std::is_constructible_v, word_type&> && cuda::std::is_invocable_r_v, "Invalid word type"); + __host__ __device__ static constexpr size_t alignment() noexcept + { + // Maximum alignment is 32 bytes which is equivalent to one sector + return cuda::std::min( + static_cast(32), + static_cast(cuda::std::max(add_vertical_layout, contains_vertical_layout) * + sizeof(word_type))); + } + + struct filter_block_type { + private: + alignas(alignment()) word_type data_[words_per_block]; + }; + __host__ __device__ explicit constexpr bloom_filter_impl(filter_block_type* filter, Extent num_blocks, cuda_thread_scope, @@ -298,7 +131,9 @@ class bloom_filter_impl { template __device__ constexpr void clear(CG group) { - for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) { + // TODO optimize this + for (int i = group.thread_rank(); i < static_cast(num_blocks_) * words_per_block; + i += group.size()) { words_[i] = 0; } } @@ -313,195 +148,286 @@ class bloom_filter_impl { #endif } - __host__ constexpr void clear_async(cuda::stream_ref stream) + __host__ constexpr void clear_async(cuda::stream_ref stream) noexcept { - CUCO_CUDA_TRY(cub::DeviceFor::ForEachN( + cub::DeviceFor::ForEachN( words_, - num_blocks_ * words_per_block, + static_cast(num_blocks_) * words_per_block, [] __device__(word_type & word) { word = 0; }, + stream.get()); + } + + __host__ constexpr void merge(bloom_filter_impl const& other, + cuda::stream_ref stream) + { + this->merge_async(other, stream); +#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) + stream.sync(); +#else + stream.wait(); +#endif + } + + __host__ constexpr void merge_async(bloom_filter_impl const& other, + cuda::stream_ref stream) + { + CUCO_EXPECTS(this->block_extent() == other.block_extent(), + "mismatching num_blocks in merge_async"); + CUCO_CUDA_TRY(cub::DeviceTransform::Transform( + cuda::std::tuple{this->data(), other.data()}, + this->data(), + this->block_extent() * 
words_per_block, + [] __device__(word_type a, word_type b) { return a | b; }, stream.get())); } - template - __device__ void add(ProbeKey const& key) + __host__ constexpr void intersect(bloom_filter_impl const& other, + cuda::stream_ref stream) { - auto const hash_value = policy_.hash(key); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + this->intersect_async(other, stream); +#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) + stream.sync(); +#else + stream.wait(); +#endif } - template - __device__ void add(InputIt first, InputIt last) + __host__ constexpr void intersect_async( + bloom_filter_impl const& other, cuda::stream_ref stream) { - auto const num_keys = cuco::detail::distance(first, last); - for (decltype(num_keys) i = 0; i < num_keys; ++i) { - auto const hash_value = policy_.hash(*(first + i)); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); - } + CUCO_EXPECTS(this->block_extent() == other.block_extent(), + "mismatching num_blocks in intersect_async"); + CUCO_CUDA_TRY(cub::DeviceTransform::Transform( + cuda::std::tuple{this->data(), other.data()}, + this->data(), + this->block_extent() * words_per_block, + [] __device__(word_type a, word_type b) { return a & b; }, + stream.get())); } - template - __device__ void add_impl(HashValue const& hash_value, BlockIndex block_index) + [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; } + + [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept { - add_impl_functor functor{ - hash_value, block_index, policy_, words_, words_per_block}; - cuda::static_for(functor); + return words_; } - template - __device__ void add(CG group, ProbeKey const& key) + [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept { - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = add_optimal_cg_size(); - constexpr auto 
worker_num_threads = - (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; + return num_blocks_; + } - // If single thread is optimal, use scalar add - if constexpr (worker_num_threads == 1) { - this->add(key); + // Single Thread Add. Layout-agnostic: when `add_horizontal_layout > 1`, runs the per-virtual- + // thread work serially in the calling thread. + template + __device__ void add(BuildKey build_key) + { + auto const [upper_hash, lower_hash] = policy_.split_hash(build_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + + if constexpr (add_horizontal_layout == 1) { + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + add_pattern_cs(block_index, lower_hash, group_hash); + } else { + add_pattern(block_index, lower_hash); + } } else { - auto const hash_value = policy_.hash(key); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll + for (uint32_t thread_index = 0; thread_index < add_horizontal_layout; ++thread_index) { + add_patterns_cs(block_index, lower_hash, group_hash, thread_index); + } + } else { +#pragma unroll + for (uint32_t thread_index = 0; thread_index < add_horizontal_layout; ++thread_index) { + add_patterns(block_index, lower_hash, thread_index); + } + } } } - template - __device__ void add(CG group, InputIt first, InputIt last) + // Multi Thread Add. Layout-flexible: when the CG size matches `add_horizontal_layout`, runs the + // optimal cooperative path with one shared hash evaluation; otherwise has one lane perform the + // layout-agnostic scalar insert and synchronizes the rest. 
+ template + __device__ void add(CG group, BuildKey build_key) { namespace cg = cooperative_groups; - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = add_optimal_cg_size(); - constexpr auto worker_num_threads = - (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; + if constexpr (add_horizontal_layout == 1 || tile_size_v != add_horizontal_layout) { + // Tile size doesn't match the layout. Pick one lane to do the scalar (layout-agnostic) + // insert; the rest wait at the implicit sync. + cg::invoke_one(group, [&] __device__() { this->template add(build_key); }); + return; + } - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + auto const [upper_hash, lower_hash, block_index] = [&] __device__ { + if constexpr (tuning::use_invoke_one) { + return cg::invoke_one_broadcast(group, [&] __device__() { + auto const sh = policy_.split_hash(build_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + }); + } else { + auto const sh = policy_.split_hash(build_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + } + }(); - auto const rank = group.thread_rank(); - - // If single thread is optimal, use scalar add - if constexpr (worker_num_threads == 1) { - for (auto i = rank; i < num_keys; i += num_threads) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + i)}; - this->add(insert_element); - } - } else if constexpr (num_threads == worker_num_threads) { // given CG is optimal CG - typename policy_type::hash_result_type hash_value; - size_type block_index; - - auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); - for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { - if (i + rank < num_keys) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + i + rank)}; - 
hash_value = policy_.hash(insert_element); - block_index = policy_.block_index(hash_value, num_blocks_); - } + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + add_patterns_cs( + block_index, lower_hash, group_hash, group.thread_rank()); + } else { + add_patterns(block_index, lower_hash, group.thread_rank()); + } + } - add_group_functor - functor{group, - hash_value, - block_index, - static_cast(i), - static_cast(num_keys), - static_cast(num_threads), - this}; - cuda::static_for(functor); - } - } else { // subdivide given CG into multiple optimal CGs - typename policy_type::hash_result_type hash_value; - size_type block_index; - - auto const worker_group = cg::tiled_partition(group); - auto const worker_offset = worker_num_threads * worker_group.meta_group_rank(); - - auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); - - for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { - if (i + rank < num_keys) { - typename cuda::std::iterator_traits::value_type const& key{*(first + i + rank)}; - hash_value = policy_.hash(key); - block_index = policy_.block_index(hash_value, num_blocks_); - } + // Warp-cooperative Add + template + __device__ void add_coop(CG group, BuildKey build_key) + { + constexpr auto num_threads = tile_size_v; - add_worker_group_functor - functor{worker_group, - hash_value, - block_index, - static_cast(i), - static_cast(worker_offset), - static_cast(num_keys), - static_cast(worker_num_threads), - this}; - cuda::static_for(functor); + auto const [upper_hash, lower_hash] = policy_.split_hash(build_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + add_patterns_cs(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, 
i), + group.thread_rank()); + } + } else { +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + add_patterns( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()); } } } - template - __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index) + template + __device__ void add_coop(CG group, BuildKey build_key, bool is_valid) { constexpr auto num_threads = tile_size_v; - auto const rank = group.thread_rank(); + // Compute the hash and block index only for lanes whose key is valid; invalid lanes' values + // are never used (the shfl loop below gates per-iteration work on `group.shfl(is_valid, i)`). + // Skipping the hash here saves work on sparse stencil-gated inserts. + uint32_t upper_hash = 0; + uint32_t lower_hash = 0; + size_type block_index = 0; + if (is_valid) { + auto const sh = policy_.split_hash(build_key); + upper_hash = sh.first; + lower_hash = sh.second; + block_index = policy_.block_index(upper_hash, num_blocks_); + } - if constexpr (num_threads == words_per_block) { - auto atom_word = cuda::atomic_ref{ - *(words_ + (block_index * words_per_block + rank))}; - atom_word.fetch_or(policy_.word_pattern(hash_value, rank), cuda::memory_order_relaxed); + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + add_patterns_cs(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, i), + group.thread_rank()); + } + } } else { - add_impl_group_functor functor{ - hash_value, block_index, words_, words_per_block, rank, num_threads, policy_}; - cuda::static_for(functor); +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + add_patterns( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()); + } + } } } + // Device-side range add 
(single-thread): loops over keys. template - __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream) + __device__ void add(InputIt first, InputIt last) { - this->add_async(first, last, stream); -#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) - stream.sync(); -#else - stream.wait(); -#endif + auto const num_keys = cuco::detail::distance(first, last); + for (decltype(num_keys) i = 0; i < num_keys; ++i) { + this->add(*(first + i)); + } + } + + // Device-side range add (cooperative). When the tile size matches `add_horizontal_layout`, + // each batch loads/hashes one key per lane in parallel, then the tile cooperatively processes + // them via `add_coop`. Otherwise the tile parallelizes across the key range with each lane + // scalar-inserting its own keys. + template + __device__ void add(CG group, InputIt first, InputIt last) + { + using key_type = typename cuda::std::iterator_traits::value_type; + auto const num_keys = cuco::detail::distance(first, last); + if constexpr (tile_size_v == add_horizontal_layout && add_horizontal_layout > 1) { + auto constexpr num_threads = static_cast(tile_size_v); + for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) { + auto const idx = batch + static_cast(group.thread_rank()); + auto const is_valid = idx < num_keys; + key_type const key = is_valid ? 
*(first + idx) : key_type{}; + this->template add_coop(group, key, is_valid); + } + } else { + auto const stride = static_cast(tile_size_v); + for (auto i = static_cast(group.thread_rank()); i < num_keys; + i += stride) { + this->add(*(first + i)); + } + } } + // Host-side Add Entry Points template - __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream) + __host__ void add_async(InputIt first, InputIt last, cuda::stream_ref stream) noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - if constexpr (words_per_block == 1) { - CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN( - first, - num_keys, - [*this] __device__(key_type const key) mutable { this->add(key); }, - stream.get())); + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(add_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_add_kernel + ? cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + auto const l2_cache_size = static_cast(cuco::detail::l2_cache_size()); + auto const filter_size = static_cast(static_cast(num_blocks_)) * + words_per_block * sizeof(word_type); + + if (2 * filter_size < l2_cache_size) { + if constexpr (tuning::use_work_stealing_add_kernel) { + detail::bloom_filter_ns::add_work_stealing_n + <<>>(first, num_keys, *this); + } else { + detail::bloom_filter_ns::add_n + <<>>(first, num_keys, *this); + } } else { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr block_size = cuco::detail::default_block_size(); - void const* kernel = reinterpret_cast( - detail::bloom_filter_ns::add); - auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel); - - detail::bloom_filter_ns::add - <<>>(first, num_keys, *this); + if constexpr (tuning::use_work_stealing_add_kernel) { + detail::bloom_filter_ns::add_work_stealing_n + <<>>(first, 
num_keys, *this); + } else { + detail::bloom_filter_ns::add_n + <<>>(first, num_keys, *this); + } } } - template - __host__ constexpr void add_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) + template + __host__ void add(InputIt first, InputIt last, cuda::stream_ref stream) noexcept { - this->add_if_async(first, last, stencil, pred, stream); + this->add_async(first, last, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -509,87 +435,232 @@ class bloom_filter_impl { #endif } - template - __host__ constexpr void add_if_async(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - cuda::stream_ref stream) noexcept + // Single Thread Contains. Layout-agnostic: when `contains_horizontal_layout > 1`, runs the + // per-virtual-thread work serially in the calling thread. + template + __device__ bool contains(ProbeKey probe_key) const { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr cg_size = add_optimal_cg_size(); - auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); - - detail::bloom_filter_ns::add_if_n - <<>>(first, num_keys, stencil, pred, *this); + auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + + if constexpr (contains_horizontal_layout == 1) { + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + return compare_pattern_cs<0>(block_index, lower_hash, group_hash); + } else { + return compare_pattern<0>(block_index, lower_hash); + } + } else { + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + bool result = true; +#pragma unroll + for 
(uint32_t thread_index = 0; thread_index < contains_horizontal_layout; ++thread_index) { + result = + result && compare_patterns_cs<0>(block_index, lower_hash, group_hash, thread_index); + } + return result; + } else { + bool result = true; +#pragma unroll + for (uint32_t thread_index = 0; thread_index < contains_horizontal_layout; ++thread_index) { + result = result && compare_patterns<0>(block_index, lower_hash, thread_index); + } + return result; + } + } } - template - [[nodiscard]] __device__ bool contains(ProbeKey const& key) const + // Multi Thread Contains. Layout-flexible: when the CG size matches `contains_horizontal_layout`, + // runs the optimal cooperative path with one shared hash evaluation and an AND-reduction across + // the group; otherwise has one lane do the layout-agnostic scalar query and broadcasts the + // result so every lane returns the same value. + template + __device__ bool contains(CG group, ProbeKey probe_key) const { - auto const hash_value = policy_.hash(key); + namespace cg = cooperative_groups; - auto const stored_pattern = this->vec_load_words( - policy_.block_index(hash_value, num_blocks_) * words_per_block); + if constexpr (contains_horizontal_layout == 1 || + tile_size_v != contains_horizontal_layout) { + return cg::invoke_one_broadcast( + group, [&] __device__() -> bool { return this->contains(probe_key); }); + } - bool result = true; - contains_functor functor{ - hash_value, stored_pattern, policy_, &result}; - cuda::static_for(functor); - if (!result) { return false; } + auto const [upper_hash, lower_hash, block_index] = [&] __device__ { + if constexpr (tuning::use_invoke_one) { + return cg::invoke_one_broadcast(group, [&] __device__() { + auto const sh = policy_.split_hash(probe_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + }); + } else { + auto const sh = policy_.split_hash(probe_key); + return cuda::std::make_tuple( + sh.first, sh.second, 
policy_.block_index(sh.first, num_blocks_)); + } + }(); - return true; + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; + return group.all( + compare_patterns_cs<0>(block_index, lower_hash, group_hash, group.thread_rank())); + } else { + return group.all(compare_patterns<0>(block_index, lower_hash, group.thread_rank())); + } } + // Warp-cooperative Contains template - [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const + __device__ bool contains_coop(CG group, ProbeKey probe_key) const { - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = contains_optimal_cg_size(); - constexpr auto words_per_thread = words_per_block / optimal_num_threads; + constexpr auto num_threads = tile_size_v; - // If single thread is optimal, use scalar contains - if constexpr (num_threads == 1 or optimal_num_threads == 1) { - return this->contains(key); + auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + bool result_out = false; + + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + auto const result = group.all(compare_patterns_cs<0>(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, i), + group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } + } } else { - auto const rank = group.thread_rank(); - auto const hash_value = policy_.hash(key); - bool success = true; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + auto const result = group.all(compare_patterns<0>( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } + } + } + return result_out; + } -// Use pragma unroll instead of cuda::static_for to 
avoid CUDA 12.0 compatibility issues -#pragma unroll - for (size_type i = 0; i < optimal_num_threads; ++i) { - if (i >= rank && (i - rank) % num_threads == 0) { - auto const thread_offset = i * words_per_thread; - auto const stored_pattern = this->vec_load_words( - policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset); + template + __device__ bool contains_coop(CG group, ProbeKey probe_key, bool is_valid) const + { + constexpr auto num_threads = tile_size_v; -#pragma unroll - for (size_type j = 0; j < words_per_thread; ++j) { - auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j); - if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; } - } + // Compute the hash and block index only for lanes whose key is valid; invalid lanes' values + // are never used (the shfl loop below gates per-iteration work on `group.shfl(is_valid, i)`). + // Skipping the hash here saves work on sparse stencil-gated queries. + uint32_t upper_hash = 0; + uint32_t lower_hash = 0; + size_type block_index = 0; + if (is_valid) { + auto const sh = policy_.split_hash(probe_key); + upper_hash = sh.first; + lower_hash = sh.second; + block_index = policy_.block_index(upper_hash, num_blocks_); + } + + bool result_out = false; + if constexpr (is_cache_sectorized) { + auto const group_hash = lower_hash * policy_type::group_index_salt; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + auto const result = group.all(compare_patterns_cs<0>(group.shfl(block_index, i), + group.shfl(lower_hash, i), + group.shfl(group_hash, i), + group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } } } + } else { +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + auto const result = group.all(compare_patterns<0>( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank())); + if (i == 
group.thread_rank()) { result_out = result; } + } + } + } + return result_out; + } + + // Device-side range contains (single-thread): loops over keys. + template + __device__ void contains(InputIt first, InputIt last, OutputIt output_begin) const + { + auto const num_keys = cuco::detail::distance(first, last); + for (decltype(num_keys) i = 0; i < num_keys; ++i) { + *(output_begin + i) = this->contains(*(first + i)); + } + } - return group.all(success); + // Device-side range contains (cooperative). When the tile size matches + // `contains_horizontal_layout`, each batch loads/hashes one key per lane in parallel, then the + // tile cooperatively processes them via `contains_coop` so each lane gets the result for its + // own key. Otherwise the tile parallelizes across the key range with each lane scalar-querying + // its own keys. + template + __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const + { + using key_type = typename cuda::std::iterator_traits::value_type; + auto const num_keys = cuco::detail::distance(first, last); + if constexpr (tile_size_v == contains_horizontal_layout && contains_horizontal_layout > 1) { + auto constexpr num_threads = static_cast(tile_size_v); + for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) { + auto const idx = batch + static_cast(group.thread_rank()); + auto const is_valid = idx < num_keys; + key_type const key = is_valid ? 
*(first + idx) : key_type{}; + auto const result = this->contains_coop(group, key, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + auto const stride = static_cast(tile_size_v); + for (auto i = static_cast(group.thread_rank()); i < num_keys; + i += stride) { + *(output_begin + i) = this->contains(*(first + i)); + } } } - // TODO - // template - // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) - // const; + // Host-side Contains Entry Points + template + __host__ void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream) const noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + if constexpr (tuning::use_cub_kernels and ((words_per_block / contains_vertical_layout) == 1)) { + cub::DeviceTransform::Transform( + first, + output_begin, + num_keys, + [*this] __device__(auto const& key) { return this->contains(key); }, + stream.get()); + } else { + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(contains_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_contains_kernel + ? 
cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + + if constexpr (tuning::use_work_stealing_contains_kernel) { + detail::bloom_filter_ns::contains_work_stealing_n + <<>>(first, num_keys, output_begin, *this); + } else { + detail::bloom_filter_ns::contains_n + <<>>(first, num_keys, output_begin, *this); + } + } + } template __host__ void contains(InputIt first, InputIt last, OutputIt output_begin, - cuda::stream_ref stream) const + cuda::stream_ref stream) const noexcept { this->contains_async(first, last, output_begin, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) @@ -599,25 +670,43 @@ class bloom_filter_impl { #endif } - template - __host__ void contains_async(InputIt first, - InputIt last, - OutputIt output_begin, - cuda::stream_ref stream) const noexcept + // Host-side stencil-gated Add Entry Points + template + __host__ void add_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { - auto const always_true = cuda::constant_iterator{true}; - this->contains_if_async(first, last, always_true, cuda::std::identity{}, output_begin, stream); + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(add_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_add_kernel + ? 
cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + auto const l2_cache_size = static_cast(cuco::detail::l2_cache_size()); + auto const filter_size = static_cast(static_cast(num_blocks_)) * + words_per_block * sizeof(word_type); + + if (2 * filter_size < l2_cache_size) { + detail::bloom_filter_ns::add_if_n + <<>>(first, num_keys, stencil, pred, *this); + } else { + detail::bloom_filter_ns::add_if_n + <<>>(first, num_keys, stencil, pred, *this); + } } - template - __host__ void contains_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - OutputIt output_begin, - cuda::stream_ref stream) const + template + __host__ void add_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { - this->contains_if_async(first, last, stencil, pred, output_begin, stream); + this->add_if_async(first, last, stencil, pred, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -625,6 +714,7 @@ class bloom_filter_impl { #endif } + // Host-side stencil-gated Contains Entry Points template __host__ void contains_if_async(InputIt first, InputIt last, @@ -636,20 +726,26 @@ class bloom_filter_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto constexpr cg_size = contains_optimal_cg_size(); - auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(contains_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_contains_kernel + ? 
cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); detail::bloom_filter_ns::contains_if_n <<>>( first, num_keys, stencil, pred, output_begin, *this); } - __host__ constexpr void merge(bloom_filter_impl const& other, - cuda::stream_ref stream) + template + __host__ void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda::stream_ref stream) const noexcept { - this->merge_async(other, stream); + this->contains_if_async(first, last, stencil, pred, output_begin, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -657,81 +753,332 @@ class bloom_filter_impl { #endif } - __host__ constexpr void merge_async(bloom_filter_impl const& other, - cuda::stream_ref stream) + // TODO + // [[nodiscard]] __host__ double occupancy() const; + // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const + // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks) + // template + // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const + // memory_to_use, cuda_thread_scope scope = {}) const noexcept; + + // private: + template + __device__ constexpr cuda::std::array vec_load_words(size_type index) const { - CUCO_EXPECTS(this->block_extent() == other.block_extent(), - "mismatching num_blocks in merge_async"); - CUCO_CUDA_TRY(cub::DeviceTransform::Transform( - cuda::std::tuple{this->data(), other.data()}, - this->data(), - this->block_extent() * words_per_block, - [] __device__(word_type a, word_type b) { return a | b; }, - stream.get())); + // The block storage is aligned to `alignment()`, but a per-lane load at offset `index` is + // only guaranteed to be aligned to `min(NumWords * sizeof(word_type), alignment())`. Hand the + // compiler the alignment that's actually delivered, not the block-level maximum. 
+ constexpr auto load_alignment = + cuda::std::min(NumWords * sizeof(word_type), alignment()); + return *reinterpret_cast*>( + __builtin_assume_aligned(words_ + index, load_alignment)); } - __host__ constexpr void intersect(bloom_filter_impl const& other, - cuda::stream_ref stream) + //===--------------------------------------------------===// + // Parametric Filter Policy + //===--------------------------------------------------===// + /// Insert the given pattern into the filter + // Precondition: add_horizontal_layout == 1 + template + __device__ constexpr void add_pattern(uint32_t block_index, uint32_t lower_hash) { - this->intersect_async(other, stream); -#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) - stream.sync(); -#else - stream.wait(); -#endif + static_assert(add_horizontal_layout == 1, "add_pattern() requires add_horizontal_layout == 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern(lower_hash); + auto* word_base = words_ + block_index * words_per_block + LoopIndex * add_vertical_layout; + + for (int i = 0; i < add_vertical_layout; ++i) { + atomic_or(word_base + i, pattern[i]); + } + + // Recurse. 
+ add_pattern(block_index, lower_hash); + } } - __host__ constexpr void intersect_async( - bloom_filter_impl const& other, cuda::stream_ref stream) + // Precondition: add_horizontal_layout > 1 + template + __device__ constexpr void add_patterns(uint32_t block_index, + uint32_t lower_hash, + uint32_t thread_index) { - CUCO_EXPECTS(this->block_extent() == other.block_extent(), - "mismatching num_blocks in intersect_async"); - CUCO_CUDA_TRY(cub::DeviceTransform::Transform( - cuda::std::tuple{this->data(), other.data()}, - this->data(), - this->block_extent() * words_per_block, - [] __device__(word_type a, word_type b) { return a & b; }, - stream.get())); + static_assert(add_horizontal_layout > 1, "add_patterns() requires add_horizontal_layout > 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern( + lower_hash, thread_index); + auto* word_base = words_ + block_index * words_per_block + + LoopIndex * add_vertical_layout * add_horizontal_layout + + thread_index * add_vertical_layout; + + for (int i = 0; i < add_vertical_layout; ++i) { + atomic_or(word_base + i, pattern[i]); + } + + // Recurse. 
+ add_patterns(block_index, lower_hash, thread_index); + } } - [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; } + //===----------Cache-Sectorized Add----------===// + template + __device__ constexpr void add_pattern_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash) + { + auto constexpr add_groups_per_vertical_layout = policy_type::add_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(add_horizontal_layout == 1, "add_pattern() requires add_horizontal_layout == 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern(lower_hash); + auto* word_base = words_ + block_index * words_per_block + LoopIndex * add_vertical_layout; + + for (int i = 0; i < add_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> (i + LoopIndex * add_groups_per_vertical_layout) * group_index_width) & + group_index_mask; + atomic_or(word_base + i * words_per_group + group_index, pattern[i]); + } - [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept + // Recurse. 
+ add_pattern_cs(block_index, lower_hash, group_hash); + } + } + + template + __device__ constexpr void add_patterns_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash, + uint32_t thread_index) { - return words_; + auto constexpr add_groups_per_vertical_layout = policy_type::add_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(add_horizontal_layout > 1, "add_patterns() requires add_horizontal_layout > 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern( + lower_hash, thread_index); + auto* word_base = words_ + block_index * words_per_block + + LoopIndex * add_vertical_layout * add_horizontal_layout + + thread_index * add_vertical_layout; + + for (int i = 0; i < add_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> (i + LoopIndex * add_groups_per_vertical_layout * add_horizontal_layout + + thread_index * add_groups_per_vertical_layout) * + group_index_width) & + group_index_mask; + atomic_or(word_base + i * words_per_group + group_index, pattern[i]); + } + + // Recurse. 
+ add_patterns_cs( + block_index, lower_hash, group_hash, thread_index); + } } - [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept + template + __device__ constexpr void atomic_or(word_type* word_ptr, word_type pattern) const { - return num_blocks_; + if constexpr (tuning::use_cuda_atomic_ref) { + if constexpr (ConditionalAtomic) { + if ((*word_ptr & pattern) != pattern) { + auto atom_word = cuda::atomic_ref{*word_ptr}; + atom_word.fetch_or(pattern, cuda::memory_order_relaxed); + } + } else { + auto atom_word = cuda::atomic_ref{*word_ptr}; + atom_word.fetch_or(pattern, cuda::memory_order_relaxed); + } + } else { + if constexpr (ConditionalAtomic) { + if ((*word_ptr & pattern) != pattern) { + atomicOr(reinterpret_cast(word_ptr), + static_cast(pattern)); + } + } else { + atomicOr(reinterpret_cast(word_ptr), + static_cast(pattern)); + } + } } - // TODO - // [[nodiscard]] __host__ double occupancy() const; - // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const - // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks) - // template - // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const - // memory_to_use, cuda_thread_scope scope = {}) const noexcept; + /// Compare the stored pattern against the expected pattern for the given hash value. 
+ // Precondition: contains_horizontal_layout == 1 + template + __device__ constexpr bool compare_pattern(uint32_t block_index, uint32_t lower_hash) const + { + static_assert(contains_horizontal_layout == 1, + "compare_pattern() requires contains_horizontal_layout == 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const stored_pattern = this->vec_load_words( + block_index * words_per_block + LoopIndex * contains_vertical_layout); + auto const expected_pattern = + policy_.template array_pattern(lower_hash); + + bool match = true; + for (int i = 0; i < contains_vertical_layout; ++i) { + match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i]; + } - private: - template - __device__ constexpr cuda::std::array vec_load_words(size_type index) const + // Recurse. + // Early exit in this implementation occurs at the granulairy of contains_vertical_layout + // words. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_pattern(block_index, lower_hash); + } else { + return compare_pattern(block_index, lower_hash) && match; + } + } else { + return true; + } + } + + // Precondition: contains_horizontal_layout > 1. + // Returns the per-thread AND across the loop-iteration slice owned by `thread_index`. Callers + // that need a per-CG result must reduce across the group (e.g. `group.all(...)`). 
+ template + __device__ constexpr bool compare_patterns(uint32_t block_index, + uint32_t lower_hash, + uint32_t thread_index) const { - return *reinterpret_cast*>(__builtin_assume_aligned( - words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes()))); + static_assert(contains_horizontal_layout > 1, + "compare_patterns() requires HorizontalLayout > 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const stored_pattern = this->vec_load_words( + block_index * words_per_block + + LoopIndex * contains_vertical_layout * contains_horizontal_layout + + thread_index * contains_vertical_layout); + auto const expected_pattern = + policy_ + .template array_pattern( + lower_hash, thread_index); + + bool match = true; + for (int i = 0; i < contains_vertical_layout; ++i) { + match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i]; + } + + // Per-thread early exit: short-circuit this thread's recursion if its slice already missed. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_patterns(block_index, lower_hash, thread_index); + } else { + return compare_patterns(block_index, lower_hash, thread_index) && match; + } + } else { + return true; + } } - [[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size() + //===----------Cache-Sectorized Compare----------===// + template + __device__ constexpr bool compare_pattern_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash) const { - return words_per_block; // one thread per word so atomic updates can be coalesced + auto constexpr contains_groups_per_vertical_layout = + policy_type::contains_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(contains_horizontal_layout == 1, + "compare_pattern() requires 
contains_horizontal_layout == 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const* word_base = + words_ + block_index * words_per_block + LoopIndex * contains_vertical_layout; + auto const expected_pattern = + policy_.template array_pattern(lower_hash); + + bool match = true; + for (int i = 0; i < contains_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> + (i + LoopIndex * contains_groups_per_vertical_layout) * group_index_width) & + group_index_mask; + match &= (word_base[i * words_per_group + group_index] & expected_pattern[i]) == + expected_pattern[i]; + } + + // Recurse. + // Early exit in this implementation occurs at the granulairy of contains_vertical_layout + // words. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_pattern_cs(block_index, lower_hash, group_hash); + } else { + return compare_pattern_cs(block_index, lower_hash, group_hash) && match; + } + } else { + return true; + } } - [[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size() + template + __device__ constexpr bool compare_patterns_cs(uint32_t block_index, + uint32_t lower_hash, + uint32_t group_hash, + uint32_t thread_index) const { - constexpr auto word_bytes = sizeof(word_type); - constexpr auto block_bytes = word_bytes * words_per_block; - return block_bytes / max_vec_bytes(); // one vector load per thread + auto constexpr contains_groups_per_vertical_layout = + policy_type::contains_groups_per_vertical_layout; + auto constexpr group_index_width = policy_type::group_index_width; + auto constexpr group_index_mask = policy_type::group_index_mask; + auto constexpr words_per_group = policy_type::words_per_group; + + static_assert(contains_horizontal_layout > 1, + "compare_patterns_cs() requires HorizontalLayout > 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const* word_base = words_ + block_index * words_per_block + + LoopIndex * contains_vertical_layout 
* contains_horizontal_layout + + thread_index * contains_vertical_layout; + auto const expected_pattern = + policy_ + .template array_pattern( + lower_hash, thread_index); + + bool match = true; + for (int i = 0; i < contains_groups_per_vertical_layout; ++i) { + auto const group_index = + (group_hash >> + (i + LoopIndex * contains_groups_per_vertical_layout * contains_horizontal_layout + + thread_index * contains_groups_per_vertical_layout) * + group_index_width) & + group_index_mask; + match &= (word_base[i * words_per_group + group_index] & expected_pattern[i]) == + expected_pattern[i]; + } + + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_patterns_cs( + block_index, lower_hash, group_hash, thread_index); + } else { + return compare_patterns_cs( + block_index, lower_hash, group_hash, thread_index) && + match; + } + } else { + return true; + } } word_type* words_; diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl index b251dceee..e6f7d9d89 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -52,7 +52,7 @@ __host__ constexpr void bloom_filter_ref::clear(cuda template __host__ constexpr void bloom_filter_ref::clear_async( - cuda::stream_ref stream) + cuda::stream_ref stream) noexcept { impl_.clear_async(stream); } @@ -64,6 +64,13 @@ __device__ void bloom_filter_ref::add(ProbeKey const impl_.add(key); } +template +template +__device__ void bloom_filter_ref::add(InputIt first, InputIt last) +{ + impl_.add(first, last); +} + template template __device__ void bloom_filter_ref::add(CG group, ProbeKey const& key) @@ -92,7 +99,7 @@ __host__ constexpr void bloom_filter_ref::add(InputI template template __host__ constexpr void bloom_filter_ref::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { impl_.add_async(first, last, stream); } @@ -121,6 +128,15 @@ template return impl_.contains(key); } +template +template +__device__ void bloom_filter_ref::contains(InputIt first, + InputIt last, + OutputIt output_begin) const +{ + impl_.contains(first, last, output_begin); +} + template template [[nodiscard]] __device__ bool bloom_filter_ref::contains( @@ -129,6 +145,16 @@ template return impl_.contains(group, key); } +template +template +__device__ void bloom_filter_ref::contains(CG group, + InputIt first, + InputIt last, + OutputIt output_begin) const +{ + impl_.contains(group, first, last, output_begin); +} + template template __host__ constexpr void bloom_filter_ref::contains( diff --git a/include/cuco/detail/bloom_filter/default_filter_policy.inl b/include/cuco/detail/bloom_filter/default_filter_policy.inl deleted file mode 100644 index eb8dbf703..000000000 --- a/include/cuco/detail/bloom_filter/default_filter_policy.inl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cuco { - -template -__host__ - __device__ constexpr default_filter_policy::default_filter_policy( - uint32_t pattern_bits, Hash hash) - : impl_{pattern_bits, hash} -{ -} - -template -__device__ constexpr typename default_filter_policy::hash_result_type -default_filter_policy::hash( - typename default_filter_policy::hash_argument_type const& key) const -{ - return impl_.hash(key); -} - -template -template -__device__ constexpr auto default_filter_policy::block_index( - typename default_filter_policy::hash_result_type hash, - Extent num_blocks) const -{ - return impl_.block_index(hash, num_blocks); -} - -template -__device__ constexpr typename default_filter_policy::word_type -default_filter_policy::word_pattern( - default_filter_policy::hash_result_type hash, - std::uint32_t word_index) const -{ - return impl_.word_pattern(hash, word_index); -} - -} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh deleted file mode 100644 index 14509b9b0..000000000 --- a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -#include -#include - -namespace cuco::detail { - -template -class default_filter_policy_impl { - public: - using hasher = Hash; - using word_type = Word; - using hash_argument_type = typename hasher::argument_type; - using hash_result_type = decltype(std::declval()(std::declval())); - - static constexpr std::uint32_t words_per_block = WordsPerBlock; - - private: - static constexpr std::uint32_t word_bits = cuda::std::numeric_limits::digits; - static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); - - public: - __host__ __device__ explicit constexpr default_filter_policy_impl(uint32_t pattern_bits, - Hash hash) - : pattern_bits_{pattern_bits}, - min_bits_per_word_{pattern_bits_ / words_per_block}, - remainder_bits_{pattern_bits_ % words_per_block}, - hash_{hash} - { - NV_DISPATCH_TARGET( - NV_IS_HOST, - ( // This ensures each word in the block has at least one bit set; otherwise we would never - // use some of the words - constexpr uint32_t min_pattern_bits = words_per_block; - - // The maximum number of bits to be set for a key is capped by the total number of bits in - // the filter block - constexpr uint32_t max_pattern_bits = word_bits * words_per_block; - - constexpr uint32_t hash_bits = cuda::std::numeric_limits::digits; - constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width; - CUCO_EXPECTS( - pattern_bits <= max_pattern_bits_from_hash, - "`hash_result_type` too narrow to generate the requested number of 
`pattern_bits`"); - CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits, - "`pattern_bits` must be at least `words_per_block`"); - CUCO_EXPECTS(pattern_bits_ <= max_pattern_bits, - "`pattern_bits` must be less than the total number of bits in a filter " - "block");)) - // TODO find a proper way to perform input checks/assertions on device without destroying the - // context (e.g. __trap()) - } - - __device__ constexpr hash_result_type hash(hash_argument_type const& key) const - { - return hash_(key); - } - - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const - { - return hash % num_blocks; - } - - __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const - { - word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1; - - auto const bits_so_far = min_bits_per_word_ * word_index + - (word_index < remainder_bits_ ? word_index : remainder_bits_); - - hash >>= bits_so_far * bit_index_width; - - word_type word = 0; - int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0); - - for (int32_t bit = 0; bit < bits_per_word; ++bit) { - word |= word_type{1} << (hash & bit_index_mask); - hash >>= bit_index_width; - } - - return word; - } - - private: - uint32_t pattern_bits_; - uint32_t min_bits_per_word_; - uint32_t remainder_bits_; - hasher hash_; -}; - -} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index 6c918fb7d..55b1df7fe 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,39 +18,309 @@ #include #include +#include #include #include #include +#include namespace cuco::detail::bloom_filter_ns { CUCO_SUPPRESS_KERNEL_WARNINGS +template +__device__ void add_n_impl(InputIt first, cuco::detail::index_type n, Ref ref) +{ + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + ref.add_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type const& key = is_valid ? *(first + idx) : key_type{}; + ref.add_coop(group, key, is_valid); + } + } else { + auto const idx = cuco::detail::global_thread_id() / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(key); + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(group, key); + } + } + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void add_n(InputIt first, + cuco::detail::index_type n, + Ref ref) +{ + add_n_impl(first, n, ref); +} -template -CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first, - cuco::detail::index_type n, - Ref ref) +template +__device__ void add_work_stealing_n_impl(InputIt first, cuco::detail::index_type n, Ref ref) { - namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + namespace cg = cooperative_groups; + namespace ptx = cuda::ptx; + + // Cluster launch control initialization: + __shared__ uint4 result; + __shared__ uint64_t bar; + int phase = 0; - constexpr auto tile_size = cuco::detail::warp_size(); + auto const block 
= cg::this_thread_block(); - auto const tile_idx = cuco::detail::global_thread_id() / tile_size; - auto const n_tiles = gridDim.x * BlockSize / tile_size; - auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles); + cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); }); - auto const tile_start = tile_idx * items_per_tile; - if (tile_start >= n) { return; } - auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n; + int bx = blockIdx.x; - auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Work-stealing loop: + while (true) { + // Protect result from overwrite in the next iteration, + // (also ensure barrier initialization at 1st iteration): + block.sync(); + + cg::invoke_one(block, [&]() { + // Acquire write of result in the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster); + + cg::invoke_one(cg::coalesced_threads(), + [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); }); + ptx::mbarrier_arrive_expect_tx( + ptx::sem_relaxed, ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4)); + }); + + // Computation: + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x; + auto group = cg::tiled_partition(block); + auto const is_full_tile = (bx + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + ref.add_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type const& key = is_valid ? 
*(first + idx) : key_type{}; + ref.add_coop(group, key, is_valid); + } + } else { + cuco::detail::index_type const idx = + (static_cast(BlockSize) * bx + threadIdx.x) / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(key); + } + } else { + auto group = cg::tiled_partition(block); + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(group, key); + } + } + } - ref.add(tile, first + tile_start, first + tile_stop); + // Cancellation request synchronization: + while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {} + phase ^= 1; + + // Cancellation request decoding: + bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result); + if (!success) break; + + bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x(result); + + // Release read of result to the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_release, ptx::space_shared, ptx::scope_cluster); + } } -template +CUCO_KERNEL __launch_bounds__(BlockSize) void add_work_stealing_n(InputIt first, + cuco::detail::index_type n, + Ref ref) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_100, + (add_work_stealing_n_impl(first, n, ref);), + (add_n_impl(first, n, ref);)) +} + +template +__device__ void contains_n_impl(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type 
const& key = is_valid ? *(first + idx) : key_type{}; + auto const result = ref.contains_coop(group, key, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + auto idx = cuco::detail::global_thread_id() / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). + auto const found = ref.contains(group, key); + if (group.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void contains_n(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + contains_n_impl(first, n, output_begin, ref); +} + +template +__device__ void contains_work_stealing_n_impl(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + using key_type = typename cuda::std::iterator_traits::value_type; + + namespace cg = cooperative_groups; + namespace ptx = cuda::ptx; + + // Cluster launch control initialization: + __shared__ uint4 result; + __shared__ uint64_t bar; + int phase = 0; + + auto const block = cg::this_thread_block(); + + cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); }); + + int bx = blockIdx.x; + + // Work-stealing loop: + while (true) { + // Protect result from overwrite in the next iteration, + // (also ensure barrier initialization at 1st iteration): + block.sync(); + + cg::invoke_one(block, [&]() { + // Acquire write of result in the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster); + + cg::invoke_one(cg::coalesced_threads(), + [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); }); + ptx::mbarrier_arrive_expect_tx( + ptx::sem_relaxed, 
ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4)); + }); + + // Computation: + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x; + auto group = cg::tiled_partition(block); + auto const is_full_tile = (bx + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains_coop(group, key); + } else { + auto const is_valid = idx < n; + key_type const& key = is_valid ? *(first + idx) : key_type{}; + auto const result = ref.contains_coop(group, key, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + cuco::detail::index_type const idx = + (static_cast(BlockSize) * bx + threadIdx.x) / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } + } else { + auto group = cg::tiled_partition(block); + if (idx < n) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). 
+ auto const found = ref.contains(group, key); + if (group.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + } + + // Cancellation request synchronization: + while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {} + phase ^= 1; + + // Cancellation request decoding: + bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result); + if (!success) break; + + bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x(result); + + // Release read of result to the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_release, ptx::space_shared, ptx::scope_cluster); + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void contains_work_stealing_n(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_100, + (contains_work_stealing_n_impl(first, n, output_begin, ref);), + (contains_n_impl(first, n, output_begin, ref);)) +} + +template ::value_type; - [[maybe_unused]] auto const tile = - cg::tiled_partition(cg::this_thread_block()); - - while (idx < n) { - if (pred(*(stencil + idx))) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + idx)}; - ref.add(tile, insert_element); + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const in_range = idx < n; + bool is_valid = false; + key_type key{}; + if (in_range) { + key = *(first + idx); + is_valid = pred(*(stencil + idx)); + } + ref.template add_coop(group, key, is_valid); + } else { + auto const idx = cuco::detail::global_thread_id() / CGSize; + if (idx < n && pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + if constexpr (CGSize == 1) { + ref.template add(key); + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + ref.template add(group, key); + } } - idx 
+= loop_stride; } } @@ -88,30 +368,44 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, - OutputIt out, + OutputIt output_begin, Ref ref) { - namespace cg = cooperative_groups; - - auto const loop_stride = cuco::detail::grid_stride() / CGSize; - auto idx = cuco::detail::global_thread_id() / CGSize; - - [[maybe_unused]] auto const tile = - cg::tiled_partition(cg::this_thread_block()); + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; - if constexpr (CGSize == 1) { - while (idx < n) { - typename cuda::std::iterator_traits::value_type const& key = *(first + idx); - *(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false; - idx += loop_stride; + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const in_range = idx < n; + bool is_valid = false; + key_type key{}; + if (in_range) { + key = *(first + idx); + is_valid = pred(*(stencil + idx)); } + auto const result = ref.contains_coop(group, key, is_valid); + if (in_range) { *(output_begin + idx) = is_valid ? result : false; } } else { - auto const tile = cg::tiled_partition(cg::this_thread_block()); - while (idx < n) { - typename cuda::std::iterator_traits::value_type const& key = *(first + idx); - auto const found = pred(*(stencil + idx)) ? 
ref.contains(tile, key) : false; - if (tile.thread_rank() == 0) { *(out + idx) = found; } - idx += loop_stride; + auto const idx = cuco::detail::global_thread_id() / CGSize; + if (idx < n) { + if constexpr (CGSize == 1) { + if (pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } else { + *(output_begin + idx) = false; + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + bool result = false; + if (pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). + result = ref.contains(group, key); + } + if (group.thread_rank() == 0) { *(output_begin + idx) = result; } + } } } } diff --git a/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh new file mode 100644 index 000000000..341850c6b --- /dev/null +++ b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2025-2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco::detail { + +/** + * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation. 
+ * + * Implements the Sectorized Bloom Filter (SBF) and Cache-Sectorized Bloom Filter (CSBF) variants + * from "Optimizing Bloom Filters for Modern GPU Architectures" (arXiv:2512.15595). Distributes + * `PatternBits` set bits across `WordsPerBlock` words using compile-time salt-based multiplicative + * hashing. The hash result is split into upper 32 bits (block selection via multiply-shift) and + * lower 32 bits (pattern generation), so a 64-bit hash function is required by design. + * + * @tparam Hash 64-bit hash functor whose return type satisfies `is_same_v`. + * @tparam Word Underlying word type of a filter block. Must be an atomically updatable integral. + * @tparam WordsPerBlock Words per filter block. Must be a power of two and <= 32. + * @tparam PatternBits Number of fingerprint bits (k in the paper). + * @tparam AddHorizontalLayout CG size used for `add` (paper's Theta). Must be a power of two and + * `AddHorizontalLayout * AddVerticalLayout <= WordsPerBlock`. + * @tparam AddVerticalLayout Contiguous words processed per thread per `add` step (paper's Phi). + * @tparam ContainsHorizontalLayout CG size used for `contains` (paper's Theta). + * @tparam ContainsVerticalLayout Contiguous words processed per thread per `contains` step (paper's + * Phi). + * @tparam GroupsPerBlock Cache-sectorization groups per block (paper's z). Defaults to + * `WordsPerBlock` for non-CSBF mode; setting `GroupsPerBlock < WordsPerBlock` enables CSBF. 
+ */ +template +class parametric_filter_policy { + public: + using hasher = Hash; ///< 64-bit hash functor type + using word_type = Word; ///< Underlying filter-block word type + using hash_argument_type = typename hasher::argument_type; ///< Hash function input type + using hash_result_type = + decltype(std::declval()(std::declval())); ///< Hash function + ///< output type + + private: + static constexpr uint32_t max_salts = 64; + static constexpr cuda::std::array salts = { + 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0x705495c7U, 0x2df1424bU, 0x9efc4947U, + 0x5c6bfb31U, 0xb24bcdffU, 0xb6843d6dU, 0x6db04543U, 0x3a12efddU, 0xb0ddd463U, 0x8d22f6e7U, + 0xb82f1e53U, 0x7db9f86bU, 0xc7afe639U, 0xfb135cd7U, 0x693256e1U, 0x9466d871U, 0x23d3d02fU, + 0x6461d049U, 0x66a91621U, 0xbaa3006fU, 0x52fb8d99U, 0x3ea88b4fU, 0xf470cfdU, 0xb1db79a5U, + 0x9809fcd1U, 0xbced4445U, 0x2eb7c737U, 0x2cea6803U, 0x156f1955U, 0x8813c027U, 0xa26819f9U, + 0x4c3b57bdU, 0x7df94487U, 0xb975e769U, 0xb8f20cb5U, 0x5c9e2e77U, 0x5fb1735fU, 0x3a6f759bU, + 0x3c090923U, 0xfced424dU, 0xa187a6a9U, 0x6f070a41U, 0x2c85233bU, 0x7e62258bU, 0x2771ef17U, + 0x13bbf093U, 0x4ff059e5U, 0xe3ce3d0fU, 0xf1b4789fU, 0x9fbb6173U, 0x6a320cf5U, 0x1be2c481U, + 0x7ba8222bU, 0x6fd619b3U, 0x7b1bbf0dU, 0x8b8993adU, 0x448eca95U, 0x82ab09d9U, 0x2ce53909U, + 0x4f548685U}; + static constexpr uint32_t word_bits = cuda::std::numeric_limits::digits; + + public: + static constexpr uint32_t words_per_block = WordsPerBlock; ///< Number of words per filter block + static constexpr uint32_t pattern_bits = PatternBits; ///< Fingerprint bits per key + + static constexpr uint32_t add_horizontal_layout = + AddHorizontalLayout; ///< horizontal vectorization layout for add operation + static constexpr uint32_t add_vertical_layout = + AddVerticalLayout; ///< vertical vectorization layout for add operation + static constexpr uint32_t contains_horizontal_layout = + ContainsHorizontalLayout; ///< horizontal vectorization layout for contains 
operation + static constexpr uint32_t contains_vertical_layout = + ContainsVerticalLayout; ///< vertical vectorization layout for contains operation + + static constexpr size_t max_filter_blocks = + cuda::std::numeric_limits::max(); ///< Upper bound on the number of filter blocks + /// Lower bound on `pattern_bits`: at least one bit per word so every word contributes. + static constexpr auto min_pattern_bits = words_per_block; + /// Upper bound on `pattern_bits`: the total number of bits in a filter block, capped by the + /// number of available salts. + static constexpr auto max_pattern_bits = cuda::std::min(word_bits * words_per_block, max_salts); + + //===----------Cache-Sectorized----------===// + static constexpr uint32_t groups_per_block = + GroupsPerBlock; ///< Cache-sectorization groups per block (paper's z) + static constexpr bool is_cache_sectorized = + groups_per_block != words_per_block ? true : false; ///< CSBF mode flag + static constexpr uint32_t words_per_group = + words_per_block / groups_per_block; ///< Words per cache-sectorization group + // TODO: when `pattern_bits % groups_per_block != 0`, using a ceil packs all remainder bits into + // the first `pattern_bits / max_bits_per_group` groups, leaving later groups with a zero + // expected pattern. This wastes block capacity and inflates FPR. Distribute floor bits to every + // group plus one extra bit to the first `pattern_bits % groups_per_block` groups, and update + // the salt-to-group mapping in `set_bits` accordingly. 
+ static constexpr uint32_t max_bits_per_group = cuco::detail::int_div_ceil( + pattern_bits, groups_per_block); ///< CSBF: max fingerprint bits set per group per key + static constexpr uint32_t add_groups_per_vertical_layout = + add_vertical_layout / words_per_group; ///< CSBF: groups touched per add vertical step + static constexpr uint32_t contains_groups_per_vertical_layout = + contains_vertical_layout / + words_per_group; ///< CSBF: groups touched per contains vertical step + static constexpr uint32_t group_index_salt = + 0x5bd1e995U; ///< CSBF: salt for selecting one word per group + static constexpr uint32_t group_index_width = cuda::std::bit_width( + words_per_group - 1); ///< CSBF: bits needed to encode an in-group word index + static constexpr uint32_t group_index_mask = + words_per_group - 1; ///< CSBF: mask for selecting an in-group word index + + private: + static constexpr uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); + // TODO: same problem as `max_bits_per_group`. For non-multiple + // `(pattern_bits, words_per_block)` configs (e.g. PatternBits=12, WordsPerBlock=8), the salt + // walk in `set_bits` advances `PatternArrayIndex` every `max_bits_per_word` salts, packing all + // bits into the first `ceil(pattern_bits / words_per_block)` words and leaving the rest at + // zero. This wastes block capacity and inflates FPR. Distribute floor bits to every word plus + // one extra bit to the first `pattern_bits % words_per_block` words, and update the + // salt-to-word mapping in `set_bits` accordingly. + static constexpr uint32_t max_bits_per_word = + cuco::detail::int_div_ceil(pattern_bits, words_per_block); + + public: + /** + * @brief Constructs a parametric filter policy. + * + * @param hash Hash function used to generate fingerprints. 
+ */ + __host__ __device__ constexpr parametric_filter_policy(Hash hash = {}) : hash_{hash} + { + static_assert(pattern_bits >= min_pattern_bits, + "pattern_bits must be at least words_per_block"); + static_assert(pattern_bits <= max_pattern_bits, + "pattern_bits must be less than the total number of bits in a filter block"); + // Require exact tiling. With `words_per_block` a power of two, this is equivalent to requiring + // both `add_horizontal_layout` and `add_vertical_layout` to be powers of two with product + // <= `words_per_block`. The internal loop count uses integer division on the product; non- + // dividing layouts would leave trailing words uninserted on add while contains still expects + // non-zero patterns there, producing false negatives for every inserted key. + static_assert(words_per_block % (add_horizontal_layout * add_vertical_layout) == 0, + "add_horizontal_layout * add_vertical_layout must evenly divide words_per_block"); + static_assert( + words_per_block % (contains_horizontal_layout * contains_vertical_layout) == 0, + "contains_horizontal_layout * contains_vertical_layout must evenly divide words_per_block"); + // The split_hash() design requires a 64-bit hash split into upper 32 bits (block selection + // via multiply-shift) and lower 32 bits (pattern generation via salt-based multiplicative + // hashing). This is a permanent design requirement, not a temporary limitation. + static_assert(cuda::std::is_same_v, + "parametric_filter_policy requires a 64-bit hash function"); + + //===----------Cache-Sectorized----------===// + static_assert( + is_cache_sectorized == false || (groups_per_block > 0 && groups_per_block < words_per_block && + (words_per_block % groups_per_block == 0)), + "in cache-sectorized filter, the number of groups must be positive, be fewer " + "than words_per_block, and evenly divide words_per_block"); + // Require the vertical layout to be a multiple of `words_per_group`. 
Floor-dividing + // `add_vertical_layout / words_per_group` to derive `add_groups_per_vertical_layout` would + // otherwise drop group-coverage for the trailing partial group, producing the same false- + // negative bug as the non-tiling check above. + static_assert(is_cache_sectorized == false || (add_vertical_layout % words_per_group == 0), + "in cache-sectorized filter, add_vertical_layout must be a multiple of " + "words_per_group"); + static_assert(is_cache_sectorized == false || (contains_vertical_layout % words_per_group == 0), + "in cache-sectorized filter, contains_vertical_layout must be a multiple of " + "words_per_group"); + static_assert(is_cache_sectorized == false || groups_per_block * group_index_width <= 32, + "in cache-sectorized filter, the number of bits needed to index groups must fit " + "within 32 bits"); + } + + /** + * @brief Splits the 64-bit hash of a key into its upper and lower 32 bits. + * + * The upper half is used for block selection (via multiply-shift); the lower half drives the + * per-word fingerprint pattern via salt-based multiplicative hashing. + * + * @param key Key to hash. + * + * @return `{upper 32 bits, lower 32 bits}` of the 64-bit hash. + */ + __device__ constexpr cuda::std::pair split_hash(hash_argument_type key) const + { + auto const hash_value = hash_(key); + return {static_cast(hash_value >> 32), static_cast(hash_value)}; + } + + /** + * @brief Determines the filter block a key maps to via fast multiply-shift modulo. + * + * @tparam Extent Size type used to determine the number of blocks in the filter. + * + * @param upper_hash_value Upper 32 bits of the key's hash. + * @param num_blocks Number of blocks in the filter. + * + * @return Block index in `[0, num_blocks)`. 
+ */ + template + __device__ constexpr auto block_index(uint32_t upper_hash_value, Extent num_blocks) const + { + return static_cast((static_cast(upper_hash_value) * + static_cast(num_blocks)) >> + 32); + } + + /** + * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is 1. + * + * @tparam LoopIndex Outer-loop iteration index when `words_per_block / VerticalLayout > 1`. + * @tparam VerticalLayout Number of contiguous words this call produces. + * + * @param lower_hash_value Lower 32 bits of the key's hash. + * + * @return Array of `VerticalLayout` (or `groups_per_vertical_layout` in CSBF mode) words. + */ + template + __device__ constexpr auto array_pattern(uint32_t lower_hash_value) const + { + return pattern_impl(lower_hash_value); + } + + /** + * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is > 1. + * + * @tparam LoopIndex Outer-loop iteration index. + * @tparam HorizontalLayout Cooperative-group size cooperating on a single key. + * @tparam VerticalLayout Number of contiguous words this call produces. + * + * @param lower_hash_value Lower 32 bits of the key's hash. + * @param thread_index Caller's rank within the cooperative group. + * + * @return Array of `VerticalLayout` (or `groups_per_vertical_layout` in CSBF mode) words owned + * by the calling thread. + */ + template + __device__ constexpr auto array_pattern(uint32_t lower_hash_value, uint32_t thread_index) const + { + return pattern_impl(lower_hash_value, + thread_index); + } + + private: + hasher hash_; + + /** + * @brief pattern_impl - Computes the bit pattern for a vertical layout of words. + * I use the terminology of a `virtual thread` to refer to an ordering of the vertical layouts, + * namely + * virtual_thread_index = LoopIndex * HorizontalLayout + thread_index, + * where LoopIndex is the index of the outermost loop in the range: + * [0, words_per_block / (HorizontalLayout * VerticalLayout)). 
+   * @param hash Lower 32 bits of the key's hash.
+   * @return The bit pattern for the vertical layout
+   * defined by the LoopIndex.
+   */
+
+  // Precondition: _horizontal_layout == 1
+  template <uint32_t LoopIndex, uint32_t VerticalLayout>
+  __device__ constexpr auto pattern_impl(uint32_t hash) const
+  {
+    // For cache-sectorized, we set bits for one word for each group per vertical layout.
+    constexpr uint32_t groups_per_vertical_layout = VerticalLayout / words_per_group;
+    using pattern_array_t =
+      cuda::std::conditional_t<is_cache_sectorized,
+                               cuda::std::array<word_type, groups_per_vertical_layout>,
+                               cuda::std::array<word_type, VerticalLayout>>;
+
+    // Sanity check
+    constexpr uint32_t num_iterations = words_per_block / VerticalLayout;
+    static_assert(LoopIndex < num_iterations,
+                  "the loop index cannot exceed the number of loop iterations");
+
+    pattern_array_t pattern_array{0};
+    constexpr uint32_t salt_start_index =
+      is_cache_sectorized ? max_bits_per_group * groups_per_vertical_layout * LoopIndex
+                          : max_bits_per_word * VerticalLayout * LoopIndex;
+    constexpr uint32_t salt_end_index =
+      is_cache_sectorized
+        ? cuda::std::min(salt_start_index + max_bits_per_group * groups_per_vertical_layout,
+                         pattern_bits)
+        : cuda::std::min(salt_start_index + max_bits_per_word * VerticalLayout, pattern_bits);
+    constexpr uint32_t pattern_array_start_index = 0;
+    set_bits<salt_start_index, salt_end_index, pattern_array_start_index>(hash, pattern_array);
+    return pattern_array;
+  }
+
+  // Precondition: _horizontal_layout > 1
+  template <uint32_t LoopIndex, uint32_t HorizontalLayout, uint32_t VerticalLayout>
+  __device__ constexpr auto pattern_impl(uint32_t hash, uint32_t thread_index) const
+  {
+    // For cache-sectorized, we set bits for one word for each group per vertical layout.
+    constexpr uint32_t groups_per_vertical_layout = VerticalLayout / words_per_group;
+    using pattern_array_t =
+      cuda::std::conditional_t<is_cache_sectorized,
+                               cuda::std::array<word_type, groups_per_vertical_layout>,
+                               cuda::std::array<word_type, VerticalLayout>>;
+
+    // Sanity check
+    constexpr uint32_t num_iterations = words_per_block / (HorizontalLayout * VerticalLayout);
+    static_assert(LoopIndex < num_iterations,
+                  "the loop index cannot exceed the number of loop iterations");
+
+    // [lower_bound, upper_bound) defines the range of virtual thread indices for this loop
+    // iteration.
+    constexpr uint32_t lower_bound = LoopIndex * HorizontalLayout;
+    constexpr uint32_t upper_bound = lower_bound + HorizontalLayout;
+
+    // A virtual thread flips max_bits_per_virtual_thread bits in the pattern array, excepting
+    // potentially some of the last virtual threads (if [sectorized] pattern_bits % words_per_block
+    // != 0 or [cache-sectorized] pattern_bits % groups_per_block != 0).
+    constexpr uint32_t max_bits_per_virtual_thread =
+      is_cache_sectorized ? max_bits_per_group * groups_per_vertical_layout
+                          : max_bits_per_word * VerticalLayout;
+
+    pattern_array_t pattern_array{0};
+    if constexpr (num_iterations == 1) {
+      thread_dispatch<lower_bound, upper_bound, max_bits_per_virtual_thread>(
+        hash, thread_index, pattern_array);
+    } else {
+      const uint32_t virtual_thread_index = LoopIndex * HorizontalLayout + thread_index;
+      thread_dispatch<lower_bound, upper_bound, max_bits_per_virtual_thread>(
+        hash, virtual_thread_index, pattern_array);
+    }
+    return pattern_array;
+  }
+
+  // Dispatches a dynamic virtual thread index to a static virtual thread index by building a
+  // compile-time decision tree over the range [LowerBound, UpperBound) for the virtual thread
+  // index. This method is only used when _horizontal_layout > 1.
+  template <uint32_t LowerBound,
+            uint32_t UpperBound,
+            uint32_t MaxBitsPerVirtualThread,
+            typename PatternArrayT>
+  __device__ constexpr void thread_dispatch(uint32_t hash,
+                                            uint32_t thread_index,
+                                            PatternArrayT& pattern_array) const
+  {
+    // Sanity check
+    static_assert(LowerBound < UpperBound);
+
+    if constexpr (LowerBound + 1 == UpperBound) {
+      // Base case: thread_index == LowerBound
+      constexpr uint32_t salt_start_index = MaxBitsPerVirtualThread * LowerBound;
+      constexpr uint32_t salt_end_index =
+        cuda::std::min(salt_start_index + MaxBitsPerVirtualThread, pattern_bits);
+      constexpr uint32_t pattern_array_start_index = 0;
+      set_bits<salt_start_index, salt_end_index, pattern_array_start_index>(hash, pattern_array);
+    } else {
+      // Recursive case: thread_index > LowerBound
+      constexpr uint32_t mid = (LowerBound + UpperBound) / 2;
+      if (thread_index < mid) {
+        thread_dispatch<LowerBound, mid, MaxBitsPerVirtualThread>(
+          hash, thread_index, pattern_array);
+      } else {
+        thread_dispatch<mid, UpperBound, MaxBitsPerVirtualThread>(
+          hash, thread_index, pattern_array);
+      }
+    }
+  }
+
+  // Set bits in the pattern array using salts starting from SaltIndex.
+  template <uint32_t SaltIndex,
+            uint32_t SaltEndIndex,
+            uint32_t PatternArrayIndex,
+            typename PatternArrayT>
+  __device__ constexpr void set_bits(uint32_t hash, PatternArrayT& pattern_array) const
+  {
+    if constexpr (SaltIndex < SaltEndIndex) {
+      // Select top bit_index_width bits from salted hash to determine the bit index.
+      const uint32_t bit_index =
+        (cuda::std::get<SaltIndex>(salts) * hash) >> (32 - bit_index_width);
+
+      // Set the bit in the pattern array.
+      cuda::std::get<PatternArrayIndex>(pattern_array) |= word_type{1} << bit_index;
+
+      // Recurse.
+      constexpr uint32_t next_salt_index = SaltIndex + 1;
+      constexpr uint32_t next_pattern_array_index =
+        is_cache_sectorized
+          ? PatternArrayIndex + (next_salt_index % max_bits_per_group == 0 ? 1 : 0)
+          : PatternArrayIndex + (next_salt_index % max_bits_per_word == 0 ?
1 : 0); + set_bits(hash, pattern_array); + } + } +}; + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/utility/cuda.hpp b/include/cuco/detail/utility/cuda.hpp index ae2613f21..5d6342427 100644 --- a/include/cuco/detail/utility/cuda.hpp +++ b/include/cuco/detail/utility/cuda.hpp @@ -75,5 +75,16 @@ constexpr auto max_occupancy_grid_size(std::int32_t block_size, return max_active_blocks_per_multiprocessor * num_multiprocessors; } +inline auto l2_cache_size() +{ + int device = 0; + CUCO_CUDA_TRY(cudaGetDevice(&device)); + + int l2_cache_size = -1; + CUCO_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device)); + + return l2_cache_size; +} + } // namespace detail } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 51d4f42c1..08da93a47 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -154,9 +154,9 @@ ConfigureTest(HYPERLOGLOG_TEST # - bloom_filter ---------------------------------------------------------------------------------- ConfigureTest(BLOOM_FILTER_TEST bloom_filter/unique_sequence_test.cu - bloom_filter/arrow_policy_test.cu bloom_filter/variable_cg_test.cu - bloom_filter/merge_intersect_test.cu) + bloom_filter/merge_intersect_test.cu + bloom_filter/arrow_compat_test.cu) ################################################################################################### # - roaring_bitmap --------------------------------------------------------------------------------- diff --git a/tests/bloom_filter/arrow_policy_test.cu b/tests/bloom_filter/arrow_compat_test.cu similarity index 82% rename from tests/bloom_filter/arrow_policy_test.cu rename to tests/bloom_filter/arrow_compat_test.cu index 1ca349162..2e820fde2 100644 --- a/tests/bloom_filter/arrow_policy_test.cu +++ b/tests/bloom_filter/arrow_compat_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,23 @@ * limitations under the License. */ +// Verifies that a `parametric_filter_policy` instantiation with Apache Arrow's Block-Split Bloom +// Filter parameters (256-bit blocks of 8 x uint32_t, 8 fingerprint bits per key, fully horizontal +// add, fully vertical contains) produces byte-identical bitsets to a precomputed Arrow reference. + #include #include #include #include -#include #include -#include +#include #include +#include +#include namespace { @@ -73,11 +78,11 @@ thrust::device_vector get_arrow_filter_reference_bitset() }; if constexpr (std::is_same_v) { - return reference_bitsets[0]; // int32 + return reference_bitsets[0]; } else if constexpr (std::is_same_v) { - return reference_bitsets[1]; // int64 + return reference_bitsets[1]; } else if constexpr (std::is_same_v) { - return reference_bitsets[2]; // float + return reference_bitsets[2]; } else { throw std::invalid_argument("Reference bitsets available for int32, int64, float only.\n\n"); } @@ -93,11 +98,11 @@ std::pair get_arrow_filter_test_settings() }; if constexpr (std::is_same_v) { - return test_settings[0]; // int32 + return test_settings[0]; } else if constexpr (std::is_same_v) { - return test_settings[1]; // int64 + return test_settings[1]; } else if constexpr (std::is_same_v) { - return test_settings[2]; // float + return test_settings[2]; } else { throw std::invalid_argument("Test settings available for int32, int64, float only.\n\n"); } @@ -119,20 +124,14 @@ void test_filter_bitset(Filter& filter, size_t num_keys) using key_type = typename Filter::key_type; using word_type = typename Filter::word_type; - // Generate keys auto const h_keys = sequence_values(num_keys); thrust::device_vector d_keys(h_keys.begin(), h_keys.end()); - // Insert to the bloom filter filter.add(d_keys.begin(), d_keys.begin() + num_keys); - // Get reference words 
device_vector auto const reference_words = get_arrow_filter_reference_bitset(); + auto const num_words = filter.block_extent() * filter.words_per_block; - // Number of words in the filter - auto const num_words = filter.block_extent() * filter.words_per_block; - - // Get the bitset thrust::device_vector filter_words(filter.data(), filter.data() + num_words); REQUIRE(cuco::test::equal( @@ -144,17 +143,19 @@ void test_filter_bitset(Filter& filter, size_t num_keys) }))); } -TEMPLATE_TEST_CASE_SIG("bloom_filter arrow filter policy bitset validation", +TEMPLATE_TEST_CASE_SIG("bloom_filter arrow-compatible parametric policy bitset validation", "", (class Key), (int32_t), (int64_t), (float)) { - // Get test settings auto const [sub_filters, num_keys] = get_arrow_filter_test_settings(); - using policy_type = cuco::arrow_filter_policy; + // Apache Arrow Block-Split Bloom Filter parameters: 256-bit blocks (8 x uint32_t), 8 fingerprint + // bits per key, fully horizontal add (Theta=8) and fully vertical contains (Phi=8). 
+ using policy_type = + cuco::parametric_filter_policy, std::uint32_t, 8, 8, 8, 1, 1, 8>; cuco::bloom_filter, cuda::thread_scope_device, policy_type> filter{ sub_filters}; diff --git a/tests/bloom_filter/merge_intersect_test.cu b/tests/bloom_filter/merge_intersect_test.cu index 5638d58a8..43708c359 100644 --- a/tests/bloom_filter/merge_intersect_test.cu +++ b/tests/bloom_filter/merge_intersect_test.cu @@ -162,42 +162,15 @@ TEMPLATE_TEST_CASE_SIG( "bloom_filter merge and intersect tests", "", ((class Key, class Policy), Key, Policy), - (int32_t, cuco::default_filter_policy, uint32_t, 1>), - (int32_t, cuco::default_filter_policy, uint32_t, 8>), - (int64_t, cuco::default_filter_policy, uint64_t, 1>), - (int64_t, cuco::default_filter_policy, uint64_t, 8>)) + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 1, 8>), + (int64_t, cuco::parametric_filter_policy, uint64_t, 1, 1, 1, 1, 1, 1>), + (int64_t, cuco::parametric_filter_policy, uint64_t, 8, 8, 8, 1, 1, 8>)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; constexpr size_type capacity{1000}; - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1); - - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } - - auto filter_a = filter_type{capacity, {}, {pattern_bits}}; - auto filter_b = filter_type{capacity, {}, {pattern_bits}}; - auto filter_c = filter_type{static_cast(capacity) * 2, {}, {pattern_bits}}; - - test_merge_intersect(filter_a, filter_b, filter_c, capacity); -} - -TEMPLATE_TEST_CASE_SIG("bloom_filter merge and intersect arrow tests", - "", - ((class Key, class Policy), Key, Policy), - (int32_t, cuco::arrow_filter_policy), - (int64_t, cuco::arrow_filter_policy), - (float, cuco::arrow_filter_policy)) -{ - using filter_type = - cuco::bloom_filter, 
cuda::thread_scope_device, Policy>; - constexpr size_type capacity{1000}; // Must match capacity used in helper logic - auto filter_a = filter_type{capacity}; auto filter_b = filter_type{capacity}; auto filter_c = filter_type{static_cast(capacity) * 2}; diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu index 4cf24563c..2215bdbbd 100644 --- a/tests/bloom_filter/unique_sequence_test.cu +++ b/tests/bloom_filter/unique_sequence_test.cu @@ -86,38 +86,14 @@ void test_unique_sequence(Filter& filter, size_type num_keys) } TEMPLATE_TEST_CASE_SIG( - "bloom_filter default policy tests", + "bloom_filter parametric policy tests", "", ((class Key, class Policy), Key, Policy), - (int32_t, cuco::default_filter_policy, uint32_t, 1>), - (int32_t, cuco::default_filter_policy, uint32_t, 8>), - (int32_t, cuco::default_filter_policy, uint64_t, 1>), - (int32_t, cuco::default_filter_policy, uint64_t, 8>)) -{ - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, Policy>; - constexpr size_type num_keys{400}; - - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4); - - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } - - auto filter = filter_type{1000, {}, {pattern_bits}}; - - test_unique_sequence(filter, num_keys); -} - -TEMPLATE_TEST_CASE_SIG("bloom_filter arrow policy tests", - "", - ((class Key, class Policy), Key, Policy), - (int32_t, cuco::arrow_filter_policy), - (uint64_t, cuco::arrow_filter_policy), - (float, cuco::arrow_filter_policy)) + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (uint64_t, + cuco::parametric_filter_policy, uint32_t, 8, 12, 8, 1, 4, 2>), + (float, cuco::parametric_filter_policy, uint64_t, 4, 4, 2, 2, 1, 2>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 2, 2, 1, 8>)) { using filter_type = 
cuco::bloom_filter, cuda::thread_scope_device, Policy>; diff --git a/tests/bloom_filter/variable_cg_test.cu b/tests/bloom_filter/variable_cg_test.cu index adb6b703f..1c152aa44 100644 --- a/tests/bloom_filter/variable_cg_test.cu +++ b/tests/bloom_filter/variable_cg_test.cu @@ -14,100 +14,139 @@ * limitations under the License. */ +// Exercises `cuco::bloom_filter_ref` device-side APIs (`ref.add(key)`, `ref.contains(key)`, +// `ref.contains(group, key)`) directly from custom kernels, varying the cooperative group size. +// The bulk host-side APIs route through the warp-cooperative kernel which already had correct +// CG-reduction semantics; the scalar and CG ref methods are separate code paths that need their +// own coverage. + #include #include -#include -#include +#include #include #include -#include #include +#include + #include -#include #include -#include using size_type = int32_t; -template -void test_variable_cg_size(Filter& filter, size_type num_keys) +namespace cg = cooperative_groups; + +template +__global__ void scalar_add_kernel(Ref ref, Key const* keys, size_type n) { - constexpr int32_t block_size = 128; - constexpr int32_t grid_size = 128; + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { ref.add(keys[i]); } +} - using Key = typename Filter::key_type; +template +__global__ void scalar_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out) +{ + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { out[i] = ref.contains(keys[i]); } +} - auto ref = filter.ref(); +template +__global__ void cg_contains_consistency_kernel(Ref ref, + Key const* keys, + size_type n, + int* mismatches) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const idx = static_cast((blockIdx.x * blockDim.x + threadIdx.x) / CGSize); + if (idx >= n) { return; } + bool const got = ref.contains(tile, keys[idx]); + bool const all_agree = tile.all(got); + bool 
const any_agree = tile.any(got); + if (tile.thread_rank() == 0 && all_agree != any_agree) { atomicAdd(mismatches, 1); } +} + +TEMPLATE_TEST_CASE_SIG( + "bloom_filter device ref scalar add and contains", + "", + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::default_filter_policy), + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 2, 4, 2>)) +{ + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; + constexpr size_type num_keys{400}; + + auto filter = filter_type{1000}; - // Generate keys thrust::device_vector keys(num_keys); thrust::sequence(thrust::device, keys.begin(), keys.end()); - thrust::device_vector contained(num_keys, false); - auto const always_true = cuda::constant_iterator{true}; - - SECTION("Check if fallback kernels work for varying combinations of CG sizes.") - { - cuco::detail::bloom_filter_ns::add_if_n - <<>>(keys.begin(), num_keys, always_true, cuda::std::identity{}, ref); - cuco::detail::bloom_filter_ns::contains_if_n - <<>>( - keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref); - REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - filter.clear(); - thrust::fill(contained.begin(), contained.end(), false); // reset output vector - - SECTION("Check if adaptive add kernel works with fallback contains kernel.") - { - cuco::detail::bloom_filter_ns::add - <<>>(keys.begin(), num_keys, ref); - cuco::detail::bloom_filter_ns::contains_if_n - <<>>( - keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref); - REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - // TODO adaptive vs. adaptive and fallback add vs. 
adaptive contains (requires #673) + auto ref = filter.ref(); + + constexpr int block_size = 128; + int const grid_size = (num_keys + block_size - 1) / block_size; + + scalar_add_kernel<<>>( + ref, thrust::raw_pointer_cast(keys.data()), num_keys); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + scalar_contains_kernel<<>>(ref, + thrust::raw_pointer_cast(keys.data()), + num_keys, + thrust::raw_pointer_cast(contained.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); } TEMPLATE_TEST_CASE_SIG( - "bloom_filter variable CG size tests", + "bloom_filter device ref CG contains is reduced across the group", "", - ((int32_t AddCGSize, int32_t ContainsCGSize, class Key, class Policy), - AddCGSize, - ContainsCGSize, - Key, - Policy), - (1, 4, int32_t, cuco::default_filter_policy, uint32_t, 1>), - (1, 4, int32_t, cuco::default_filter_policy, uint32_t, 8>), - (1, 4, int32_t, cuco::default_filter_policy, uint64_t, 1>), - (1, 4, int32_t, cuco::default_filter_policy, uint64_t, 8>), - (4, 1, int32_t, cuco::default_filter_policy, uint32_t, 1>), - (4, 1, int32_t, cuco::default_filter_policy, uint32_t, 8>), - (4, 1, int32_t, cuco::default_filter_policy, uint64_t, 1>), - (4, 1, int32_t, cuco::default_filter_policy, uint64_t, 8>)) + ((int32_t CGSize, class Key, class Policy), CGSize, Key, Policy), + (4, + int32_t, + cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 2, 4, 2>), + (8, + int32_t, + cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 8, 1>)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; - constexpr size_type num_keys{400}; + // Small filter + many probes drive non-inserted keys into the partial-match regime where + // different lanes in the tile see different per-slice match results. Without a group reduction + // in `ref.contains(group, key)`, each lane returns its slice's partial result and the tile + // disagrees on the answer. 
+ constexpr size_type num_inserted{200}; + constexpr size_type num_probed{2000}; + constexpr size_type num_blocks{16}; + + auto filter = filter_type{num_blocks}; + auto ref = filter.ref(); + + thrust::device_vector insert_keys(num_inserted); + thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end()); + filter.add(insert_keys.begin(), insert_keys.end()); + + thrust::device_vector probe_keys(num_probed); + thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end()); - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4); + thrust::device_vector mismatches(1, 0); - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } + constexpr int block_size = 128; + int const grid_size = (num_probed * CGSize + block_size - 1) / block_size; - auto filter = filter_type{1000, {}, {pattern_bits}}; + cg_contains_consistency_kernel + <<>>(ref, + thrust::raw_pointer_cast(probe_keys.data()), + num_probed, + thrust::raw_pointer_cast(mismatches.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); - test_variable_cg_size(filter, num_keys); + REQUIRE(static_cast(mismatches[0]) == 0); }