From db77cede68ebb2ae72bb64fe0cf09e8947f863da Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 17:02:27 -0400 Subject: [PATCH 01/15] Add initial templates for Robin Hood probing. --- .../probing_scheme/probing_scheme_impl.inl | 83 ++++++++++++++++ include/cuco/probing_scheme.cuh | 97 +++++++++++++++++++ 2 files changed, 180 insertions(+) diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 8bed41778..8b93076c3 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -208,4 +208,87 @@ double_hashing::hash_function() const noexcept return {hash1_, hash2_}; } +namespace detail { + +/** + * @brief Robin Hood inverse primitive for the linear probing sequence. + * + * @note Recovers a resident's probe distance ("age") from the slot it occupies: how many probing + * steps the resident sits from its own home bucket. For the linear sequence this is a single + * subtract — `(slot_base - resident_home) / stride mod num_buckets`. This is the linear-only + * overload; a `double_hashing` variant would add its own overload here (a modular inverse of the + * resident's per-key step, or a stored age), which is the single place that change lands. + * + * @tparam BucketSize Size of the bucket + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param scheme The underlying linear probing scheme (supplies the hash function) + * @param resident_key The key currently residing in the slot + * @param slot_index The slot index at which `resident_key` resides + * @param upper_bound Upper bound of the iteration + * @return The resident's probe distance, in probing steps + */ +template +[[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( + linear_probing const& scheme, + ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) noexcept +{ + using size_type = typename Extent::value_type; + size_type constexpr stride = CGSize * BucketSize; + auto const bound = static_cast(upper_bound); + auto const hash = scheme.hash_function(); + + // Home bucket base of the resident, using the same alignment as `make_iterator`. + size_type const resident_home = + cuco::detail::sanitize_hash(hash(resident_key)) % (bound / stride) * stride; + + // Bucket-strided base of the slot the resident currently occupies. The per-lane `thread_rank` + // offset (which is < stride) is stripped by the floor division so that the distance is measured + // in whole probing steps, consistent with the forward sequence. + size_type const slot_base = (slot_index / stride) * stride; + + // (slot_base - resident_home) mod capacity, expressed in probing steps. + return static_cast((slot_base + bound - resident_home) % bound) / stride; +} + +} // namespace detail + +template +__host__ __device__ constexpr robin_hood_probing::robin_hood_probing( + Underlying const& probing) + : Underlying{probing} +{ +} + +template +template +__host__ __device__ constexpr auto robin_hood_probing::rebind_hash_function( + NewHash const& hash) const noexcept +{ + auto const inner = static_cast(*this).rebind_hash_function(hash); + return robin_hood_probing>{inner}; +} + +template +template +__host__ __device__ constexpr typename Extent::value_type +robin_hood_probing::probe_distance(ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) const noexcept +{ + // Delegate to the per-underlying-scheme inverse overload. Only `linear_probing` provides one + // today; wrapping a scheme without a matching overload is a compile-time error. + return cuco::detail::probe_distance( + static_cast(*this), resident_key, slot_index, upper_bound); +} + } // namespace cuco diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index c809794dc..f74c8fe80 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -209,6 +209,87 @@ class double_hashing : private detail::probing_scheme_base { Hash2 hash2_; }; +/** + * @brief Public Robin Hood probing scheme class. + * + * @note Robin Hood probing wraps an underlying probe sequence (e.g. `cuco::linear_probing`) and + * pairs it with the Robin Hood invariant: on insert, an in-flight key displaces any resident that + * sits closer to its own home than the in-flight key is to its home, and the displaced resident is + * then re-inserted from that point onward. This keeps probe lengths tightly distributed, which is + * especially valuable on GPUs where a tile's tail latency is set by its longest probe. + * + * @note This class is a thin decorator over `Underlying`. It forwards the forward probe sequence + * (`make_iterator`, `hash_function`) unchanged and contributes the `cuco::is_robin_hood_probing` + * trait that selects the displacement (insert) and early-termination (find) control flow in the + * open-addressing ref implementation. The invariant's one extra requirement — the inverse + * primitive `probe_distance`, which recovers a resident's probe distance ("age") from the slot it + * occupies — is delegated to `cuco::detail::probe_distance`, which is overloaded per underlying + * scheme. Only `cuco::linear_probing` provides that overload today; a `cuco::double_hashing` + * variant would compose simply by adding a matching `cuco::detail::probe_distance` overload, with + * no change to this class or the ref implementation. + * + * @tparam Underlying The wrapped probe-sequence scheme (e.g. `cuco::linear_probing`) + */ +template +class robin_hood_probing : private Underlying { + public: + using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) + using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) + using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys + using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence + + /** + * @brief Constructs a Robin Hood probing scheme wrapping the given underlying scheme. + * + * @param probing The underlying probe-sequence scheme to wrap + */ + __host__ __device__ constexpr robin_hood_probing(Underlying const& probing = {}); + + /** + * @brief Makes a copy of the current probing method with the given hasher. + * + * @note Forwards to the underlying scheme's `rebind_hash_function` and re-wraps the result so the + * returned scheme is again a `robin_hood_probing`. + * + * @tparam NewHash New hasher type + * + * @param hash Hasher + * + * @return Copy of the current probing method + */ + template + [[nodiscard]] __host__ __device__ constexpr auto rebind_hash_function( + NewHash const& hash) const noexcept; + + /** + * @brief Computes the probe distance ("age") of a resident key. + * + * @note This is the inverse of the probe sequence: given a resident key and the slot index at + * which it currently lives, it returns how many probing steps that resident is from its own home + * bucket. The Robin Hood insert/find logic compares this against the in-flight key's own probe + * distance to decide displacement (insert) or early termination (find). + * + * @note Delegates to the `cuco::detail::probe_distance` overload for `Underlying`. Instantiating + * this for an `Underlying` without such an overload (e.g. `cuco::double_hashing` today) is a + * compile-time error — that is the single seam where a new underlying scheme would supply its own + * inverse. + * + * @tparam BucketSize Size of the bucket + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param resident_key The key currently residing in the slot + * @param slot_index The slot index at which `resident_key` resides + * @param upper_bound Upper bound of the iteration + * @return The resident's probe distance, in probing steps + */ + template + [[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( + ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) const noexcept; +}; + /** * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not * @@ -227,6 +308,22 @@ struct is_double_hashing : cuda::std::false_type {}; template struct is_double_hashing> : cuda::std::true_type {}; +/** + * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not + * + * @tparam T Input probing scheme type + */ +template +struct is_robin_hood_probing : cuda::std::false_type {}; + +/** + * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not + * + * @tparam Underlying The wrapped probe-sequence scheme + */ +template +struct is_robin_hood_probing> : cuda::std::true_type {}; + } // namespace cuco #include From d17754602b8af5fa18365a9e2ab66d8fe2cfcd34 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 17:32:26 -0400 Subject: [PATCH 02/15] Add Robin Hood probe distance test. --- tests/utility/probing_scheme_test.cu | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/utility/probing_scheme_test.cu b/tests/utility/probing_scheme_test.cu index 39048946b..97f26dc18 100644 --- a/tests/utility/probing_scheme_test.cu +++ b/tests/utility/probing_scheme_test.cu @@ -78,6 +78,37 @@ __global__ void generate_cg_probing_sequence(Key key, } } +// Walks each lane's probe iterator and records, at every step, the probe distance reported for the +// slot that lane is visiting. Because the resident under test is `key` itself, the slot visited at +// step `i` is at probe distance `i` from `key`'s home — for every lane. Recording one column per +// lane lets the host check both that `probe_distance` inverts `make_iterator` and that it strips +// the per-lane intra-stride offset (so all lanes at a given step agree). +template +__global__ void generate_cg_probe_distance_sequence(Key key, + Extent upper_bound, + size_t seq_length, + OutputIt out_seq) +{ + auto constexpr cg_size = ProbingScheme::cg_size; + + auto const tid = blockIdx.x * blockDim.x + threadIdx.x; + auto probing_scheme = ProbingScheme{}; + + if (tid < cg_size) { + auto const tile = + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); + + auto iter = probing_scheme.template make_iterator(tile, key, upper_bound); + + for (size_t i = 0; i < seq_length; ++i) { + out_seq[i * cg_size + tile.thread_rank()] = + probing_scheme.template probe_distance(key, *iter, upper_bound); + ++iter; + } + } +} + TEMPLATE_TEST_CASE_SIG( "utility probing_scheme tests", "", @@ -111,3 +142,44 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(cuco::test::equal( scalar_seq.begin(), scalar_seq.end(), cg_seq.begin(), cuda::std::equal_to{})); } + +TEMPLATE_TEST_CASE_SIG( + "utility robin_hood probe_distance inverts make_iterator", + "", + ((typename Key, int32_t CGSize, int32_t BucketSize), Key, CGSize, BucketSize), + (int32_t, 1, 1), + (int32_t, 4, 1), + (int32_t, 8, 1), + (int32_t, 8, 2), + (int64_t, 4, 1), + (int64_t, 8, 2)) +{ + // Robin Hood wraps a linear probe sequence; `probe_distance` is its inverse. For `key`'s own + // probe sequence, the slot visited at step `i` must report probe distance `i`. + using probe = + cuco::robin_hood_probing>>; + + // A deliberately small capacity, so the probe sequence wraps around the table within + // `seq_length` steps — this exercises the modular-subtraction (wrap) path in `probe_distance`. + auto const upper_bound = + cuco::make_valid_extent>(cuco::extent{64}); + + // Probe distance is measured in whole probing steps and lives in `[0, num_buckets)`, where one + // step spans the full `cg_size * bucket_size` stride. Taking `seq_length` past `num_buckets` + // guarantees the walk wraps at least once. + auto const capacity = static_cast(upper_bound); + auto const num_buckets = capacity / (CGSize * BucketSize); + auto const seq_length = num_buckets + 3; + constexpr Key key{42}; + + thrust::device_vector distances(seq_length * CGSize); + generate_cg_probe_distance_sequence + <<<1, CGSize>>>(key, upper_bound, seq_length, distances.begin()); + + // Under wrap, the slot visited at step `i` sits at probe distance `i mod num_buckets`. + for (std::size_t i = 0; i < seq_length; ++i) { + for (std::int32_t r = 0; r < CGSize; ++r) { + REQUIRE(distances[i * CGSize + r] == i % num_buckets); + } + } +} From 75a78dfaf6e62e79c0969d7d2973d66ddda6e7af Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 17:52:22 -0400 Subject: [PATCH 03/15] Add Robin Hood get methods. --- .../open_addressing_ref_impl.cuh | 77 ++++++++++++++++++- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index ada8792bb..5c0967ff9 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -777,7 +777,8 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -791,6 +792,13 @@ class open_addressing_ref_impl case detail::equal_result::EQUAL: return true; } } + // Robin Hood: a resident richer than us proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return false; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -816,7 +824,8 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -834,6 +843,13 @@ class open_addressing_ref_impl if (group.any(state == detail::equal_result::EQUAL)) { return true; } if (group.any(state == detail::equal_result::EMPTY)) { return false; } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return false; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -857,7 +873,8 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -875,6 +892,13 @@ class open_addressing_ref_impl default: continue; } } + // Robin Hood: a resident richer than us proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return this->end(); + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -900,7 +924,8 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -930,6 +955,13 @@ class open_addressing_ref_impl // Find an empty slot, meaning that the probe key isn't present in the container if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return this->end(); + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -1592,6 +1624,43 @@ class open_addressing_ref_impl return storage_ref_.data() + probing_idx + intra_bucket_idx; } + /** + * @brief Determines whether the Robin Hood invariant proves the probe key absent at the current + * probe step. + * + * @note Only meaningful for Robin Hood probing. The key is proven absent when the bucket holds a + * resident that is "richer" than the probe key — i.e. whose own probe distance is smaller than + * the probe key's probe distance at the current step (`probe_step`). Such a resident would have + * been displaced on insertion if the probe key lived here, so the probe key cannot be present. + * + * @note Behavior is only well-defined when every slot in the bucket is occupied (the callers + * reach this check only after ruling out empty and matching slots), since probe distance is + * meaningless for an empty slot. + * + * @tparam BucketSlots Bucket slot array type + * + * @param bucket_slots The slots of the bucket currently being probed + * @param bucket_base The slot index of the first slot in the bucket + * @param probe_step The probe key's own probe distance at the current step + * + * @return True if some resident in the bucket is richer than the probe key + */ + template + [[nodiscard]] __device__ bool robin_hood_proves_absent(BucketSlots const& bucket_slots, + size_type bucket_base, + size_type probe_step) const noexcept + { + bool richer = false; + cuda::static_for([&](auto i) { + auto const resident_age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i()]), + static_cast(bucket_base + i()), + storage_ref_.extent()); + if (resident_age < probe_step) { richer = true; } + }); + return richer; + } + /** * @brief Extracts the key from a given value type. * From dc90ebe6b6783450efb33228cb314d28696aa551 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 18:03:11 -0400 Subject: [PATCH 04/15] Add Robin Hood invariant test. --- tests/CMakeLists.txt | 1 + .../probing_scheme_invariants_test.cu | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 tests/static_map/probing_scheme_invariants_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 51d4f42c1..07465b86e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -89,6 +89,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/insert_or_assign_test.cu static_map/insert_or_apply_test.cu static_map/key_sentinel_test.cu + static_map/probing_scheme_invariants_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu static_map/rehash_test.cu diff --git a/tests/static_map/probing_scheme_invariants_test.cu b/tests/static_map/probing_scheme_invariants_test.cu new file mode 100644 index 000000000..2fb4bdd88 --- /dev/null +++ b/tests/static_map/probing_scheme_invariants_test.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include + +#include + +#include +#include + +namespace { + +// Identity hash. `cuco::detail::sanitize_hash` is `to_positive`, so for a non-negative key `k` this +// puts its home bucket at slot `k % capacity` — letting the test hand-craft an exact Robin Hood +// layout instead of reverse-engineering a real hash function. +template +struct identity_hash { + __host__ __device__ constexpr Key operator()(Key key) const noexcept { return key; } +}; + +} // namespace + +// Validates the Robin Hood read-path early-exit (`find` / `contains`) against a hand-seeded layout, +// before the displacing `insert` exists. A scalar (cg_size == 1) Robin Hood map over a linear probe +// sequence is seeded directly through the storage pointer with a known-valid Robin Hood cluster, +// then queried with keys chosen to exercise each lookup-termination rule. +TEST_CASE("static_map robin_hood read-path early-exit", "") +{ + using Key = std::int32_t; + using Value = std::int32_t; + using size_type = std::int32_t; + auto constexpr capacity = size_type{16}; + + using extent_type = cuco::extent; + using probe_type = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + probe_type, + cuco::cuda_allocator, + cuco::storage<1>>; + using value_type = typename map_type::value_type; // cuco::pair + + auto map = map_type{extent_type{}, cuco::empty_key{-1}, cuco::empty_value{-1}}; + // Ensure the constructor's slot initialization (empty sentinels) has completed before we seed. + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + // Hand-seed a valid Robin Hood layout (identity hash => home(k) = k % capacity): + // slot 0: key 0 home 0, distance 0 + // slot 1: key 16 home 0, distance 1 (displaced past key 0) + // slot 2: key 2 home 2, distance 0 + // slots 3..15: empty + // This is exactly what inserting {0, 16, 2} would produce, so it satisfies the Robin Hood + // invariant. We only write the three occupied (and contiguous) slots; the rest stay empty. + // The ref is used purely to reach the storage pointer; the bulk queries below build their own. + auto const ref = map.ref(cuco::op::find); + std::vector const seed{value_type{0, 0}, value_type{16, 16}, value_type{2, 2}}; + REQUIRE(cudaMemcpy(ref.storage_ref().data(), + seed.data(), + seed.size() * sizeof(value_type), + cudaMemcpyHostToDevice) == cudaSuccess); + + // Probe keys chosen to exercise every lookup-termination rule: + // 0 : present, found immediately at its home (distance 0). + // 16 : present at distance 1 — found only if we do NOT early-exit at slot 0, where the resident + // distance (0) equals our probe step (0). Guards the strict `<` (vs `<=`) richer rule. + // 2 : present at its home. + // 32 : home 0, absent — terminates via the richer-resident early-exit at slot 2 (key 2 sits at + // distance 0 < our probe step 2), before reaching the empty slot 3. + // 1 : home 1, absent — also terminates via the early-exit at slot 2. + // 3 : home 3, absent — terminates on the empty slot 3. + std::vector const probe_keys{0, 16, 2, 32, 1, 3}; + std::vector const expected_contained{true, true, true, false, false, false}; + + thrust::device_vector const d_keys(probe_keys.begin(), probe_keys.end()); + + thrust::device_vector d_contained(probe_keys.size()); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + for (std::size_t i = 0; i < probe_keys.size(); ++i) { + INFO("contains, probe key = " << probe_keys[i]); + REQUIRE(static_cast(d_contained[i]) == expected_contained[i]); + } + + // `find` must return the stored value for present keys and the empty-value sentinel otherwise. + thrust::device_vector d_values(probe_keys.size()); + map.find(d_keys.begin(), d_keys.end(), d_values.begin()); + std::vector const expected_values{0, 16, 2, -1, -1, -1}; + for (std::size_t i = 0; i < probe_keys.size(); ++i) { + INFO("find, probe key = " << probe_keys[i]); + REQUIRE(d_values[i] == expected_values[i]); + } +} From 12b8e57d54b06409bc43345e5125c51fdb94a1d6 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 23:00:31 -0400 Subject: [PATCH 05/15] Add non-CG insert for Robin Hood. --- .../open_addressing_ref_impl.cuh | 66 +++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 5c0967ff9..e8e5ad931 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -378,16 +379,19 @@ class open_addressing_ref_impl { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(slot_content)); @@ -407,11 +411,65 @@ class open_addressing_ref_impl return false; } } - case insert_result::CONTINUE: continue; + case insert_result::CONTINUE: { + // Retry on a lost CAS. Plain probing keeps scanning this (now stale) bucket; Robin + // Hood must re-read it instead, so the in-flight pair is re-evaluated against the new + // occupants -- otherwise it could be placed past a slot it should have displaced, + // breaking the invariant (and therefore lookups). + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } else { + continue; + } + } case insert_result::SUCCESS: return true; } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } // leave the scan to re-read the bucket + } + } + + // Robin Hood swap test. A resident "richer" than the in-flight pair (a smaller probe + // distance than our current probe step) is displaced: we swap our pair into its slot, + // adopt the evicted resident, and re-probe forward. + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const intra_bucket_index = + cuda::std::distance(bucket_slots.begin(), &slot_content); + auto const evicted_age = probing_scheme_.template probe_distance( + this->extract_key(slot_content), + static_cast(*probing_iter + intra_bucket_index), + storage_ref_.extent()); + if (evicted_age < probe_step) { + if (this->attempt_insert( + this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val) == + insert_result::SUCCESS) { + // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is + // `evicted_age`, and it may belong in another slot of the same bucket: an empty one, + // or one holding an even-richer resident it can displace in turn. Re-reading the + // bucket (rather than advancing past it) is the within-bucket linear probe, i.e. the + // combined bucket+slot distance that makes displacement correct for bucket_size > 1. + // The `slot_distance` term cancels in every comparison, so it never appears here; it + // shows up only as this slot-by-slot continuation. `bit_cast` keeps the adoption + // valid for heterogeneous insert types (layout-compatible by contract; identity in + // the common case). + val = cuda::std::bit_cast(slot_content); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS + break; + } + } } } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } // re-probe (re-read this bucket, or move on after displacement) + ++probe_step; + } + ++probing_iter; if (*probing_iter == init_idx) { return false; } } From 0cc3fbd4369289a99467b65911dc2210eeb65fc5 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 23:07:07 -0400 Subject: [PATCH 06/15] Add CG insert for Robin Hood. --- .../open_addressing_ref_impl.cuh | 61 ++++++++++++++++++- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index e8e5ad931..c46db6cef 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -490,11 +490,12 @@ class open_addressing_ref_impl __device__ bool insert(cooperative_groups::thread_block_tile group, Value value) noexcept { - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -554,6 +555,60 @@ class open_addressing_ref_impl default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement: no match, no empty slot in this bucket. Displace the first + // resident in probe (lane) order that is richer than the in-flight pair, adopt it, and + // re-probe THIS bucket -- the victim may belong in another slot of it. The within-bucket + // linear probe (combined bucket+slot distance) is identical to the scalar path; the + // `slot_distance` term cancels, so the test is again `resident distance < probe_step`. + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref_.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + // Only `src_lane` reads `evicted` meaningfully; other lanes just need a valid value to + // feed the broadcast `shfl` below, so seed it with the empty-slot sentinel. + value_type evicted = this->empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = + attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + // Broadcast the evicted pair and its probe distance from the winning lane, and adopt + // it on every lane (all lanes need the new in-flight pair for the next scan). + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; // success: re-probe this bucket with the victim; lost CAS: re-read it + } + // No displaceable resident: fall through to the shared advance below. + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } From 2a921319fe544639f739f3b0070f918b5aaaf2b2 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 23:38:48 -0400 Subject: [PATCH 07/15] Propagate Robin Hood insert to variants. --- .../open_addressing_ref_impl.cuh | 126 +++++++++++- .../cuco/detail/static_map/static_map_ref.inl | 183 ++++++++++++++++-- 2 files changed, 284 insertions(+), 25 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index c46db6cef..7fc5daa59 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -641,15 +641,21 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember the slot it landed in + // so we return an iterator to it (not to a later victim's slot). + [[maybe_unused]] value_type* placed_ptr = nullptr; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto i = 0; i < bucket_size; ++i) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i])); @@ -663,16 +669,58 @@ class open_addressing_ref_impl if (eq_res == detail::equal_result::AVAILABLE) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - this->maybe_wait_for_payload(slot_ptr); - return {iterator{slot_ptr}, true}; + // The in-flight pair is placed in an empty slot, ending any displacement chain. The + // iterator to return is the original key's slot (captured on its first placement). + auto* result_ptr = slot_ptr; + if constexpr (cuco::is_robin_hood_probing::value) { + if (placed_ptr != nullptr) { result_ptr = placed_ptr; } + } + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; } case insert_result::DUPLICATE: { this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - default: continue; + case insert_result::CONTINUE: { + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } else { + continue; + } + } + } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } } } + + // Robin Hood swap test (see `insert` for the full rationale). + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const evicted_age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i]), + static_cast(*probing_iter + i), + storage_ref_.extent()); + if (evicted_age < probe_step) { + if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == + insert_result::SUCCESS) { + if (placed_ptr == nullptr) { placed_ptr = slot_ptr; } // original key's slot + val = cuda::std::bit_cast(bucket_slots[i]); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } + } + } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } @@ -707,11 +755,15 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember (broadcast) the slot + // it first landed in so we return an iterator to it. 0 means "not yet placed". + [[maybe_unused]] intptr_t placed_ptr = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -751,9 +803,15 @@ class open_addressing_ref_impl switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { + // The in-flight pair is placed in an empty slot, ending any displacement chain. Return + // the original key's slot (the first placement) if it was displaced earlier. + auto result = res; + if constexpr (cuco::is_robin_hood_probing::value) { + if (placed_ptr != 0) { result = placed_ptr; } + } if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); - return {iterator{reinterpret_cast(res)}, true}; + return {iterator{reinterpret_cast(result)}, true}; } case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } @@ -763,6 +821,54 @@ class open_addressing_ref_impl default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `insert` for the full rationale). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref_.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + value_type evicted = this->empty_slot_sentinel(); + intptr_t displaced = 0; + if (group.thread_rank() == src_lane) { + auto* dptr = this->get_slot_ptr(*probing_iter, displace_idx); + evicted = bucket_slots[displace_idx]; + status = attempt_insert(dptr, evicted, val); + displaced = reinterpret_cast(dptr); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } } diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 7f8ff043b..315e5b3fb 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -495,6 +495,7 @@ class operator_impl< using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; using mapped_type = T; + using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -515,17 +516,20 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -540,8 +544,37 @@ class operator_impl< } if (eq_res == detail::equal_result::AVAILABLE) { if (attempt_insert_or_assign(slot_ptr, val)) { return; } + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } + } + + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const evicted_age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(slot_content), + static_cast(*probing_iter + intra_bucket_index), + storage_ref.extent()); + if (evicted_age < probe_step) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + val = cuda::std::bit_cast(slot_content); + key = ref_.impl_.extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } } } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -565,13 +598,14 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; @@ -612,6 +646,45 @@ class operator_impl< // Exit if inserted or assigned if (group.shfl(status, src_lane)) { return; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = detail::insert_result::CONTINUE; + value_type evicted = ref_.impl_.empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = ref_.impl_.attempt_insert( + ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); + auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + val = cuda::std::bit_cast(value_type{new_key, new_payload}); + key = ref_.impl_.extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -670,6 +743,7 @@ class operator_impl< using ref_type = static_map_ref; using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -886,14 +960,15 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); + [[maybe_unused]] size_type probe_step = 0; // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -901,6 +976,8 @@ class operator_impl< while (true) { auto const bucket_slots = storage_ref[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -928,9 +1005,45 @@ class operator_impl< op(cuda::atomic_ref{slot_ptr->second}, val.second); return false; } - default: continue; + case insert_result::CONTINUE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + retry = true; + break; + } else { + continue; + } + } + } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } } } + + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const evicted_age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(slot_content), + static_cast(*probing_iter + intra_bucket_index), + storage_ref.extent()); + if (evicted_age < probe_step) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + val = cuda::std::bit_cast(slot_content); + key = ref_.impl_.extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } + } + } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; } ++probing_iter; if (*probing_iter == init_idx) { return false; } @@ -964,14 +1077,15 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); + [[maybe_unused]] size_type probe_step = 0; // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -1030,6 +1144,45 @@ class operator_impl< default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = detail::insert_result::CONTINUE; + value_type evicted = ref_.impl_.empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = ref_.impl_.attempt_insert( + ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); + auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + val = cuda::std::bit_cast(value_type{new_key, new_payload}); + key = ref_.impl_.extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } From c27f483efbb0aaa5b354ae79c65d9c199da5edf6 Mon Sep 17 00:00:00 2001 From: aterenin Date: Thu, 4 Jun 2026 12:11:56 -0400 Subject: [PATCH 08/15] Add initial tests and fix insert_or_assign for Robin Hood. --- .../cuco/detail/static_map/static_map_ref.inl | 137 +++++++++++++++--- tests/static_map/contains_test.cu | 27 +++- tests/static_map/find_test.cu | 38 ++++- tests/static_map/insert_and_find_test.cu | 31 +++- tests/static_map/insert_or_apply_test.cu | 76 +++++++++- tests/static_map/insert_or_assign_test.cu | 93 +++++++++++- tests/static_map/robin_hood_invariant.cuh | 111 ++++++++++++++ tests/test_utils.hpp | 2 +- 8 files changed, 466 insertions(+), 49 deletions(-) create mode 100644 tests/static_map/robin_hood_invariant.cuh diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 315e5b3fb..df323db6d 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -538,15 +538,35 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); - return; + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood may relocate this key; assign via a full-slot CAS that keeps the key and + // fails if it moved. On failure re-probe and retry -- the loop re-finds the key. + auto desired = slot_content; + desired.second = val.second; + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == + detail::insert_result::SUCCESS) { + return; + } + retry = true; + break; + } else { + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); + return; + } } if (eq_res == detail::equal_result::AVAILABLE) { - if (attempt_insert_or_assign(slot_ptr, val)) { return; } if constexpr (cuco::is_robin_hood_probing::value) { + // Insert the new pair with a full-slot CAS; on a lost CAS (rival insert) or a duplicate, + // retry -- the loop re-finds the key and assigns it via the EQUAL full-slot CAS. + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + return; + } retry = true; break; + } else { + if (attempt_insert_or_assign(slot_ptr, val)) { return; } } } @@ -629,22 +649,48 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if (group.thread_rank() == src_lane) { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); + if constexpr (cuco::is_robin_hood_probing::value) { + // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails it, + // so the group re-probes and retries -- the loop re-finds the key. + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + auto desired = bucket_slots[target_idx]; + desired.second = val.second; + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return; } + continue; + } else { + if (group.thread_rank() == src_lane) { + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); + } + group.sync(); + return; } - group.sync(); - return; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; - auto const status = - (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; + if constexpr (cuco::is_robin_hood_probing::value) { + // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and retry + // (the loop re-finds the key and assigns via the EQUAL full-slot CAS). + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], val) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return; } + continue; + } else { + auto const status = + (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; - // Exit if inserted or assigned - if (group.shfl(status, src_lane)) { return; } + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } + } } else { if constexpr (cuco::is_robin_hood_probing::value) { // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). @@ -986,18 +1032,41 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - // wait for payload only when performing insert operation - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Lift `op` to the whole slot, keeping the key, and CAS it. A relocation (or a rival + // update) makes the CAS fail; re-probe and retry -- the loop re-finds the key. + auto desired = slot_content; + // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s + // atomicity does nothing here and is used only because `Op`'s signature requires one. The + // real atomic is the full-slot CAS below. + op(cuda::atomic_ref{desired.second}, val.second); + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == + detail::insert_result::SUCCESS) { + return false; + } + retry = true; + break; + } else { + // wait for payload only when performing insert operation + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } + op(cuda::atomic_ref{slot_ptr->second}, val.second); + return false; } - op(cuda::atomic_ref{slot_ptr->second}, val.second); - return false; } if (eq_res == detail::equal_result::AVAILABLE) { switch (ref_.template attempt_insert_or_apply( slot_ptr, slot_content, val, op)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Key is present now; re-probe so it is found EQUAL and updated via the full-slot CAS. + retry = true; + break; + } // wait for payload only when performing insert operation if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); @@ -1112,13 +1181,31 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if (group.thread_rank() == src_lane) { - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // src_lane lifts `op` to the slot (key fixed) and CASes it; a relocation or rival update + // fails the CAS, so the group re-probes and retries -- the loop re-finds the key. + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + auto desired = bucket_slots[target_idx]; + // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s + // atomicity does nothing here and is used only because `Op`'s signature requires one. The + // real atomic is the full-slot CAS below. + op(cuda::atomic_ref{desired.second}, val.second); + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return false; } + continue; + } else { + if (group.thread_rank() == src_lane) { + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } + op(cuda::atomic_ref{slot_ptr->second}, val.second); } - op(cuda::atomic_ref{slot_ptr->second}, val.second); + return false; } - return false; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); @@ -1133,6 +1220,10 @@ class operator_impl< switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + continue; // key present now: re-probe, find it EQUAL, apply via the full-slot CAS + } if (group.thread_rank() == src_lane) { if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index 0b3604528..9f2769a56 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -79,6 +80,12 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + SECTION("All inserted keys should be contained.") { REQUIRE(map.count(keys_begin, keys_begin + num_keys) == num_keys); @@ -131,12 +138,19 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -145,9 +159,12 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_map #include #include @@ -73,6 +74,12 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout invariant + // (a no-op for linear/double hashing). + if constexpr (cuco::is_robin_hood_probing::value) { + cuco::test::check_robin_hood_invariant(map); + } + SECTION("All inserted keys should be correctly recovered during find") { map.find(keys_begin, keys_begin + num_keys, d_results.begin()); @@ -163,12 +170,30 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional: RH displacement swaps into an *occupied* slot, which requires a packed CAS of + // the whole slot. The wider-slot RH rows therefore live under CUCO_HAS_128BIT_ATOMICS below + // (LP/DH avoid this because they only ever CAS into empty slots via back-to-back CAS). + (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 1), + (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 2), + (int8_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), + (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 1), + (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -177,9 +202,12 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; constexpr size_type gold_capacity = [&]() { if constexpr (cuco::is_double_hashing::value) { diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 17665eb2b..d380bf724 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -15,6 +15,7 @@ * limitations under the License. */ +#include #include #include @@ -52,12 +53,23 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -66,9 +78,12 @@ TEMPLATE_TEST_CASE_SIG( #endif { using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; constexpr size_type num_keys{400}; @@ -100,6 +115,12 @@ TEMPLATE_TEST_CASE_SIG( map.insert_and_find(pairs_begin, pairs_begin + num_keys, found2.begin(), inserted.begin()); REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + // both found1 and found2 should be same, as keys will be referring to same slot REQUIRE( cuco::test::equal(found1.begin(), found1.end(), found2.begin(), cuda::std::equal_to{})); diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 0a4d07ea3..6bc151b17 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -54,6 +55,12 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key map.insert_or_apply(pairs_begin, pairs_begin + num_keys, plus_op); } + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + REQUIRE(map.size() == num_unique_keys); thrust::device_vector d_keys(num_unique_keys); @@ -161,12 +168,23 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -174,9 +192,12 @@ TEMPLATE_TEST_CASE_SIG( constexpr size_type num_unique_keys{100}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; using map_type = cuco::static_map(map, num_keys, num_unique_keys, static_cast(0)); } } + +// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled upstream, +// so this is the only active exercise of the displacing RH insert_or_apply (reduction + lock-free +// displacement together). It runs at a high load factor (~0.95 on the unique keys) so displacement +// actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural RH invariant. +TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_unique_keys = 5'000; + constexpr size_type num_keys = 10'000; // each unique key inserted twice + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing insert + // path (and the structural invariant check inside the helper) is genuinely stressed. + constexpr size_type capacity = static_cast(num_unique_keys / 0.95); + + SECTION("sentinel equals init; has_init = true") + { + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); + } + SECTION("sentinel equals init; has_init = false") + { + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); + } +} diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 2cccd62ad..067224e62 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -21,9 +22,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -53,6 +56,12 @@ void test_insert_or_assign(Map& map, size_type num_keys) map.insert_or_assign(query_pairs_begin, query_pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + auto const updated_size = map.size(); // all keys are present in the map so the size shouldn't change REQUIRE(updated_size == initial_size); @@ -93,21 +102,35 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { constexpr size_type num_keys{400}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_mapkey value swap from a misplaced assign). +TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_assign (high load)", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_unique_keys = 5'000; + constexpr size_type num_keys = 10'000; // each unique key assigned twice (same value) + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + // ~0.95 load on the unique keys, so the table is nearly full and displacement is stressed. + constexpr size_type capacity = static_cast(num_unique_keys / 0.95); + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + // Every occurrence of key k carries the same value (k * 2). + auto pairs_begin = cuda::make_transform_iterator( + cuda::counting_iterator(0), + cuda::proclaim_return_type>([num_unique_keys] __device__(auto i) { + auto const k = static_cast(i % num_unique_keys); + return cuco::pair{k, static_cast(k * 2)}; + })); + + map.insert_or_assign(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.size() == num_unique_keys); + cuco::test::check_robin_hood_invariant(map); + + thrust::device_vector d_keys(num_unique_keys); + thrust::device_vector d_values(num_unique_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + auto const zip = thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); + REQUIRE(cuco::test::all_of( + zip, zip + num_unique_keys, cuda::proclaim_return_type([] __device__(auto const& p) { + return cuda::std::get<1>(p) == static_cast(cuda::std::get<0>(p) * 2); + }))); +} diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh new file mode 100644 index 000000000..5d44be7e6 --- /dev/null +++ b/tests/static_map/robin_hood_invariant.cuh @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace cuco { +namespace test { + +// Per-probe-step Robin Hood layout check. The unit is the *stride group* of `cg_size * bucket_size` +// contiguous slots that one probing step examines -- a single bucket for scalar probing, the whole +// cooperative-group window for CG probing. Within a stride group the slot order is free (the probe +// step distance is identical for every slot in it, so the intra-group offset cancels in all +// comparisons), so the invariant is only meaningful *between* groups. For each occupied group `g` +// (with predecessor `pg`), the resident probe-step distances ("ages") must satisfy: +// +// (1) Contiguity. If `g` holds any overflowed resident (distance >= 1), `pg` must be full -- +// otherwise that resident would have stopped in `pg`'s free slot instead of probing past it. +// (2) Balance. No resident of `pg` may be more than one probing step *richer* than the poorest +// resident of `g` (`min_age(pg) >= max_age(g) - 1`) -- otherwise the poorest resident of `g` +// should have displaced it. This is the property that distinguishes Robin Hood from plain +// linear probing, and (via condition 1) it inductively forces the whole home-to-position run +// to be full. +// +// `probe_distance` is reused here -- it is exercised independently by the utility probe-distance +// test, so a bug in *insert* (a layout that violates the invariant) is still caught. +template +__global__ void robin_hood_invariant_kernel(Ref ref, int* violations) +{ + using size_type = typename Ref::size_type; + constexpr int bs = Ref::bucket_size; + constexpr int stride = Ref::cg_size * Ref::bucket_size; + auto const storage_ref = ref.storage_ref(); + auto const slots = storage_ref.data(); + auto const num_groups = storage_ref.capacity() / stride; + auto const extent = storage_ref.extent(); + auto const empty_key = ref.empty_key_sentinel(); + auto const scheme = ref.probing_scheme(); + + for (size_type g = blockIdx.x * blockDim.x + threadIdx.x; g < num_groups; + g += gridDim.x * blockDim.x) { + int occupied_g = 0; + size_type max_age_g = 0; + for (int s = 0; s < stride; ++s) { + auto const slot = slots[g * stride + s]; + if (slot.first != empty_key) { + ++occupied_g; + auto const age = scheme.template probe_distance( + slot.first, static_cast(g * stride + s), extent); + if (age > max_age_g) { max_age_g = age; } + } + } + if (occupied_g == 0) { continue; } + + size_type const pg = (g + num_groups - 1) % num_groups; + int occupied_p = 0; + size_type min_age_p = 0; + for (int s = 0; s < stride; ++s) { + auto const slot = slots[pg * stride + s]; + if (slot.first != empty_key) { + auto const age = scheme.template probe_distance( + slot.first, static_cast(pg * stride + s), extent); + if (occupied_p == 0 || age < min_age_p) { min_age_p = age; } + ++occupied_p; + } + } + + if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) + if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) + } +} + +// Asserts that a populated Robin Hood `map` satisfies the per-bucket layout invariant above. No-op +// to call only on Robin Hood maps -- `probe_distance` exists only on `robin_hood_probing`, so guard +// the call site with `cuco::is_robin_hood_probing<...>`. +template +void check_robin_hood_invariant(Map& map) +{ + auto const ref = map.ref(cuco::op::find); + + thrust::device_vector d_violations(1, 0); + auto constexpr block_size = 128; + auto const grid_size = (map.capacity() + block_size - 1) / block_size; + robin_hood_invariant_kernel<<>>( + ref, thrust::raw_pointer_cast(d_violations.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(d_violations[0] == 0); +} + +} // namespace test +} // namespace cuco diff --git a/tests/test_utils.hpp b/tests/test_utils.hpp index 2c77eda36..d71336f8b 100644 --- a/tests/test_utils.hpp +++ b/tests/test_utils.hpp @@ -33,7 +33,7 @@ namespace cg = cooperative_groups; constexpr int32_t block_size = 128; -enum class probe_sequence { linear_probing, double_hashing }; +enum class probe_sequence { linear_probing, double_hashing, robin_hood }; // User-defined logical algorithms to reduce compilation time template From d2341e53ce74f6c60a20a1723a59e8f7aac17f6f Mon Sep 17 00:00:00 2001 From: aterenin Date: Fri, 5 Jun 2026 15:53:47 -0400 Subject: [PATCH 09/15] Add Robin Hood erase and tests. --- .../open_addressing_ref_impl.cuh | 196 +++++++++++--- .../cuco/detail/static_map/static_map_ref.inl | 84 ++++-- tests/static_map/erase_test.cu | 254 +++++++++++++++++- tests/static_map/robin_hood_invariant.cuh | 16 +- 4 files changed, 474 insertions(+), 76 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 7fc5daa59..e64efb43d 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -400,7 +400,13 @@ class open_addressing_ref_impl // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { return false; } } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone carries an age and is handled as a + // resident by the displacement test below. Skipping it must gate the CAS (once claimed it is + // already consumed), so it is folded into this condition. For non-Robin-Hood the second + // clause is a compile-time `false`, leaving the original `eq_res == AVAILABLE`. + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + this->is_erased(slot_content))) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); switch (attempt_insert( this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val)) { @@ -431,20 +437,22 @@ class open_addressing_ref_impl } // Robin Hood swap test. A resident "richer" than the in-flight pair (a smaller probe - // distance than our current probe step) is displaced: we swap our pair into its slot, - // adopt the evicted resident, and re-probe forward. + // distance than our current probe step) is displaced: we swap our pair into its slot, adopt + // the evicted resident, and re-probe forward. A tombstone is treated as a resident too -- + // its age comes from its payload (`robin_hood_age`) -- but picking one up *consumes* it: we + // take the slot and are done, since there is nothing to carry forward. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(slot_content)) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - auto const evicted_age = probing_scheme_.template probe_distance( - this->extract_key(slot_content), - static_cast(*probing_iter + intra_bucket_index), - storage_ref_.extent()); + auto const evicted_age = this->robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { if (this->attempt_insert( this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (this->is_erased(slot_content)) { return true; } // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is // `evicted_age`, and it may belong in another slot of the same bucket: an empty one, // or one holding an even-richer resident it can displace in turn. Re-reading the @@ -506,9 +514,18 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { switch (this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()]))) { - case detail::equal_result::AVAILABLE: - result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + case detail::equal_result::AVAILABLE: { + // Robin Hood: only a true empty is AVAILABLE; a tombstone is a resident handled by + // the displacement scan below, so leave it UNEQUAL here. + bool empty_slot = true; + if constexpr (cuco::is_robin_hood_probing::value) { + empty_slot = not this->is_erased(bucket_slots[i()]); + } + if (empty_slot) { + result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + } break; + } case detail::equal_result::EQUAL: { if constexpr (!allows_duplicates) { result = bucket_probing_results{detail::equal_result::EQUAL, i()}; @@ -565,10 +582,10 @@ class open_addressing_ref_impl size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref_.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = + this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -589,6 +606,8 @@ class open_addressing_ref_impl attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); } if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (group.shfl(this->is_erased(evicted), src_lane)) { return true; } // Broadcast the evicted pair and its probe distance from the winning lane, and adopt // it on every lane (all lanes need the new in-flight pair for the next scan). auto const new_key = group.shfl(this->extract_key(evicted), src_lane); @@ -666,7 +685,11 @@ class open_addressing_ref_impl this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the + // displacement test below (see `insert`). + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + this->is_erased(bucket_slots[i]))) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { // The in-flight pair is placed in an empty slot, ending any displacement chain. The @@ -696,16 +719,23 @@ class open_addressing_ref_impl } } - // Robin Hood swap test (see `insert` for the full rationale). + // Robin Hood swap test (see `insert` for the full rationale). A tombstone is a resident too + // (age from its payload); picking one up consumes it -- the in-flight pair lands there and we + // are done. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { - auto const evicted_age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i]), - static_cast(*probing_iter + i), - storage_ref_.extent()); + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(bucket_slots[i])) { + auto const evicted_age = + this->robin_hood_age(bucket_slots[i], static_cast(*probing_iter + i)); if (evicted_age < probe_step) { if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == insert_result::SUCCESS) { + if (this->is_erased(bucket_slots[i])) { + // Consumed a tombstone: the in-flight pair is placed here; return the original + // key's slot (this one if it was never displaced). + auto* result_ptr = (placed_ptr != nullptr) ? placed_ptr : slot_ptr; + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; + } if (placed_ptr == nullptr) { placed_ptr = slot_ptr; } // original key's slot val = cuda::std::bit_cast(bucket_slots[i]); key = this->extract_key(val); @@ -774,6 +804,13 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { auto res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()])); + // Robin Hood: a tombstone is a resident handled by the displacement scan below, not + // AVAILABLE, so leave it UNEQUAL here. + if constexpr (cuco::is_robin_hood_probing::value) { + if (res == detail::equal_result::AVAILABLE and this->is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } } }); @@ -827,10 +864,10 @@ class open_addressing_ref_impl size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref_.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = + this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -852,6 +889,15 @@ class open_addressing_ref_impl } if (group.shfl(status, src_lane) == insert_result::SUCCESS) { if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } + // Consumed a tombstone: the in-flight pair is placed in its slot; we are done. Return + // the original key's slot (`placed_ptr`, which is this slot if it was never displaced). + if (group.shfl(this->is_erased(evicted), src_lane)) { + if (group.thread_rank() == src_lane) { + this->maybe_wait_for_payload(reinterpret_cast(displaced)); + } + group.sync(); + return {iterator{reinterpret_cast(placed_ptr)}, true}; + } auto const new_key = group.shfl(this->extract_key(evicted), src_lane); auto const new_age = group.shfl(evicted_age, src_lane); value_type evicted_slot; @@ -905,9 +951,15 @@ class open_addressing_ref_impl // Key exists, return true if successfully deleted if (eq_res == detail::equal_result::EQUAL) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - switch (attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - slot_content, - this->erased_slot_sentinel())) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use + // the plain erased sentinel. + value_type erased = this->erased_slot_sentinel(); + if constexpr (cuco::is_robin_hood_probing::value) { + erased = this->robin_hood_erased_sentinel( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + } + switch (attempt_insert_stable( + this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, erased)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: return false; default: continue; @@ -956,12 +1008,19 @@ class open_addressing_ref_impl auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - auto const status = - (group.thread_rank() == src_lane) - ? attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - bucket_slots[intra_bucket_index], - this->erased_slot_sentinel()) - : insert_result::CONTINUE; + auto status = insert_result::CONTINUE; + if (group.thread_rank() == src_lane) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use + // the plain erased sentinel. + value_type erased = this->erased_slot_sentinel(); + if constexpr (cuco::is_robin_hood_probing::value) { + erased = this->robin_hood_erased_sentinel( + bucket_slots[intra_bucket_index], + static_cast(*probing_iter + intra_bucket_index)); + } + status = attempt_insert_stable( + this->get_slot_ptr(*probing_iter, intra_bucket_index), bucket_slots[intra_bucket_index], erased); + } switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; @@ -1871,15 +1930,76 @@ class open_addressing_ref_impl { bool richer = false; cuda::static_for([&](auto i) { - auto const resident_age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i()]), - static_cast(bucket_base + i()), - storage_ref_.extent()); + auto const resident_age = + this->robin_hood_age(bucket_slots[i()], static_cast(bucket_base + i())); if (resident_age < probe_step) { richer = true; } }); return richer; } + /** + * @brief Whether `slot` holds a tombstone (erased marker). + * + * @note Returns false when erase is disabled (the erased and empty sentinels coincide, so no slot + * is a tombstone) -- this keeps the test correct even for empty slots. + * + * @param slot The slot to test + * + * @return True if `slot` is an erased tombstone + */ + [[nodiscard]] __device__ bool is_erased(value_type const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(this->erased_key_sentinel(), + this->empty_key_sentinel()) and + cuco::detail::bitwise_compare(this->extract_key(slot), this->erased_key_sentinel()); + } + + /** + * @brief Robin Hood probe distance ("age") of an occupied slot. + * + * A live key's age is its `probe_distance`. A Robin Hood tombstone keeps the age of the key it + * replaced in its payload (the original key is gone and cannot be rehashed; see `erase`), so it + * is read back here -- a tombstone then participates in every Robin Hood comparison exactly like + * the resident it stood in for. + * + * @param slot The (occupied) slot + * @param slot_index The slot's index + * + * @return The slot's probe distance + */ + [[nodiscard]] __device__ size_type robin_hood_age(value_type const& slot, + size_type slot_index) const noexcept + { + if constexpr (has_payload) { + if (this->is_erased(slot)) { return static_cast(this->extract_payload(slot)); } + } + return probing_scheme_.template probe_distance( + this->extract_key(slot), slot_index, storage_ref_.extent()); + } + + /** + * @brief The Robin Hood tombstone for erasing the live key currently in `slot` at `slot_index`. + * + * The erased key's age is stashed in the payload (1a) so the tombstone keeps its place in the + * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes use + * the plain `erased_slot_sentinel()` and never call this. + * + * @param slot The slot's current (live) contents + * @param slot_index The slot's index + * + * @return The value to CAS into the slot to erase it + */ + [[nodiscard]] __device__ value_type robin_hood_erased_sentinel(value_type const& slot, + size_type slot_index) const noexcept + { + static_assert(has_payload, + "Robin Hood erase requires a mapped payload to store the tombstone age"); + auto const age = probing_scheme_.template probe_distance( + this->extract_key(slot), slot_index, storage_ref_.extent()); + return cuco::pair{this->erased_key_sentinel(), + static_castempty_value_sentinel())>(age)}; + } + /** * @brief Extracts the key from a given value type. * diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index df323db6d..767e3b944 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -557,29 +557,33 @@ class operator_impl< } if (eq_res == detail::equal_result::AVAILABLE) { if constexpr (cuco::is_robin_hood_probing::value) { - // Insert the new pair with a full-slot CAS; on a lost CAS (rival insert) or a duplicate, - // retry -- the loop re-finds the key and assigns it via the EQUAL full-slot CAS. - if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == - detail::insert_result::SUCCESS) { - return; + // Claim only a true empty; a tombstone is handled as a resident by the displacement test + // below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds the key + // and assigns it via the EQUAL full-slot CAS. + if (not ref_.impl_.is_erased(slot_content)) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + return; + } + retry = true; + break; } - retry = true; - break; } else { if (attempt_insert_or_assign(slot_ptr, val)) { return; } } } - // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A + // tombstone is a resident too (age from its payload); picking one up consumes it -- the pair + // lands there and we are done. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { - auto const evicted_age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(slot_content), - static_cast(*probing_iter + intra_bucket_index), - storage_ref.extent()); + if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { + auto const evicted_age = ref_.impl_.robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == detail::insert_result::SUCCESS) { + if (ref_.impl_.is_erased(slot_content)) { return; } // consumed tombstone -- done val = cuda::std::bit_cast(slot_content); key = ref_.impl_.extract_key(val); probe_step = evicted_age; @@ -636,6 +640,14 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); + // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + if (res == detail::equal_result::AVAILABLE and + ref_.impl_.is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -698,10 +710,9 @@ class operator_impl< size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). + auto const age = ref_.impl_.robin_hood_age( + bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -720,6 +731,8 @@ class operator_impl< ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); } if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + // Consuming a tombstone reuses its slot -- nothing to carry, so we are done. + if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return; } auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); auto const new_age = group.shfl(evicted_age, src_lane); @@ -1056,7 +1069,11 @@ class operator_impl< return false; } } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the + // displacement test below. + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + ref_.impl_.is_erased(slot_content))) { switch (ref_.template attempt_insert_or_apply( slot_ptr, slot_content, val, op)) { case insert_result::SUCCESS: return true; @@ -1089,16 +1106,18 @@ class operator_impl< } } - // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A + // tombstone is a resident too (age from its payload); picking one up consumes it -- the + // in-flight pair lands there, completing the insert. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { - auto const evicted_age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(slot_content), - static_cast(*probing_iter + intra_bucket_index), - storage_ref.extent()); + if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { + auto const evicted_age = ref_.impl_.robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == detail::insert_result::SUCCESS) { + // Consuming a tombstone places the in-flight pair in its slot -- insert complete. + if (ref_.impl_.is_erased(slot_content)) { return true; } val = cuda::std::bit_cast(slot_content); key = ref_.impl_.extract_key(val); probe_step = evicted_age; @@ -1168,6 +1187,14 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); + // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + if (res == detail::equal_result::AVAILABLE and + ref_.impl_.is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -1241,10 +1268,9 @@ class operator_impl< size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). + auto const age = ref_.impl_.robin_hood_age( + bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -1263,6 +1289,8 @@ class operator_impl< ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); } if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + // Consuming a tombstone places the in-flight pair in its slot -- insert complete. + if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return true; } auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); auto const new_age = group.shfl(evicted_age, src_lane); diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 8bf09abe2..024068242 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -14,10 +14,12 @@ * limitations under the License. */ +#include #include #include #include +#include #include #include @@ -102,21 +104,35 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { constexpr size_type num_keys{1'000'000}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_map>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + constexpr size_type capacity = static_cast(num_keys / 0.85); + auto map = map_type{capacity, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + // Keys 1..num_keys (avoid the -1 / -2 sentinels). + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + constexpr size_type num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of( + d_contained.begin(), d_contained.begin() + num_erased, cuda::std::identity{})); + REQUIRE( + cuco::test::all_of(d_contained.begin() + num_erased, d_contained.end(), cuda::std::identity{})); +} + +// Robin Hood erase reuse + structural invariant at high load: insert, erase half (-> tombstones), +// re-insert (-> consume tombstones), checking the per-bucket Robin Hood layout invariant after each +// step (tombstones counted as residents, age read from their payload). A wrongly consumed tombstone +// (the age inversion) corrupts the layout and trips the invariant; a lost/duplicate key trips +// size/contains. +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_keys = 10'000; + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + constexpr size_type capacity = static_cast(num_keys / 0.85); + auto map = map_type{capacity, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + constexpr size_type num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + cuco::test::check_robin_hood_invariant(map); // tombstones-as-residents layout still valid + + map.insert(pairs_begin, pairs_begin + num_erased); // consume tombstones / fill empties + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); // layout valid after reuse + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); +} + +namespace { +enum class reinsert_via { insert_or_assign, insert_or_apply, insert_and_find }; + +// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the first +// half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume tombstones) +// and check invariant + that every key is present. Exercises the tombstone path of the chosen insert +// variant. +template +void test_rh_erase_reuse(size_type num_keys, reinsert_via how) +{ + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + auto const capacity = static_cast(num_keys / 0.85); + auto map = map_type{capacity, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + auto const num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + cuco::test::check_robin_hood_invariant(map); + + // re-insert the erased half through the variant under test (consuming tombstones) + switch (how) { + case reinsert_via::insert_or_assign: + map.insert_or_assign(pairs_begin, pairs_begin + num_erased); + break; + case reinsert_via::insert_or_apply: + map.insert_or_apply(pairs_begin, pairs_begin + num_erased, cuco::reduce::plus{}); + break; + case reinsert_via::insert_and_find: { + thrust::device_vector found(num_erased); + thrust::device_vector inserted(num_erased); + map.insert_and_find(pairs_begin, pairs_begin + num_erased, found.begin(), inserted.begin()); + break; + } + } + + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); +} +} // namespace + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_assign reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_or_assign); +} + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_apply reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_or_apply); +} + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_and_find reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_and_find); +} diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh index 5d44be7e6..78750c4ad 100644 --- a/tests/static_map/robin_hood_invariant.cuh +++ b/tests/static_map/robin_hood_invariant.cuh @@ -54,6 +54,7 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) auto const num_groups = storage_ref.capacity() / stride; auto const extent = storage_ref.extent(); auto const empty_key = ref.empty_key_sentinel(); + auto const erased_key = ref.erased_key_sentinel(); auto const scheme = ref.probing_scheme(); for (size_type g = blockIdx.x * blockDim.x + threadIdx.x; g < num_groups; @@ -62,10 +63,13 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) size_type max_age_g = 0; for (int s = 0; s < stride; ++s) { auto const slot = slots[g * stride + s]; - if (slot.first != empty_key) { + if (slot.first != empty_key) { // tombstones count as residents (erase enabled => != empty) ++occupied_g; - auto const age = scheme.template probe_distance( - slot.first, static_cast(g * stride + s), extent); + // A tombstone keeps its age in its payload; a live key's age is its probe distance. + auto const age = (slot.first == erased_key) + ? static_cast(slot.second) + : scheme.template probe_distance( + slot.first, static_cast(g * stride + s), extent); if (age > max_age_g) { max_age_g = age; } } } @@ -77,8 +81,10 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) for (int s = 0; s < stride; ++s) { auto const slot = slots[pg * stride + s]; if (slot.first != empty_key) { - auto const age = scheme.template probe_distance( - slot.first, static_cast(pg * stride + s), extent); + auto const age = (slot.first == erased_key) + ? static_cast(slot.second) + : scheme.template probe_distance( + slot.first, static_cast(pg * stride + s), extent); if (occupied_p == 0 || age < min_age_p) { min_age_p = age; } ++occupied_p; } From 1f3d64e4eb072741f5bb5b7b032741b6b9506480 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:56:32 +0000 Subject: [PATCH 10/15] [pre-commit.ci] auto code formatting --- .../open_addressing_ref_impl.cuh | 76 ++++++++++--------- .../probing_scheme/probing_scheme_impl.inl | 6 +- .../cuco/detail/static_map/static_map_ref.inl | 51 +++++++------ include/cuco/probing_scheme.cuh | 8 +- tests/static_map/contains_test.cu | 2 +- tests/static_map/erase_test.cu | 28 +++---- tests/static_map/find_test.cu | 6 +- tests/static_map/insert_and_find_test.cu | 2 +- tests/static_map/insert_or_apply_test.cu | 15 ++-- tests/static_map/insert_or_assign_test.cu | 6 +- .../probing_scheme_invariants_test.cu | 22 +++--- tests/static_map/robin_hood_invariant.cuh | 6 +- 12 files changed, 113 insertions(+), 115 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index e64efb43d..0c63eef8b 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -401,8 +401,8 @@ class open_addressing_ref_impl if (eq_res == detail::equal_result::EQUAL) { return false; } } // Robin Hood claims only a true empty here; a tombstone carries an age and is handled as a - // resident by the displacement test below. Skipping it must gate the CAS (once claimed it is - // already consumed), so it is folded into this condition. For non-Robin-Hood the second + // resident by the displacement test below. Skipping it must gate the CAS (once claimed it + // is already consumed), so it is folded into this condition. For non-Robin-Hood the second // clause is a compile-time `false`, leaving the original `eq_res == AVAILABLE`. if (eq_res == detail::equal_result::AVAILABLE and not(cuco::is_robin_hood_probing::value and @@ -448,25 +448,26 @@ class open_addressing_ref_impl auto const evicted_age = this->robin_hood_age( slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { - if (this->attempt_insert( - this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val) == - insert_result::SUCCESS) { + if (this->attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), + slot_content, + val) == insert_result::SUCCESS) { // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. if (this->is_erased(slot_content)) { return true; } // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is - // `evicted_age`, and it may belong in another slot of the same bucket: an empty one, - // or one holding an even-richer resident it can displace in turn. Re-reading the - // bucket (rather than advancing past it) is the within-bucket linear probe, i.e. the - // combined bucket+slot distance that makes displacement correct for bucket_size > 1. - // The `slot_distance` term cancels in every comparison, so it never appears here; it - // shows up only as this slot-by-slot continuation. `bit_cast` keeps the adoption - // valid for heterogeneous insert types (layout-compatible by contract; identity in - // the common case). + // `evicted_age`, and it may belong in another slot of the same bucket: an empty + // one, or one holding an even-richer resident it can displace in turn. Re-reading + // the bucket (rather than advancing past it) is the within-bucket linear probe, + // i.e. the combined bucket+slot distance that makes displacement correct for + // bucket_size > 1. The `slot_distance` term cancels in every comparison, so it + // never appears here; it shows up only as this slot-by-slot continuation. + // `bit_cast` keeps the adoption valid for heterogeneous insert types + // (layout-compatible by contract; identity in the common case). val = cuda::std::bit_cast(slot_content); key = this->extract_key(val); probe_step = evicted_age; } - retry = true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS + retry = + true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS break; } } @@ -584,8 +585,8 @@ class open_addressing_ref_impl if (displace_idx < 0) { // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = - this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -666,8 +667,8 @@ class open_addressing_ref_impl probing_scheme_.template make_iterator(key, storage_ref_.extent()); auto const init_idx = *probing_iter; [[maybe_unused]] size_type probe_step = 0; - // Robin Hood may displace the original key before the chain ends; remember the slot it landed in - // so we return an iterator to it (not to a later victim's slot). + // Robin Hood may displace the original key before the chain ends; remember the slot it landed + // in so we return an iterator to it (not to a later victim's slot). [[maybe_unused]] value_type* placed_ptr = nullptr; while (true) { @@ -720,15 +721,14 @@ class open_addressing_ref_impl } // Robin Hood swap test (see `insert` for the full rationale). A tombstone is a resident too - // (age from its payload); picking one up consumes it -- the in-flight pair lands there and we - // are done. + // (age from its payload); picking one up consumes it -- the in-flight pair lands there and + // we are done. if constexpr (cuco::is_robin_hood_probing::value) { if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(bucket_slots[i])) { auto const evicted_age = this->robin_hood_age(bucket_slots[i], static_cast(*probing_iter + i)); if (evicted_age < probe_step) { - if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == - insert_result::SUCCESS) { + if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == insert_result::SUCCESS) { if (this->is_erased(bucket_slots[i])) { // Consumed a tombstone: the in-flight pair is placed here; return the original // key's slot (this one if it was never displaced). @@ -866,8 +866,8 @@ class open_addressing_ref_impl if (displace_idx < 0) { // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = - this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -890,7 +890,8 @@ class open_addressing_ref_impl if (group.shfl(status, src_lane) == insert_result::SUCCESS) { if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } // Consumed a tombstone: the in-flight pair is placed in its slot; we are done. Return - // the original key's slot (`placed_ptr`, which is this slot if it was never displaced). + // the original key's slot (`placed_ptr`, which is this slot if it was never + // displaced). if (group.shfl(this->is_erased(evicted), src_lane)) { if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(reinterpret_cast(displaced)); @@ -951,8 +952,8 @@ class open_addressing_ref_impl // Key exists, return true if successfully deleted if (eq_res == detail::equal_result::EQUAL) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use - // the plain erased sentinel. + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. value_type erased = this->erased_slot_sentinel(); if constexpr (cuco::is_robin_hood_probing::value) { erased = this->robin_hood_erased_sentinel( @@ -1010,16 +1011,17 @@ class open_addressing_ref_impl auto const src_lane = __ffs(group_contains_equal) - 1; auto status = insert_result::CONTINUE; if (group.thread_rank() == src_lane) { - // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use - // the plain erased sentinel. + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. value_type erased = this->erased_slot_sentinel(); if constexpr (cuco::is_robin_hood_probing::value) { erased = this->robin_hood_erased_sentinel( bucket_slots[intra_bucket_index], static_cast(*probing_iter + intra_bucket_index)); } - status = attempt_insert_stable( - this->get_slot_ptr(*probing_iter, intra_bucket_index), bucket_slots[intra_bucket_index], erased); + status = attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + bucket_slots[intra_bucket_index], + erased); } switch (group.shfl(status, src_lane)) { @@ -1925,8 +1927,8 @@ class open_addressing_ref_impl */ template [[nodiscard]] __device__ bool robin_hood_proves_absent(BucketSlots const& bucket_slots, - size_type bucket_base, - size_type probe_step) const noexcept + size_type bucket_base, + size_type probe_step) const noexcept { bool richer = false; cuda::static_for([&](auto i) { @@ -1981,16 +1983,16 @@ class open_addressing_ref_impl * @brief The Robin Hood tombstone for erasing the live key currently in `slot` at `slot_index`. * * The erased key's age is stashed in the payload (1a) so the tombstone keeps its place in the - * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes use - * the plain `erased_slot_sentinel()` and never call this. + * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes + * use the plain `erased_slot_sentinel()` and never call this. * * @param slot The slot's current (live) contents * @param slot_index The slot's index * * @return The value to CAS into the slot to erase it */ - [[nodiscard]] __device__ value_type robin_hood_erased_sentinel(value_type const& slot, - size_type slot_index) const noexcept + [[nodiscard]] __device__ value_type + robin_hood_erased_sentinel(value_type const& slot, size_type slot_index) const noexcept { static_assert(has_payload, "Robin Hood erase requires a mapped payload to store the tombstone age"); diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 8b93076c3..2dbb9bdcb 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -231,11 +231,7 @@ namespace detail { * @param upper_bound Upper bound of the iteration * @return The resident's probe distance, in probing steps */ -template +template [[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( linear_probing const& scheme, ProbeKey resident_key, diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 767e3b944..b71a40f9f 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -538,7 +538,8 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - if constexpr (cuco::is_robin_hood_probing::value) { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { // Robin Hood may relocate this key; assign via a full-slot CAS that keeps the key and // fails if it moved. On failure re-probe and retry -- the loop re-finds the key. auto desired = slot_content; @@ -556,10 +557,11 @@ class operator_impl< } } if (eq_res == detail::equal_result::AVAILABLE) { - if constexpr (cuco::is_robin_hood_probing::value) { - // Claim only a true empty; a tombstone is handled as a resident by the displacement test - // below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds the key - // and assigns it via the EQUAL full-slot CAS. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Claim only a true empty; a tombstone is handled as a resident by the displacement + // test below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds + // the key and assigns it via the EQUAL full-slot CAS. if (not ref_.impl_.is_erased(slot_content)) { if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == detail::insert_result::SUCCESS) { @@ -574,8 +576,8 @@ class operator_impl< } // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A - // tombstone is a resident too (age from its payload); picking one up consumes it -- the pair - // lands there and we are done. + // tombstone is a resident too (age from its payload); picking one up consumes it -- the + // pair lands there and we are done. if constexpr (cuco::is_robin_hood_probing::value) { if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { auto const evicted_age = ref_.impl_.robin_hood_age( @@ -640,7 +642,8 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); - // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + // Robin Hood: a tombstone is a resident handled by the displacement scan, not + // AVAILABLE. if constexpr (cuco::is_robin_hood_probing< typename ref_type::probing_scheme_type>::value) { if (res == detail::equal_result::AVAILABLE and @@ -662,8 +665,8 @@ class operator_impl< if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; if constexpr (cuco::is_robin_hood_probing::value) { - // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails it, - // so the group re-probes and retries -- the loop re-finds the key. + // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails + // it, so the group re-probes and retries -- the loop re-finds the key. auto const success = [&, target_idx = intra_bucket_index]() { if (group.thread_rank() != src_lane) { return false; } auto desired = bucket_slots[target_idx]; @@ -687,8 +690,8 @@ class operator_impl< if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; if constexpr (cuco::is_robin_hood_probing::value) { - // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and retry - // (the loop re-finds the key and assigns via the EQUAL full-slot CAS). + // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and + // retry (the loop re-finds the key and assigns via the EQUAL full-slot CAS). auto const success = [&, target_idx = intra_bucket_index]() { if (group.thread_rank() != src_lane) { return false; } return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], val) == @@ -1050,9 +1053,9 @@ class operator_impl< // Lift `op` to the whole slot, keeping the key, and CAS it. A relocation (or a rival // update) makes the CAS fail; re-probe and retry -- the loop re-finds the key. auto desired = slot_content; - // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s - // atomicity does nothing here and is used only because `Op`'s signature requires one. The - // real atomic is the full-slot CAS below. + // `desired` is a local copy, so this `op` is just local arithmetic -- the + // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature + // requires one. The real atomic is the full-slot CAS below. op(cuda::atomic_ref{desired.second}, val.second); if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == detail::insert_result::SUCCESS) { @@ -1080,7 +1083,8 @@ class operator_impl< case insert_result::DUPLICATE: { if constexpr (cuco::is_robin_hood_probing< typename ref_type::probing_scheme_type>::value) { - // Key is present now; re-probe so it is found EQUAL and updated via the full-slot CAS. + // Key is present now; re-probe so it is found EQUAL and updated via the full-slot + // CAS. retry = true; break; } @@ -1101,7 +1105,8 @@ class operator_impl< } } } - if constexpr (cuco::is_robin_hood_probing::value) { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { if (retry) { break; } } } @@ -1187,7 +1192,8 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); - // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + // Robin Hood: a tombstone is a resident handled by the displacement scan, not + // AVAILABLE. if constexpr (cuco::is_robin_hood_probing< typename ref_type::probing_scheme_type>::value) { if (res == detail::equal_result::AVAILABLE and @@ -1208,16 +1214,15 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { + if constexpr (cuco::is_robin_hood_probing::value) { // src_lane lifts `op` to the slot (key fixed) and CASes it; a relocation or rival update // fails the CAS, so the group re-probes and retries -- the loop re-finds the key. auto const success = [&, target_idx = intra_bucket_index]() { if (group.thread_rank() != src_lane) { return false; } auto desired = bucket_slots[target_idx]; - // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s - // atomicity does nothing here and is used only because `Op`'s signature requires one. The - // real atomic is the full-slot CAS below. + // `desired` is a local copy, so this `op` is just local arithmetic -- the + // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature + // requires one. The real atomic is the full-slot CAS below. op(cuda::atomic_ref{desired.second}, val.second); return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == detail::insert_result::SUCCESS; diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index f74c8fe80..3bf8b98c2 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -233,10 +233,10 @@ class double_hashing : private detail::probing_scheme_base { template class robin_hood_probing : private Underlying { public: - using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) - using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) - using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys - using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence + using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) + using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) + using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys + using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence /** * @brief Constructs a Robin Hood probing scheme wrapping the given underlying scheme. diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index 9f2769a56..9c1312482 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -30,6 +29,7 @@ #include #include +#include using size_type = int32_t; diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 024068242..5f2942ef1 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -26,6 +25,7 @@ #include #include +#include using size_type = int32_t; @@ -180,10 +180,8 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase read-path", cuco::storage<2>>; constexpr size_type capacity = static_cast(num_keys / 0.85); - auto map = map_type{capacity, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; // Keys 1..num_keys (avoid the -1 / -2 sentinels). auto keys_begin = cuda::counting_iterator(1); @@ -237,10 +235,8 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", cuco::storage<2>>; constexpr size_type capacity = static_cast(num_keys / 0.85); - auto map = map_type{capacity, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; auto keys_begin = cuda::counting_iterator(1); auto pairs_begin = cuda::make_transform_iterator( @@ -269,10 +265,10 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", namespace { enum class reinsert_via { insert_or_assign, insert_or_apply, insert_and_find }; -// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the first -// half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume tombstones) -// and check invariant + that every key is present. Exercises the tombstone path of the chosen insert -// variant. +// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the +// first half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume +// tombstones) and check invariant + that every key is present. Exercises the tombstone path of the +// chosen insert variant. template void test_rh_erase_reuse(size_type num_keys, reinsert_via how) { @@ -287,10 +283,8 @@ void test_rh_erase_reuse(size_type num_keys, reinsert_via how) cuco::storage<2>>; auto const capacity = static_cast(num_keys / 0.85); - auto map = map_type{capacity, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; auto keys_begin = cuda::counting_iterator(1); auto pairs_begin = cuda::make_transform_iterator( diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index e9fbe3b48..7d6b85ed7 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -31,6 +30,7 @@ #include #include +#include using size_type = int32_t; @@ -74,8 +74,8 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); - // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout invariant - // (a no-op for linear/double hashing). + // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout + // invariant (a no-op for linear/double hashing). if constexpr (cuco::is_robin_hood_probing::value) { cuco::test::check_robin_hood_invariant(map); } diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index d380bf724..3010c3333 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -15,7 +15,6 @@ * limitations under the License. */ -#include #include #include @@ -27,6 +26,7 @@ #include #include +#include using size_type = std::size_t; diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 6bc151b17..692e7af31 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -28,6 +27,7 @@ #include #include +#include #include @@ -310,10 +310,11 @@ TEMPLATE_TEST_CASE_SIG( } } -// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled upstream, -// so this is the only active exercise of the displacing RH insert_or_apply (reduction + lock-free -// displacement together). It runs at a high load factor (~0.95 on the unique keys) so displacement -// actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural RH invariant. +// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled +// upstream, so this is the only active exercise of the displacing RH insert_or_apply (reduction + +// lock-free displacement together). It runs at a high load factor (~0.95 on the unique keys) so +// displacement actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural +// RH invariant. TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", "", ((typename Key, typename Value, int CGSize), Key, Value, CGSize), @@ -339,8 +340,8 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", cuco::cuda_allocator, cuco::storage<2>>; - // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing insert - // path (and the structural invariant check inside the helper) is genuinely stressed. + // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing + // insert path (and the structural invariant check inside the helper) is genuinely stressed. constexpr size_type capacity = static_cast(num_unique_keys / 0.95); SECTION("sentinel equals init; has_init = true") diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 067224e62..66b1359a7 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -30,6 +29,7 @@ #include #include +#include using size_type = std::size_t; @@ -146,8 +146,8 @@ TEMPLATE_TEST_CASE_SIG( } // Dedicated Robin Hood coverage for *concurrent* insert + assign. The probe-enum test above is -// phased (insert all keys, then assign in a separate pass), so it never exercises an assign racing a -// displacement -- the hazard RH introduces. Here every occurrence of key k assigns the same value +// phased (insert all keys, then assign in a separate pass), so it never exercises an assign racing +// a displacement -- the hazard RH introduces. Here every occurrence of key k assigns the same value // (k * 2), so the final value is deterministic regardless of ordering; but a displacement-vs-assign // race would land an assign on a different key's slot and corrupt it. Runs at ~0.95 load in one // concurrent pass with duplicates, and verifies each key's value individually (a value-multiset diff --git a/tests/static_map/probing_scheme_invariants_test.cu b/tests/static_map/probing_scheme_invariants_test.cu index 2fb4bdd88..277ec1921 100644 --- a/tests/static_map/probing_scheme_invariants_test.cu +++ b/tests/static_map/probing_scheme_invariants_test.cu @@ -44,21 +44,21 @@ struct identity_hash { // then queried with keys chosen to exercise each lookup-termination rule. TEST_CASE("static_map robin_hood read-path early-exit", "") { - using Key = std::int32_t; - using Value = std::int32_t; - using size_type = std::int32_t; - auto constexpr capacity = size_type{16}; + using Key = std::int32_t; + using Value = std::int32_t; + using size_type = std::int32_t; + auto constexpr capacity = size_type{16}; using extent_type = cuco::extent; using probe_type = cuco::robin_hood_probing>>; using map_type = cuco::static_map, - probe_type, - cuco::cuda_allocator, - cuco::storage<1>>; + Value, + extent_type, + cuda::thread_scope_device, + cuda::std::equal_to, + probe_type, + cuco::cuda_allocator, + cuco::storage<1>>; using value_type = typename map_type::value_type; // cuco::pair auto map = map_type{extent_type{}, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh index 78750c4ad..34165435c 100644 --- a/tests/static_map/robin_hood_invariant.cuh +++ b/tests/static_map/robin_hood_invariant.cuh @@ -75,7 +75,7 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) } if (occupied_g == 0) { continue; } - size_type const pg = (g + num_groups - 1) % num_groups; + size_type const pg = (g + num_groups - 1) % num_groups; int occupied_p = 0; size_type min_age_p = 0; for (int s = 0; s < stride; ++s) { @@ -90,8 +90,8 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) } } - if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) - if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) + if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) + if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) } } From f26495adc0fd4827be0362398b5c3e2b65d43e66 Mon Sep 17 00:00:00 2001 From: aterenin Date: Sat, 6 Jun 2026 12:38:04 -0400 Subject: [PATCH 11/15] Disallow 96-bit slots for Robin Hood. --- .../open_addressing/open_addressing_ref_impl.cuh | 16 ++++++++++++++++ tests/static_map/erase_test.cu | 9 ++++----- tests/static_map/find_test.cu | 9 ++++----- tests/static_map/insert_and_find_test.cu | 9 ++++----- tests/static_map/insert_or_assign_test.cu | 9 ++++----- 5 files changed, 32 insertions(+), 20 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 0c63eef8b..60a845eac 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -130,6 +130,22 @@ class open_addressing_ref_impl storage_ref_type::bucket_size; ///< Number of elements handled per bucket static constexpr auto thread_scope = Scope; ///< CUDA thread scope + // Robin Hood displacement swaps the in-flight pair into an *occupied* slot, which needs a single + // atomic CAS of the whole slot. That requires a packable slot: <= 8 bytes (atom.cas.b64), or + // padding-free and <= 16 bytes on an sm_90+ build (atom.cas.b128). A non-packable slot (e.g. a + // padded `pair`) would fall back to a split key/value CAS, which cannot move an + // occupied slot -- displacement would livelock. Reject it at compile time rather than hang. + static constexpr bool robin_hood_slot_is_single_cas = sizeof(value_type) <= 8 +#if defined(CUCO_HAS_128BIT_ATOMICS) + or cuco::detail::is_packable() +#endif + ; + static_assert(not cuco::is_robin_hood_probing::value or + robin_hood_slot_is_single_cas, + "Robin Hood probing requires a single-CAS slot: the key+value must fit in 8 bytes, " + "or be packable (padding-free) and <= 16 bytes on an sm_90+ build. A padded slot " + "(e.g. pair) is unsupported -- displacement would livelock."); + /** * @brief Constructs open_addressing_ref_impl. * diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 5f2942ef1..749c9e25e 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -114,11 +114,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index 7d6b85ed7..2dda5e89f 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -187,11 +187,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 3010c3333..be3c55a45 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -63,11 +63,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 66b1359a7..843417072 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -112,11 +112,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif From 826eb845d884aab015242425a5ba1613289b6579 Mon Sep 17 00:00:00 2001 From: aterenin Date: Wed, 24 Jun 2026 17:23:49 -0400 Subject: [PATCH 12/15] Revert to original version. --- .../open_addressing_ref_impl.cuh | 494 ++---------------- .../probing_scheme/probing_scheme_impl.inl | 79 --- .../cuco/detail/static_map/static_map_ref.inl | 355 ++----------- include/cuco/probing_scheme.cuh | 97 ---- tests/CMakeLists.txt | 1 - tests/static_map/contains_test.cu | 27 +- tests/static_map/erase_test.cu | 247 +-------- tests/static_map/find_test.cu | 37 +- tests/static_map/insert_and_find_test.cu | 30 +- tests/static_map/insert_or_apply_test.cu | 77 +-- tests/static_map/insert_or_assign_test.cu | 92 +--- .../probing_scheme_invariants_test.cu | 112 ---- tests/static_map/robin_hood_invariant.cuh | 117 ----- tests/test_utils.hpp | 2 +- tests/utility/probing_scheme_test.cu | 72 --- 15 files changed, 104 insertions(+), 1735 deletions(-) delete mode 100644 tests/static_map/probing_scheme_invariants_test.cu delete mode 100644 tests/static_map/robin_hood_invariant.cuh diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 60a845eac..ada8792bb 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -27,7 +27,6 @@ #include #include -#include #include #include #include @@ -130,22 +129,6 @@ class open_addressing_ref_impl storage_ref_type::bucket_size; ///< Number of elements handled per bucket static constexpr auto thread_scope = Scope; ///< CUDA thread scope - // Robin Hood displacement swaps the in-flight pair into an *occupied* slot, which needs a single - // atomic CAS of the whole slot. That requires a packable slot: <= 8 bytes (atom.cas.b64), or - // padding-free and <= 16 bytes on an sm_90+ build (atom.cas.b128). A non-packable slot (e.g. a - // padded `pair`) would fall back to a split key/value CAS, which cannot move an - // occupied slot -- displacement would livelock. Reject it at compile time rather than hang. - static constexpr bool robin_hood_slot_is_single_cas = sizeof(value_type) <= 8 -#if defined(CUCO_HAS_128BIT_ATOMICS) - or cuco::detail::is_packable() -#endif - ; - static_assert(not cuco::is_robin_hood_probing::value or - robin_hood_slot_is_single_cas, - "Robin Hood probing requires a single-CAS slot: the key+value must fit in 8 bytes, " - "or be packable (padding-free) and <= 16 bytes on an sm_90+ build. A padded slot " - "(e.g. pair) is unsupported -- displacement would livelock."); - /** * @brief Constructs open_addressing_ref_impl. * @@ -395,19 +378,16 @@ class open_addressing_ref_impl { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto val = this->heterogeneous_value(value); - auto key = this->extract_key(val); + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; - [[maybe_unused]] bool retry = false; - for (auto& slot_content : bucket_slots) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(slot_content)); @@ -416,13 +396,7 @@ class open_addressing_ref_impl // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { return false; } } - // Robin Hood claims only a true empty here; a tombstone carries an age and is handled as a - // resident by the displacement test below. Skipping it must gate the CAS (once claimed it - // is already consumed), so it is folded into this condition. For non-Robin-Hood the second - // clause is a compile-time `false`, leaving the original `eq_res == AVAILABLE`. - if (eq_res == detail::equal_result::AVAILABLE and - not(cuco::is_robin_hood_probing::value and - this->is_erased(slot_content))) { + if (eq_res == detail::equal_result::AVAILABLE) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); switch (attempt_insert( this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val)) { @@ -433,68 +407,11 @@ class open_addressing_ref_impl return false; } } - case insert_result::CONTINUE: { - // Retry on a lost CAS. Plain probing keeps scanning this (now stale) bucket; Robin - // Hood must re-read it instead, so the in-flight pair is re-evaluated against the new - // occupants -- otherwise it could be placed past a slot it should have displaced, - // breaking the invariant (and therefore lookups). - if constexpr (cuco::is_robin_hood_probing::value) { - retry = true; - break; - } else { - continue; - } - } + case insert_result::CONTINUE: continue; case insert_result::SUCCESS: return true; } - if constexpr (cuco::is_robin_hood_probing::value) { - if (retry) { break; } // leave the scan to re-read the bucket - } - } - - // Robin Hood swap test. A resident "richer" than the in-flight pair (a smaller probe - // distance than our current probe step) is displaced: we swap our pair into its slot, adopt - // the evicted resident, and re-probe forward. A tombstone is treated as a resident too -- - // its age comes from its payload (`robin_hood_age`) -- but picking one up *consumes* it: we - // take the slot and are done, since there is nothing to carry forward. - if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(slot_content)) { - auto const intra_bucket_index = - cuda::std::distance(bucket_slots.begin(), &slot_content); - auto const evicted_age = this->robin_hood_age( - slot_content, static_cast(*probing_iter + intra_bucket_index)); - if (evicted_age < probe_step) { - if (this->attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), - slot_content, - val) == insert_result::SUCCESS) { - // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. - if (this->is_erased(slot_content)) { return true; } - // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is - // `evicted_age`, and it may belong in another slot of the same bucket: an empty - // one, or one holding an even-richer resident it can displace in turn. Re-reading - // the bucket (rather than advancing past it) is the within-bucket linear probe, - // i.e. the combined bucket+slot distance that makes displacement correct for - // bucket_size > 1. The `slot_distance` term cancels in every comparison, so it - // never appears here; it shows up only as this slot-by-slot continuation. - // `bit_cast` keeps the adoption valid for heterogeneous insert types - // (layout-compatible by contract; identity in the common case). - val = cuda::std::bit_cast(slot_content); - key = this->extract_key(val); - probe_step = evicted_age; - } - retry = - true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS - break; - } - } } } - - if constexpr (cuco::is_robin_hood_probing::value) { - if (retry) { continue; } // re-probe (re-read this bucket, or move on after displacement) - ++probe_step; - } - ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -515,12 +432,11 @@ class open_addressing_ref_impl __device__ bool insert(cooperative_groups::thread_block_tile group, Value value) noexcept { - auto val = this->heterogeneous_value(value); - auto key = this->extract_key(val); + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -531,18 +447,9 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { switch (this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()]))) { - case detail::equal_result::AVAILABLE: { - // Robin Hood: only a true empty is AVAILABLE; a tombstone is a resident handled by - // the displacement scan below, so leave it UNEQUAL here. - bool empty_slot = true; - if constexpr (cuco::is_robin_hood_probing::value) { - empty_slot = not this->is_erased(bucket_slots[i()]); - } - if (empty_slot) { - result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; - } + case detail::equal_result::AVAILABLE: + result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; break; - } case detail::equal_result::EQUAL: { if constexpr (!allows_duplicates) { result = bucket_probing_results{detail::equal_result::EQUAL, i()}; @@ -589,62 +496,6 @@ class open_addressing_ref_impl default: continue; } } else { - if constexpr (cuco::is_robin_hood_probing::value) { - // Robin Hood displacement: no match, no empty slot in this bucket. Displace the first - // resident in probe (lane) order that is richer than the in-flight pair, adopt it, and - // re-probe THIS bucket -- the victim may belong in another slot of it. The within-bucket - // linear probe (combined bucket+slot distance) is identical to the scalar path; the - // `slot_distance` term cancels, so the test is again `resident distance < probe_step`. - cuda::std::int32_t displace_idx = -1; - size_type evicted_age = 0; - cuda::static_for([&] __device__(auto i) { - if (displace_idx < 0) { - // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. - // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = this->robin_hood_age(bucket_slots[i()], - static_cast(*probing_iter + i())); - if (age < probe_step) { - displace_idx = i(); - evicted_age = age; - } - } - }); - - auto const group_displaceable = group.ballot(displace_idx >= 0); - if (group_displaceable) { - auto const src_lane = __ffs(group_displaceable) - 1; - auto status = insert_result::CONTINUE; - // Only `src_lane` reads `evicted` meaningfully; other lanes just need a valid value to - // feed the broadcast `shfl` below, so seed it with the empty-slot sentinel. - value_type evicted = this->empty_slot_sentinel(); - if (group.thread_rank() == src_lane) { - evicted = bucket_slots[displace_idx]; - status = - attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); - } - if (group.shfl(status, src_lane) == insert_result::SUCCESS) { - // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. - if (group.shfl(this->is_erased(evicted), src_lane)) { return true; } - // Broadcast the evicted pair and its probe distance from the winning lane, and adopt - // it on every lane (all lanes need the new in-flight pair for the next scan). - auto const new_key = group.shfl(this->extract_key(evicted), src_lane); - auto const new_age = group.shfl(evicted_age, src_lane); - value_type evicted_slot; - if constexpr (has_payload) { - auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); - evicted_slot = value_type{new_key, new_payload}; - } else { - evicted_slot = new_key; - } - val = cuda::std::bit_cast(evicted_slot); - key = this->extract_key(val); - probe_step = new_age; - } - continue; // success: re-probe this bucket with the victim; lost CAS: re-read it - } - // No displaceable resident: fall through to the shared advance below. - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -677,21 +528,15 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto val = this->heterogeneous_value(value); - auto key = this->extract_key(val); + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; - // Robin Hood may displace the original key before the chain ends; remember the slot it landed - // in so we return an iterator to it (not to a later victim's slot). - [[maybe_unused]] value_type* placed_ptr = nullptr; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; - [[maybe_unused]] bool retry = false; - for (auto i = 0; i < bucket_size; ++i) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i])); @@ -702,72 +547,20 @@ class open_addressing_ref_impl this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the - // displacement test below (see `insert`). - if (eq_res == detail::equal_result::AVAILABLE and - not(cuco::is_robin_hood_probing::value and - this->is_erased(bucket_slots[i]))) { + if (eq_res == detail::equal_result::AVAILABLE) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - // The in-flight pair is placed in an empty slot, ending any displacement chain. The - // iterator to return is the original key's slot (captured on its first placement). - auto* result_ptr = slot_ptr; - if constexpr (cuco::is_robin_hood_probing::value) { - if (placed_ptr != nullptr) { result_ptr = placed_ptr; } - } - this->maybe_wait_for_payload(result_ptr); - return {iterator{result_ptr}, true}; + this->maybe_wait_for_payload(slot_ptr); + return {iterator{slot_ptr}, true}; } case insert_result::DUPLICATE: { this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - case insert_result::CONTINUE: { - if constexpr (cuco::is_robin_hood_probing::value) { - retry = true; - break; - } else { - continue; - } - } - } - if constexpr (cuco::is_robin_hood_probing::value) { - if (retry) { break; } - } - } - - // Robin Hood swap test (see `insert` for the full rationale). A tombstone is a resident too - // (age from its payload); picking one up consumes it -- the in-flight pair lands there and - // we are done. - if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(bucket_slots[i])) { - auto const evicted_age = - this->robin_hood_age(bucket_slots[i], static_cast(*probing_iter + i)); - if (evicted_age < probe_step) { - if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == insert_result::SUCCESS) { - if (this->is_erased(bucket_slots[i])) { - // Consumed a tombstone: the in-flight pair is placed here; return the original - // key's slot (this one if it was never displaced). - auto* result_ptr = (placed_ptr != nullptr) ? placed_ptr : slot_ptr; - this->maybe_wait_for_payload(result_ptr); - return {iterator{result_ptr}, true}; - } - if (placed_ptr == nullptr) { placed_ptr = slot_ptr; } // original key's slot - val = cuda::std::bit_cast(bucket_slots[i]); - key = this->extract_key(val); - probe_step = evicted_age; - } - retry = true; - break; - } + default: continue; } } } - - if constexpr (cuco::is_robin_hood_probing::value) { - if (retry) { continue; } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } }; @@ -801,15 +594,11 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto val = this->heterogeneous_value(value); - auto key = this->extract_key(val); + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; - // Robin Hood may displace the original key before the chain ends; remember (broadcast) the slot - // it first landed in so we return an iterator to it. 0 means "not yet placed". - [[maybe_unused]] intptr_t placed_ptr = 0; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -820,13 +609,6 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { auto res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()])); - // Robin Hood: a tombstone is a resident handled by the displacement scan below, not - // AVAILABLE, so leave it UNEQUAL here. - if constexpr (cuco::is_robin_hood_probing::value) { - if (res == detail::equal_result::AVAILABLE and this->is_erased(bucket_slots[i()])) { - res = detail::equal_result::UNEQUAL; - } - } if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } } }); @@ -856,15 +638,9 @@ class open_addressing_ref_impl switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { - // The in-flight pair is placed in an empty slot, ending any displacement chain. Return - // the original key's slot (the first placement) if it was displaced earlier. - auto result = res; - if constexpr (cuco::is_robin_hood_probing::value) { - if (placed_ptr != 0) { result = placed_ptr; } - } if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); - return {iterator{reinterpret_cast(result)}, true}; + return {iterator{reinterpret_cast(res)}, true}; } case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } @@ -874,64 +650,6 @@ class open_addressing_ref_impl default: continue; } } else { - if constexpr (cuco::is_robin_hood_probing::value) { - // Robin Hood displacement (see CG `insert` for the full rationale). - cuda::std::int32_t displace_idx = -1; - size_type evicted_age = 0; - cuda::static_for([&] __device__(auto i) { - if (displace_idx < 0) { - // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. - // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = this->robin_hood_age(bucket_slots[i()], - static_cast(*probing_iter + i())); - if (age < probe_step) { - displace_idx = i(); - evicted_age = age; - } - } - }); - - auto const group_displaceable = group.ballot(displace_idx >= 0); - if (group_displaceable) { - auto const src_lane = __ffs(group_displaceable) - 1; - auto status = insert_result::CONTINUE; - value_type evicted = this->empty_slot_sentinel(); - intptr_t displaced = 0; - if (group.thread_rank() == src_lane) { - auto* dptr = this->get_slot_ptr(*probing_iter, displace_idx); - evicted = bucket_slots[displace_idx]; - status = attempt_insert(dptr, evicted, val); - displaced = reinterpret_cast(dptr); - } - if (group.shfl(status, src_lane) == insert_result::SUCCESS) { - if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } - // Consumed a tombstone: the in-flight pair is placed in its slot; we are done. Return - // the original key's slot (`placed_ptr`, which is this slot if it was never - // displaced). - if (group.shfl(this->is_erased(evicted), src_lane)) { - if (group.thread_rank() == src_lane) { - this->maybe_wait_for_payload(reinterpret_cast(displaced)); - } - group.sync(); - return {iterator{reinterpret_cast(placed_ptr)}, true}; - } - auto const new_key = group.shfl(this->extract_key(evicted), src_lane); - auto const new_age = group.shfl(evicted_age, src_lane); - value_type evicted_slot; - if constexpr (has_payload) { - auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); - evicted_slot = value_type{new_key, new_payload}; - } else { - evicted_slot = new_key; - } - val = cuda::std::bit_cast(evicted_slot); - key = this->extract_key(val); - probe_step = new_age; - } - continue; - } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } } @@ -968,15 +686,9 @@ class open_addressing_ref_impl // Key exists, return true if successfully deleted if (eq_res == detail::equal_result::EQUAL) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes - // use the plain erased sentinel. - value_type erased = this->erased_slot_sentinel(); - if constexpr (cuco::is_robin_hood_probing::value) { - erased = this->robin_hood_erased_sentinel( - slot_content, static_cast(*probing_iter + intra_bucket_index)); - } - switch (attempt_insert_stable( - this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, erased)) { + switch (attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + slot_content, + this->erased_slot_sentinel())) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: return false; default: continue; @@ -1025,20 +737,12 @@ class open_addressing_ref_impl auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - auto status = insert_result::CONTINUE; - if (group.thread_rank() == src_lane) { - // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes - // use the plain erased sentinel. - value_type erased = this->erased_slot_sentinel(); - if constexpr (cuco::is_robin_hood_probing::value) { - erased = this->robin_hood_erased_sentinel( - bucket_slots[intra_bucket_index], - static_cast(*probing_iter + intra_bucket_index)); - } - status = attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - bucket_slots[intra_bucket_index], - erased); - } + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + bucket_slots[intra_bucket_index], + this->erased_slot_sentinel()) + : insert_result::CONTINUE; switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; @@ -1073,8 +777,7 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { // TODO atomic_ref::load if insert operator is present @@ -1088,13 +791,6 @@ class open_addressing_ref_impl case detail::equal_result::EQUAL: return true; } } - // Robin Hood: a resident richer than us proves the key is absent. - if constexpr (cuco::is_robin_hood_probing::value) { - if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { - return false; - } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -1120,8 +816,7 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -1139,13 +834,6 @@ class open_addressing_ref_impl if (group.any(state == detail::equal_result::EQUAL)) { return true; } if (group.any(state == detail::equal_result::EMPTY)) { return false; } - // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. - if constexpr (cuco::is_robin_hood_probing::value) { - if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { - return false; - } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -1169,8 +857,7 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { // TODO atomic_ref::load if insert operator is present @@ -1188,13 +875,6 @@ class open_addressing_ref_impl default: continue; } } - // Robin Hood: a resident richer than us proves the key is absent. - if constexpr (cuco::is_robin_hood_probing::value) { - if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { - return this->end(); - } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -1220,8 +900,7 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -1251,13 +930,6 @@ class open_addressing_ref_impl // Find an empty slot, meaning that the probe key isn't present in the container if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } - // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. - if constexpr (cuco::is_robin_hood_probing::value) { - if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { - return this->end(); - } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -1920,104 +1592,6 @@ class open_addressing_ref_impl return storage_ref_.data() + probing_idx + intra_bucket_idx; } - /** - * @brief Determines whether the Robin Hood invariant proves the probe key absent at the current - * probe step. - * - * @note Only meaningful for Robin Hood probing. The key is proven absent when the bucket holds a - * resident that is "richer" than the probe key — i.e. whose own probe distance is smaller than - * the probe key's probe distance at the current step (`probe_step`). Such a resident would have - * been displaced on insertion if the probe key lived here, so the probe key cannot be present. - * - * @note Behavior is only well-defined when every slot in the bucket is occupied (the callers - * reach this check only after ruling out empty and matching slots), since probe distance is - * meaningless for an empty slot. - * - * @tparam BucketSlots Bucket slot array type - * - * @param bucket_slots The slots of the bucket currently being probed - * @param bucket_base The slot index of the first slot in the bucket - * @param probe_step The probe key's own probe distance at the current step - * - * @return True if some resident in the bucket is richer than the probe key - */ - template - [[nodiscard]] __device__ bool robin_hood_proves_absent(BucketSlots const& bucket_slots, - size_type bucket_base, - size_type probe_step) const noexcept - { - bool richer = false; - cuda::static_for([&](auto i) { - auto const resident_age = - this->robin_hood_age(bucket_slots[i()], static_cast(bucket_base + i())); - if (resident_age < probe_step) { richer = true; } - }); - return richer; - } - - /** - * @brief Whether `slot` holds a tombstone (erased marker). - * - * @note Returns false when erase is disabled (the erased and empty sentinels coincide, so no slot - * is a tombstone) -- this keeps the test correct even for empty slots. - * - * @param slot The slot to test - * - * @return True if `slot` is an erased tombstone - */ - [[nodiscard]] __device__ bool is_erased(value_type const& slot) const noexcept - { - return not cuco::detail::bitwise_compare(this->erased_key_sentinel(), - this->empty_key_sentinel()) and - cuco::detail::bitwise_compare(this->extract_key(slot), this->erased_key_sentinel()); - } - - /** - * @brief Robin Hood probe distance ("age") of an occupied slot. - * - * A live key's age is its `probe_distance`. A Robin Hood tombstone keeps the age of the key it - * replaced in its payload (the original key is gone and cannot be rehashed; see `erase`), so it - * is read back here -- a tombstone then participates in every Robin Hood comparison exactly like - * the resident it stood in for. - * - * @param slot The (occupied) slot - * @param slot_index The slot's index - * - * @return The slot's probe distance - */ - [[nodiscard]] __device__ size_type robin_hood_age(value_type const& slot, - size_type slot_index) const noexcept - { - if constexpr (has_payload) { - if (this->is_erased(slot)) { return static_cast(this->extract_payload(slot)); } - } - return probing_scheme_.template probe_distance( - this->extract_key(slot), slot_index, storage_ref_.extent()); - } - - /** - * @brief The Robin Hood tombstone for erasing the live key currently in `slot` at `slot_index`. - * - * The erased key's age is stashed in the payload (1a) so the tombstone keeps its place in the - * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes - * use the plain `erased_slot_sentinel()` and never call this. - * - * @param slot The slot's current (live) contents - * @param slot_index The slot's index - * - * @return The value to CAS into the slot to erase it - */ - [[nodiscard]] __device__ value_type - robin_hood_erased_sentinel(value_type const& slot, size_type slot_index) const noexcept - { - static_assert(has_payload, - "Robin Hood erase requires a mapped payload to store the tombstone age"); - auto const age = probing_scheme_.template probe_distance( - this->extract_key(slot), slot_index, storage_ref_.extent()); - return cuco::pair{this->erased_key_sentinel(), - static_castempty_value_sentinel())>(age)}; - } - /** * @brief Extracts the key from a given value type. * diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 2dbb9bdcb..8bed41778 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -208,83 +208,4 @@ double_hashing::hash_function() const noexcept return {hash1_, hash2_}; } -namespace detail { - -/** - * @brief Robin Hood inverse primitive for the linear probing sequence. - * - * @note Recovers a resident's probe distance ("age") from the slot it occupies: how many probing - * steps the resident sits from its own home bucket. For the linear sequence this is a single - * subtract — `(slot_base - resident_home) / stride mod num_buckets`. This is the linear-only - * overload; a `double_hashing` variant would add its own overload here (a modular inverse of the - * resident's per-key step, or a stored age), which is the single place that change lands. - * - * @tparam BucketSize Size of the bucket - * @tparam CGSize Size of CUDA Cooperative Groups - * @tparam Hash Unary callable type - * @tparam ProbeKey Type of probing key - * @tparam Extent Type of extent - * - * @param scheme The underlying linear probing scheme (supplies the hash function) - * @param resident_key The key currently residing in the slot - * @param slot_index The slot index at which `resident_key` resides - * @param upper_bound Upper bound of the iteration - * @return The resident's probe distance, in probing steps - */ -template -[[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( - linear_probing const& scheme, - ProbeKey resident_key, - typename Extent::value_type slot_index, - Extent upper_bound) noexcept -{ - using size_type = typename Extent::value_type; - size_type constexpr stride = CGSize * BucketSize; - auto const bound = static_cast(upper_bound); - auto const hash = scheme.hash_function(); - - // Home bucket base of the resident, using the same alignment as `make_iterator`. - size_type const resident_home = - cuco::detail::sanitize_hash(hash(resident_key)) % (bound / stride) * stride; - - // Bucket-strided base of the slot the resident currently occupies. The per-lane `thread_rank` - // offset (which is < stride) is stripped by the floor division so that the distance is measured - // in whole probing steps, consistent with the forward sequence. - size_type const slot_base = (slot_index / stride) * stride; - - // (slot_base - resident_home) mod capacity, expressed in probing steps. - return static_cast((slot_base + bound - resident_home) % bound) / stride; -} - -} // namespace detail - -template -__host__ __device__ constexpr robin_hood_probing::robin_hood_probing( - Underlying const& probing) - : Underlying{probing} -{ -} - -template -template -__host__ __device__ constexpr auto robin_hood_probing::rebind_hash_function( - NewHash const& hash) const noexcept -{ - auto const inner = static_cast(*this).rebind_hash_function(hash); - return robin_hood_probing>{inner}; -} - -template -template -__host__ __device__ constexpr typename Extent::value_type -robin_hood_probing::probe_distance(ProbeKey resident_key, - typename Extent::value_type slot_index, - Extent upper_bound) const noexcept -{ - // Delegate to the per-underlying-scheme inverse overload. Only `linear_probing` provides one - // today; wrapping a scheme without a matching overload is a compile-time error. - return cuco::detail::probe_distance( - static_cast(*this), resident_key, slot_index, upper_bound); -} - } // namespace cuco diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index b71a40f9f..7f8ff043b 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -495,7 +495,6 @@ class operator_impl< using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; using mapped_type = T; - using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -516,20 +515,17 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto val = ref_.impl_.heterogeneous_value(value); - auto key = ref_.impl_.extract_key(val); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; - [[maybe_unused]] bool retry = false; - for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -538,69 +534,14 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - // Robin Hood may relocate this key; assign via a full-slot CAS that keeps the key and - // fails if it moved. On failure re-probe and retry -- the loop re-finds the key. - auto desired = slot_content; - desired.second = val.second; - if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == - detail::insert_result::SUCCESS) { - return; - } - retry = true; - break; - } else { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); - return; - } + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); + return; } if (eq_res == detail::equal_result::AVAILABLE) { - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - // Claim only a true empty; a tombstone is handled as a resident by the displacement - // test below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds - // the key and assigns it via the EQUAL full-slot CAS. - if (not ref_.impl_.is_erased(slot_content)) { - if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == - detail::insert_result::SUCCESS) { - return; - } - retry = true; - break; - } - } else { - if (attempt_insert_or_assign(slot_ptr, val)) { return; } - } - } - - // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A - // tombstone is a resident too (age from its payload); picking one up consumes it -- the - // pair lands there and we are done. - if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { - auto const evicted_age = ref_.impl_.robin_hood_age( - slot_content, static_cast(*probing_iter + intra_bucket_index)); - if (evicted_age < probe_step) { - if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == - detail::insert_result::SUCCESS) { - if (ref_.impl_.is_erased(slot_content)) { return; } // consumed tombstone -- done - val = cuda::std::bit_cast(slot_content); - key = ref_.impl_.extract_key(val); - probe_step = evicted_age; - } - retry = true; - break; - } - } + if (attempt_insert_or_assign(slot_ptr, val)) { return; } } } - - if constexpr (cuco::is_robin_hood_probing::value) { - if (retry) { continue; } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -624,14 +565,13 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto val = ref_.impl_.heterogeneous_value(value); - auto key = ref_.impl_.extract_key(val); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; @@ -642,15 +582,6 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); - // Robin Hood: a tombstone is a resident handled by the displacement scan, not - // AVAILABLE. - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - if (res == detail::equal_result::AVAILABLE and - ref_.impl_.is_erased(bucket_slots[i()])) { - res = detail::equal_result::UNEQUAL; - } - } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -664,89 +595,23 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if constexpr (cuco::is_robin_hood_probing::value) { - // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails - // it, so the group re-probes and retries -- the loop re-finds the key. - auto const success = [&, target_idx = intra_bucket_index]() { - if (group.thread_rank() != src_lane) { return false; } - auto desired = bucket_slots[target_idx]; - desired.second = val.second; - return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == - detail::insert_result::SUCCESS; - }(); - if (group.shfl(success, src_lane)) { return; } - continue; - } else { - if (group.thread_rank() == src_lane) { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); - } - group.sync(); - return; + if (group.thread_rank() == src_lane) { + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); } + group.sync(); + return; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; - if constexpr (cuco::is_robin_hood_probing::value) { - // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and - // retry (the loop re-finds the key and assigns via the EQUAL full-slot CAS). - auto const success = [&, target_idx = intra_bucket_index]() { - if (group.thread_rank() != src_lane) { return false; } - return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], val) == - detail::insert_result::SUCCESS; - }(); - if (group.shfl(success, src_lane)) { return; } - continue; - } else { - auto const status = - (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; + auto const status = + (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; - // Exit if inserted or assigned - if (group.shfl(status, src_lane)) { return; } - } + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } } else { - if constexpr (cuco::is_robin_hood_probing::value) { - // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). - cuda::std::int32_t displace_idx = -1; - size_type evicted_age = 0; - cuda::static_for([&] __device__(auto i) { - if (displace_idx < 0) { - // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). - auto const age = ref_.impl_.robin_hood_age( - bucket_slots[i()], static_cast(*probing_iter + i())); - if (age < probe_step) { - displace_idx = i(); - evicted_age = age; - } - } - }); - - auto const group_displaceable = group.ballot(displace_idx >= 0); - if (group_displaceable) { - auto const src_lane = __ffs(group_displaceable) - 1; - auto status = detail::insert_result::CONTINUE; - value_type evicted = ref_.impl_.empty_slot_sentinel(); - if (group.thread_rank() == src_lane) { - evicted = bucket_slots[displace_idx]; - status = ref_.impl_.attempt_insert( - ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); - } - if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { - // Consuming a tombstone reuses its slot -- nothing to carry, so we are done. - if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return; } - auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); - auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); - auto const new_age = group.shfl(evicted_age, src_lane); - val = cuda::std::bit_cast(value_type{new_key, new_payload}); - key = ref_.impl_.extract_key(val); - probe_step = new_age; - } - continue; - } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -805,7 +670,6 @@ class operator_impl< using ref_type = static_map_ref; using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; - using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -1022,15 +886,14 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto val = ref_.impl_.heterogeneous_value(value); - auto key = ref_.impl_.extract_key(val); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -1038,8 +901,6 @@ class operator_impl< while (true) { auto const bucket_slots = storage_ref[*probing_iter]; - [[maybe_unused]] bool retry = false; - for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -1048,46 +909,18 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - // Lift `op` to the whole slot, keeping the key, and CAS it. A relocation (or a rival - // update) makes the CAS fail; re-probe and retry -- the loop re-finds the key. - auto desired = slot_content; - // `desired` is a local copy, so this `op` is just local arithmetic -- the - // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature - // requires one. The real atomic is the full-slot CAS below. - op(cuda::atomic_ref{desired.second}, val.second); - if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == - detail::insert_result::SUCCESS) { - return false; - } - retry = true; - break; - } else { - // wait for payload only when performing insert operation - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); - } - op(cuda::atomic_ref{slot_ptr->second}, val.second); - return false; + // wait for payload only when performing insert operation + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); } + op(cuda::atomic_ref{slot_ptr->second}, val.second); + return false; } - // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the - // displacement test below. - if (eq_res == detail::equal_result::AVAILABLE and - not(cuco::is_robin_hood_probing::value and - ref_.impl_.is_erased(slot_content))) { + if (eq_res == detail::equal_result::AVAILABLE) { switch (ref_.template attempt_insert_or_apply( slot_ptr, slot_content, val, op)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - // Key is present now; re-probe so it is found EQUAL and updated via the full-slot - // CAS. - retry = true; - break; - } // wait for payload only when performing insert operation if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); @@ -1095,49 +928,10 @@ class operator_impl< op(cuda::atomic_ref{slot_ptr->second}, val.second); return false; } - case insert_result::CONTINUE: { - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - retry = true; - break; - } else { - continue; - } - } - } - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - if (retry) { break; } - } - } - - // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A - // tombstone is a resident too (age from its payload); picking one up consumes it -- the - // in-flight pair lands there, completing the insert. - if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { - auto const evicted_age = ref_.impl_.robin_hood_age( - slot_content, static_cast(*probing_iter + intra_bucket_index)); - if (evicted_age < probe_step) { - if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == - detail::insert_result::SUCCESS) { - // Consuming a tombstone places the in-flight pair in its slot -- insert complete. - if (ref_.impl_.is_erased(slot_content)) { return true; } - val = cuda::std::bit_cast(slot_content); - key = ref_.impl_.extract_key(val); - probe_step = evicted_age; - } - retry = true; - break; - } + default: continue; } } } - - if constexpr (cuco::is_robin_hood_probing::value) { - if (retry) { continue; } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -1170,15 +964,14 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto val = ref_.impl_.heterogeneous_value(value); - auto key = ref_.impl_.extract_key(val); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); - [[maybe_unused]] size_type probe_step = 0; + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -1192,15 +985,6 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); - // Robin Hood: a tombstone is a resident handled by the displacement scan, not - // AVAILABLE. - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - if (res == detail::equal_result::AVAILABLE and - ref_.impl_.is_erased(bucket_slots[i()])) { - res = detail::equal_result::UNEQUAL; - } - } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -1214,30 +998,13 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if constexpr (cuco::is_robin_hood_probing::value) { - // src_lane lifts `op` to the slot (key fixed) and CASes it; a relocation or rival update - // fails the CAS, so the group re-probes and retries -- the loop re-finds the key. - auto const success = [&, target_idx = intra_bucket_index]() { - if (group.thread_rank() != src_lane) { return false; } - auto desired = bucket_slots[target_idx]; - // `desired` is a local copy, so this `op` is just local arithmetic -- the - // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature - // requires one. The real atomic is the full-slot CAS below. - op(cuda::atomic_ref{desired.second}, val.second); - return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == - detail::insert_result::SUCCESS; - }(); - if (group.shfl(success, src_lane)) { return false; } - continue; - } else { - if (group.thread_rank() == src_lane) { - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); - } - op(cuda::atomic_ref{slot_ptr->second}, val.second); + if (group.thread_rank() == src_lane) { + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); } - return false; + op(cuda::atomic_ref{slot_ptr->second}, val.second); } + return false; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); @@ -1252,10 +1019,6 @@ class operator_impl< switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { - continue; // key present now: re-probe, find it EQUAL, apply via the full-slot CAS - } if (group.thread_rank() == src_lane) { if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); @@ -1267,46 +1030,6 @@ class operator_impl< default: continue; } } else { - if constexpr (cuco::is_robin_hood_probing::value) { - // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). - cuda::std::int32_t displace_idx = -1; - size_type evicted_age = 0; - cuda::static_for([&] __device__(auto i) { - if (displace_idx < 0) { - // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). - auto const age = ref_.impl_.robin_hood_age( - bucket_slots[i()], static_cast(*probing_iter + i())); - if (age < probe_step) { - displace_idx = i(); - evicted_age = age; - } - } - }); - - auto const group_displaceable = group.ballot(displace_idx >= 0); - if (group_displaceable) { - auto const src_lane = __ffs(group_displaceable) - 1; - auto status = detail::insert_result::CONTINUE; - value_type evicted = ref_.impl_.empty_slot_sentinel(); - if (group.thread_rank() == src_lane) { - evicted = bucket_slots[displace_idx]; - status = ref_.impl_.attempt_insert( - ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); - } - if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { - // Consuming a tombstone places the in-flight pair in its slot -- insert complete. - if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return true; } - auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); - auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); - auto const new_age = group.shfl(evicted_age, src_lane); - val = cuda::std::bit_cast(value_type{new_key, new_payload}); - key = ref_.impl_.extract_key(val); - probe_step = new_age; - } - continue; - } - ++probe_step; - } ++probing_iter; if (*probing_iter == init_idx) { return false; } } diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index 3bf8b98c2..c809794dc 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -209,87 +209,6 @@ class double_hashing : private detail::probing_scheme_base { Hash2 hash2_; }; -/** - * @brief Public Robin Hood probing scheme class. - * - * @note Robin Hood probing wraps an underlying probe sequence (e.g. `cuco::linear_probing`) and - * pairs it with the Robin Hood invariant: on insert, an in-flight key displaces any resident that - * sits closer to its own home than the in-flight key is to its home, and the displaced resident is - * then re-inserted from that point onward. This keeps probe lengths tightly distributed, which is - * especially valuable on GPUs where a tile's tail latency is set by its longest probe. - * - * @note This class is a thin decorator over `Underlying`. It forwards the forward probe sequence - * (`make_iterator`, `hash_function`) unchanged and contributes the `cuco::is_robin_hood_probing` - * trait that selects the displacement (insert) and early-termination (find) control flow in the - * open-addressing ref implementation. The invariant's one extra requirement — the inverse - * primitive `probe_distance`, which recovers a resident's probe distance ("age") from the slot it - * occupies — is delegated to `cuco::detail::probe_distance`, which is overloaded per underlying - * scheme. Only `cuco::linear_probing` provides that overload today; a `cuco::double_hashing` - * variant would compose simply by adding a matching `cuco::detail::probe_distance` overload, with - * no change to this class or the ref implementation. - * - * @tparam Underlying The wrapped probe-sequence scheme (e.g. `cuco::linear_probing`) - */ -template -class robin_hood_probing : private Underlying { - public: - using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) - using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) - using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys - using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence - - /** - * @brief Constructs a Robin Hood probing scheme wrapping the given underlying scheme. - * - * @param probing The underlying probe-sequence scheme to wrap - */ - __host__ __device__ constexpr robin_hood_probing(Underlying const& probing = {}); - - /** - * @brief Makes a copy of the current probing method with the given hasher. - * - * @note Forwards to the underlying scheme's `rebind_hash_function` and re-wraps the result so the - * returned scheme is again a `robin_hood_probing`. - * - * @tparam NewHash New hasher type - * - * @param hash Hasher - * - * @return Copy of the current probing method - */ - template - [[nodiscard]] __host__ __device__ constexpr auto rebind_hash_function( - NewHash const& hash) const noexcept; - - /** - * @brief Computes the probe distance ("age") of a resident key. - * - * @note This is the inverse of the probe sequence: given a resident key and the slot index at - * which it currently lives, it returns how many probing steps that resident is from its own home - * bucket. The Robin Hood insert/find logic compares this against the in-flight key's own probe - * distance to decide displacement (insert) or early termination (find). - * - * @note Delegates to the `cuco::detail::probe_distance` overload for `Underlying`. Instantiating - * this for an `Underlying` without such an overload (e.g. `cuco::double_hashing` today) is a - * compile-time error — that is the single seam where a new underlying scheme would supply its own - * inverse. - * - * @tparam BucketSize Size of the bucket - * @tparam ProbeKey Type of probing key - * @tparam Extent Type of extent - * - * @param resident_key The key currently residing in the slot - * @param slot_index The slot index at which `resident_key` resides - * @param upper_bound Upper bound of the iteration - * @return The resident's probe distance, in probing steps - */ - template - [[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( - ProbeKey resident_key, - typename Extent::value_type slot_index, - Extent upper_bound) const noexcept; -}; - /** * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not * @@ -308,22 +227,6 @@ struct is_double_hashing : cuda::std::false_type {}; template struct is_double_hashing> : cuda::std::true_type {}; -/** - * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not - * - * @tparam T Input probing scheme type - */ -template -struct is_robin_hood_probing : cuda::std::false_type {}; - -/** - * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not - * - * @tparam Underlying The wrapped probe-sequence scheme - */ -template -struct is_robin_hood_probing> : cuda::std::true_type {}; - } // namespace cuco #include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 07465b86e..51d4f42c1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -89,7 +89,6 @@ ConfigureTest(STATIC_MAP_TEST static_map/insert_or_assign_test.cu static_map/insert_or_apply_test.cu static_map/key_sentinel_test.cu - static_map/probing_scheme_invariants_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu static_map/rehash_test.cu diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index 9c1312482..0b3604528 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -29,7 +29,6 @@ #include #include -#include using size_type = int32_t; @@ -80,12 +79,6 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); - // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. - if constexpr (cuco::is_robin_hood_probing< - typename std::decay_t::probing_scheme_type>::value) { - cuco::test::check_robin_hood_invariant(map); - } - SECTION("All inserted keys should be contained.") { REQUIRE(map.count(keys_begin, keys_begin + num_keys) == num_keys); @@ -138,19 +131,12 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), - // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are - // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { @@ -159,12 +145,9 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::double_hashing, - cuco::double_hashing, cuco::murmurhash3_32>, - std::conditional_t< - Probe == cuco::test::probe_sequence::robin_hood, - cuco::robin_hood_probing>>, - cuco::linear_probing>>>; + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; auto map = cuco::static_map #include -#include #include #include #include #include -#include using size_type = int32_t; @@ -104,34 +102,21 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), - // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are - // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot - // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are - // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would - // livelock on an occupied slot. - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { constexpr size_type num_keys{1'000'000}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::double_hashing, - cuco::double_hashing, cuco::murmurhash3_32>, - std::conditional_t< - Probe == cuco::test::probe_sequence::robin_hood, - cuco::robin_hood_probing>>, - cuco::linear_probing>>>; + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; auto map = cuco::static_map>>; - using map_type = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>; - - constexpr size_type capacity = static_cast(num_keys / 0.85); - auto map = map_type{ - capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - - // Keys 1..num_keys (avoid the -1 / -2 sentinels). - auto keys_begin = cuda::counting_iterator(1); - auto pairs_begin = cuda::make_transform_iterator( - keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { - return cuco::pair{k, static_cast(k)}; - })); - - map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.size() == num_keys); - - constexpr size_type num_erased = num_keys / 2; - map.erase(keys_begin, keys_begin + num_erased); - REQUIRE(map.size() == num_keys - num_erased); - - thrust::device_vector d_contained(num_keys); - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::none_of( - d_contained.begin(), d_contained.begin() + num_erased, cuda::std::identity{})); - REQUIRE( - cuco::test::all_of(d_contained.begin() + num_erased, d_contained.end(), cuda::std::identity{})); -} - -// Robin Hood erase reuse + structural invariant at high load: insert, erase half (-> tombstones), -// re-insert (-> consume tombstones), checking the per-bucket Robin Hood layout invariant after each -// step (tombstones counted as residents, age read from their payload). A wrongly consumed tombstone -// (the age inversion) corrupts the layout and trips the invariant; a lost/duplicate key trips -// size/contains. -TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", - "", - ((typename Key, typename Value, int CGSize), Key, Value, CGSize), - (int32_t, int32_t, 1), - (int32_t, int32_t, 2) -#if defined(CUCO_HAS_128BIT_ATOMICS) - , - (int64_t, int64_t, 1), - (int64_t, int64_t, 2) -#endif -) -{ - constexpr size_type num_keys = 10'000; - - using probe = cuco::robin_hood_probing>>; - using map_type = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>; - - constexpr size_type capacity = static_cast(num_keys / 0.85); - auto map = map_type{ - capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - - auto keys_begin = cuda::counting_iterator(1); - auto pairs_begin = cuda::make_transform_iterator( - keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { - return cuco::pair{k, static_cast(k)}; - })); - - map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.size() == num_keys); - cuco::test::check_robin_hood_invariant(map); - - constexpr size_type num_erased = num_keys / 2; - map.erase(keys_begin, keys_begin + num_erased); - REQUIRE(map.size() == num_keys - num_erased); - cuco::test::check_robin_hood_invariant(map); // tombstones-as-residents layout still valid - - map.insert(pairs_begin, pairs_begin + num_erased); // consume tombstones / fill empties - REQUIRE(map.size() == num_keys); - cuco::test::check_robin_hood_invariant(map); // layout valid after reuse - - thrust::device_vector d_contained(num_keys); - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); -} - -namespace { -enum class reinsert_via { insert_or_assign, insert_or_apply, insert_and_find }; - -// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the -// first half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume -// tombstones) and check invariant + that every key is present. Exercises the tombstone path of the -// chosen insert variant. -template -void test_rh_erase_reuse(size_type num_keys, reinsert_via how) -{ - using probe = cuco::robin_hood_probing>>; - using map_type = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>; - - auto const capacity = static_cast(num_keys / 0.85); - auto map = map_type{ - capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - - auto keys_begin = cuda::counting_iterator(1); - auto pairs_begin = cuda::make_transform_iterator( - keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { - return cuco::pair{k, static_cast(k)}; - })); - - map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.size() == num_keys); - cuco::test::check_robin_hood_invariant(map); - - auto const num_erased = num_keys / 2; - map.erase(keys_begin, keys_begin + num_erased); - REQUIRE(map.size() == num_keys - num_erased); - cuco::test::check_robin_hood_invariant(map); - - // re-insert the erased half through the variant under test (consuming tombstones) - switch (how) { - case reinsert_via::insert_or_assign: - map.insert_or_assign(pairs_begin, pairs_begin + num_erased); - break; - case reinsert_via::insert_or_apply: - map.insert_or_apply(pairs_begin, pairs_begin + num_erased, cuco::reduce::plus{}); - break; - case reinsert_via::insert_and_find: { - thrust::device_vector found(num_erased); - thrust::device_vector inserted(num_erased); - map.insert_and_find(pairs_begin, pairs_begin + num_erased, found.begin(), inserted.begin()); - break; - } - } - - REQUIRE(map.size() == num_keys); - cuco::test::check_robin_hood_invariant(map); - - thrust::device_vector d_contained(num_keys); - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); -} -} // namespace - -TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_assign reuse", - "", - ((typename Key, typename Value, int CGSize), Key, Value, CGSize), - (int32_t, int32_t, 1), - (int32_t, int32_t, 2) -#if defined(CUCO_HAS_128BIT_ATOMICS) - , - (int64_t, int64_t, 1), - (int64_t, int64_t, 2) -#endif -) -{ - test_rh_erase_reuse(10'000, reinsert_via::insert_or_assign); -} - -TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_apply reuse", - "", - ((typename Key, typename Value, int CGSize), Key, Value, CGSize), - (int32_t, int32_t, 1), - (int32_t, int32_t, 2) -#if defined(CUCO_HAS_128BIT_ATOMICS) - , - (int64_t, int64_t, 1), - (int64_t, int64_t, 2) -#endif -) -{ - test_rh_erase_reuse(10'000, reinsert_via::insert_or_apply); -} - -TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_and_find reuse", - "", - ((typename Key, typename Value, int CGSize), Key, Value, CGSize), - (int32_t, int32_t, 1), - (int32_t, int32_t, 2) -#if defined(CUCO_HAS_128BIT_ATOMICS) - , - (int64_t, int64_t, 1), - (int64_t, int64_t, 2) -#endif -) -{ - test_rh_erase_reuse(10'000, reinsert_via::insert_and_find); -} diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index 2dda5e89f..5d9376309 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -30,7 +30,6 @@ #include #include -#include using size_type = int32_t; @@ -74,12 +73,6 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); - // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout - // invariant (a no-op for linear/double hashing). - if constexpr (cuco::is_robin_hood_probing::value) { - cuco::test::check_robin_hood_invariant(map); - } - SECTION("All inserted keys should be correctly recovered during find") { map.find(keys_begin, keys_begin + num_keys, d_results.begin()); @@ -170,29 +163,12 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), - // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are - // unconditional: RH displacement swaps into an *occupied* slot, which requires a packed CAS of - // the whole slot. The wider-slot RH rows therefore live under CUCO_HAS_128BIT_ATOMICS below - // (LP/DH avoid this because they only ever CAS into empty slots via back-to-back CAS). - (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 1), - (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 2), - (int8_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), - (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 1), - (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot - // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are - // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would - // livelock on an occupied slot. - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { @@ -201,12 +177,9 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::double_hashing, - cuco::double_hashing, cuco::murmurhash3_32>, - std::conditional_t< - Probe == cuco::test::probe_sequence::robin_hood, - cuco::robin_hood_probing>>, - cuco::linear_probing>>>; + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; constexpr size_type gold_capacity = [&]() { if constexpr (cuco::is_double_hashing::value) { diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index be3c55a45..17665eb2b 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -26,7 +26,6 @@ #include #include -#include using size_type = std::size_t; @@ -53,22 +52,12 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), - // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are - // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot - // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are - // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would - // livelock on an occupied slot. - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { @@ -77,12 +66,9 @@ TEMPLATE_TEST_CASE_SIG( #endif { using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::double_hashing, - cuco::double_hashing, cuco::murmurhash3_32>, - std::conditional_t< - Probe == cuco::test::probe_sequence::robin_hood, - cuco::robin_hood_probing>>, - cuco::linear_probing>>>; + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; constexpr size_type num_keys{400}; @@ -114,12 +100,6 @@ TEMPLATE_TEST_CASE_SIG( map.insert_and_find(pairs_begin, pairs_begin + num_keys, found2.begin(), inserted.begin()); REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); - // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. - if constexpr (cuco::is_robin_hood_probing< - typename std::decay_t::probing_scheme_type>::value) { - cuco::test::check_robin_hood_invariant(map); - } - // both found1 and found2 should be same, as keys will be referring to same slot REQUIRE( cuco::test::equal(found1.begin(), found1.end(), found2.begin(), cuda::std::equal_to{})); diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 692e7af31..0a4d07ea3 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -27,7 +27,6 @@ #include #include -#include #include @@ -55,12 +54,6 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key map.insert_or_apply(pairs_begin, pairs_begin + num_keys, plus_op); } - // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. - if constexpr (cuco::is_robin_hood_probing< - typename std::decay_t::probing_scheme_type>::value) { - cuco::test::check_robin_hood_invariant(map); - } - REQUIRE(map.size() == num_unique_keys); thrust::device_vector d_keys(num_unique_keys); @@ -168,23 +161,12 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), - // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are - // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { @@ -192,12 +174,9 @@ TEMPLATE_TEST_CASE_SIG( constexpr size_type num_unique_keys{100}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::double_hashing, - cuco::double_hashing, cuco::murmurhash3_32>, - std::conditional_t< - Probe == cuco::test::probe_sequence::robin_hood, - cuco::robin_hood_probing>>, - cuco::linear_probing>>>; + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; using map_type = cuco::static_map(map, num_keys, num_unique_keys, static_cast(0)); } } - -// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled -// upstream, so this is the only active exercise of the displacing RH insert_or_apply (reduction + -// lock-free displacement together). It runs at a high load factor (~0.95 on the unique keys) so -// displacement actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural -// RH invariant. -TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", - "", - ((typename Key, typename Value, int CGSize), Key, Value, CGSize), - (int32_t, int32_t, 1), - (int32_t, int32_t, 2) -#if defined(CUCO_HAS_128BIT_ATOMICS) - , - (int64_t, int64_t, 1), - (int64_t, int64_t, 2) -#endif -) -{ - constexpr size_type num_unique_keys = 5'000; - constexpr size_type num_keys = 10'000; // each unique key inserted twice - - using probe = cuco::robin_hood_probing>>; - using map_type = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>; - - // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing - // insert path (and the structural invariant check inside the helper) is genuinely stressed. - constexpr size_type capacity = static_cast(num_unique_keys / 0.95); - - SECTION("sentinel equals init; has_init = true") - { - auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; - test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); - } - SECTION("sentinel equals init; has_init = false") - { - auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; - test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); - } -} diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 843417072..2cccd62ad 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -21,15 +21,12 @@ #include #include -#include #include #include #include -#include #include #include -#include using size_type = std::size_t; @@ -56,12 +53,6 @@ void test_insert_or_assign(Map& map, size_type num_keys) map.insert_or_assign(query_pairs_begin, query_pairs_begin + num_keys); - // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. - if constexpr (cuco::is_robin_hood_probing< - typename std::decay_t::probing_scheme_type>::value) { - cuco::test::check_robin_hood_invariant(map); - } - auto const updated_size = map.size(); // all keys are present in the map so the size shouldn't change REQUIRE(updated_size == initial_size); @@ -102,34 +93,21 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), - // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are - // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot - // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are - // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would - // livelock on an occupied slot. - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { constexpr size_type num_keys{400}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::double_hashing, - cuco::double_hashing, cuco::murmurhash3_32>, - std::conditional_t< - Probe == cuco::test::probe_sequence::robin_hood, - cuco::robin_hood_probing>>, - cuco::linear_probing>>>; + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; auto map = cuco::static_mapkey value swap from a misplaced assign). -TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_assign (high load)", - "", - ((typename Key, typename Value, int CGSize), Key, Value, CGSize), - (int32_t, int32_t, 1), - (int32_t, int32_t, 2) -#if defined(CUCO_HAS_128BIT_ATOMICS) - , - (int64_t, int64_t, 1), - (int64_t, int64_t, 2) -#endif -) -{ - constexpr size_type num_unique_keys = 5'000; - constexpr size_type num_keys = 10'000; // each unique key assigned twice (same value) - - using probe = cuco::robin_hood_probing>>; - using map_type = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>; - - // ~0.95 load on the unique keys, so the table is nearly full and displacement is stressed. - constexpr size_type capacity = static_cast(num_unique_keys / 0.95); - auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{-1}}; - - // Every occurrence of key k carries the same value (k * 2). - auto pairs_begin = cuda::make_transform_iterator( - cuda::counting_iterator(0), - cuda::proclaim_return_type>([num_unique_keys] __device__(auto i) { - auto const k = static_cast(i % num_unique_keys); - return cuco::pair{k, static_cast(k * 2)}; - })); - - map.insert_or_assign(pairs_begin, pairs_begin + num_keys); - - REQUIRE(map.size() == num_unique_keys); - cuco::test::check_robin_hood_invariant(map); - - thrust::device_vector d_keys(num_unique_keys); - thrust::device_vector d_values(num_unique_keys); - map.retrieve_all(d_keys.begin(), d_values.begin()); - - auto const zip = thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); - REQUIRE(cuco::test::all_of( - zip, zip + num_unique_keys, cuda::proclaim_return_type([] __device__(auto const& p) { - return cuda::std::get<1>(p) == static_cast(cuda::std::get<0>(p) * 2); - }))); -} diff --git a/tests/static_map/probing_scheme_invariants_test.cu b/tests/static_map/probing_scheme_invariants_test.cu deleted file mode 100644 index 277ec1921..000000000 --- a/tests/static_map/probing_scheme_invariants_test.cu +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2026, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include -#include - -#include - -#include -#include - -namespace { - -// Identity hash. `cuco::detail::sanitize_hash` is `to_positive`, so for a non-negative key `k` this -// puts its home bucket at slot `k % capacity` — letting the test hand-craft an exact Robin Hood -// layout instead of reverse-engineering a real hash function. -template -struct identity_hash { - __host__ __device__ constexpr Key operator()(Key key) const noexcept { return key; } -}; - -} // namespace - -// Validates the Robin Hood read-path early-exit (`find` / `contains`) against a hand-seeded layout, -// before the displacing `insert` exists. A scalar (cg_size == 1) Robin Hood map over a linear probe -// sequence is seeded directly through the storage pointer with a known-valid Robin Hood cluster, -// then queried with keys chosen to exercise each lookup-termination rule. -TEST_CASE("static_map robin_hood read-path early-exit", "") -{ - using Key = std::int32_t; - using Value = std::int32_t; - using size_type = std::int32_t; - auto constexpr capacity = size_type{16}; - - using extent_type = cuco::extent; - using probe_type = cuco::robin_hood_probing>>; - using map_type = cuco::static_map, - probe_type, - cuco::cuda_allocator, - cuco::storage<1>>; - using value_type = typename map_type::value_type; // cuco::pair - - auto map = map_type{extent_type{}, cuco::empty_key{-1}, cuco::empty_value{-1}}; - // Ensure the constructor's slot initialization (empty sentinels) has completed before we seed. - REQUIRE(cudaDeviceSynchronize() == cudaSuccess); - - // Hand-seed a valid Robin Hood layout (identity hash => home(k) = k % capacity): - // slot 0: key 0 home 0, distance 0 - // slot 1: key 16 home 0, distance 1 (displaced past key 0) - // slot 2: key 2 home 2, distance 0 - // slots 3..15: empty - // This is exactly what inserting {0, 16, 2} would produce, so it satisfies the Robin Hood - // invariant. We only write the three occupied (and contiguous) slots; the rest stay empty. - // The ref is used purely to reach the storage pointer; the bulk queries below build their own. - auto const ref = map.ref(cuco::op::find); - std::vector const seed{value_type{0, 0}, value_type{16, 16}, value_type{2, 2}}; - REQUIRE(cudaMemcpy(ref.storage_ref().data(), - seed.data(), - seed.size() * sizeof(value_type), - cudaMemcpyHostToDevice) == cudaSuccess); - - // Probe keys chosen to exercise every lookup-termination rule: - // 0 : present, found immediately at its home (distance 0). - // 16 : present at distance 1 — found only if we do NOT early-exit at slot 0, where the resident - // distance (0) equals our probe step (0). Guards the strict `<` (vs `<=`) richer rule. - // 2 : present at its home. - // 32 : home 0, absent — terminates via the richer-resident early-exit at slot 2 (key 2 sits at - // distance 0 < our probe step 2), before reaching the empty slot 3. - // 1 : home 1, absent — also terminates via the early-exit at slot 2. - // 3 : home 3, absent — terminates on the empty slot 3. - std::vector const probe_keys{0, 16, 2, 32, 1, 3}; - std::vector const expected_contained{true, true, true, false, false, false}; - - thrust::device_vector const d_keys(probe_keys.begin(), probe_keys.end()); - - thrust::device_vector d_contained(probe_keys.size()); - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - for (std::size_t i = 0; i < probe_keys.size(); ++i) { - INFO("contains, probe key = " << probe_keys[i]); - REQUIRE(static_cast(d_contained[i]) == expected_contained[i]); - } - - // `find` must return the stored value for present keys and the empty-value sentinel otherwise. - thrust::device_vector d_values(probe_keys.size()); - map.find(d_keys.begin(), d_keys.end(), d_values.begin()); - std::vector const expected_values{0, 16, 2, -1, -1, -1}; - for (std::size_t i = 0; i < probe_keys.size(); ++i) { - INFO("find, probe key = " << probe_keys[i]); - REQUIRE(d_values[i] == expected_values[i]); - } -} diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh deleted file mode 100644 index 34165435c..000000000 --- a/tests/static_map/robin_hood_invariant.cuh +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2026, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include - -namespace cuco { -namespace test { - -// Per-probe-step Robin Hood layout check. The unit is the *stride group* of `cg_size * bucket_size` -// contiguous slots that one probing step examines -- a single bucket for scalar probing, the whole -// cooperative-group window for CG probing. Within a stride group the slot order is free (the probe -// step distance is identical for every slot in it, so the intra-group offset cancels in all -// comparisons), so the invariant is only meaningful *between* groups. For each occupied group `g` -// (with predecessor `pg`), the resident probe-step distances ("ages") must satisfy: -// -// (1) Contiguity. If `g` holds any overflowed resident (distance >= 1), `pg` must be full -- -// otherwise that resident would have stopped in `pg`'s free slot instead of probing past it. -// (2) Balance. No resident of `pg` may be more than one probing step *richer* than the poorest -// resident of `g` (`min_age(pg) >= max_age(g) - 1`) -- otherwise the poorest resident of `g` -// should have displaced it. This is the property that distinguishes Robin Hood from plain -// linear probing, and (via condition 1) it inductively forces the whole home-to-position run -// to be full. -// -// `probe_distance` is reused here -- it is exercised independently by the utility probe-distance -// test, so a bug in *insert* (a layout that violates the invariant) is still caught. -template -__global__ void robin_hood_invariant_kernel(Ref ref, int* violations) -{ - using size_type = typename Ref::size_type; - constexpr int bs = Ref::bucket_size; - constexpr int stride = Ref::cg_size * Ref::bucket_size; - auto const storage_ref = ref.storage_ref(); - auto const slots = storage_ref.data(); - auto const num_groups = storage_ref.capacity() / stride; - auto const extent = storage_ref.extent(); - auto const empty_key = ref.empty_key_sentinel(); - auto const erased_key = ref.erased_key_sentinel(); - auto const scheme = ref.probing_scheme(); - - for (size_type g = blockIdx.x * blockDim.x + threadIdx.x; g < num_groups; - g += gridDim.x * blockDim.x) { - int occupied_g = 0; - size_type max_age_g = 0; - for (int s = 0; s < stride; ++s) { - auto const slot = slots[g * stride + s]; - if (slot.first != empty_key) { // tombstones count as residents (erase enabled => != empty) - ++occupied_g; - // A tombstone keeps its age in its payload; a live key's age is its probe distance. - auto const age = (slot.first == erased_key) - ? static_cast(slot.second) - : scheme.template probe_distance( - slot.first, static_cast(g * stride + s), extent); - if (age > max_age_g) { max_age_g = age; } - } - } - if (occupied_g == 0) { continue; } - - size_type const pg = (g + num_groups - 1) % num_groups; - int occupied_p = 0; - size_type min_age_p = 0; - for (int s = 0; s < stride; ++s) { - auto const slot = slots[pg * stride + s]; - if (slot.first != empty_key) { - auto const age = (slot.first == erased_key) - ? static_cast(slot.second) - : scheme.template probe_distance( - slot.first, static_cast(pg * stride + s), extent); - if (occupied_p == 0 || age < min_age_p) { min_age_p = age; } - ++occupied_p; - } - } - - if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) - if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) - } -} - -// Asserts that a populated Robin Hood `map` satisfies the per-bucket layout invariant above. No-op -// to call only on Robin Hood maps -- `probe_distance` exists only on `robin_hood_probing`, so guard -// the call site with `cuco::is_robin_hood_probing<...>`. -template -void check_robin_hood_invariant(Map& map) -{ - auto const ref = map.ref(cuco::op::find); - - thrust::device_vector d_violations(1, 0); - auto constexpr block_size = 128; - auto const grid_size = (map.capacity() + block_size - 1) / block_size; - robin_hood_invariant_kernel<<>>( - ref, thrust::raw_pointer_cast(d_violations.data())); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); - - REQUIRE(d_violations[0] == 0); -} - -} // namespace test -} // namespace cuco diff --git a/tests/test_utils.hpp b/tests/test_utils.hpp index d71336f8b..2c77eda36 100644 --- a/tests/test_utils.hpp +++ b/tests/test_utils.hpp @@ -33,7 +33,7 @@ namespace cg = cooperative_groups; constexpr int32_t block_size = 128; -enum class probe_sequence { linear_probing, double_hashing, robin_hood }; +enum class probe_sequence { linear_probing, double_hashing }; // User-defined logical algorithms to reduce compilation time template diff --git a/tests/utility/probing_scheme_test.cu b/tests/utility/probing_scheme_test.cu index 97f26dc18..39048946b 100644 --- a/tests/utility/probing_scheme_test.cu +++ b/tests/utility/probing_scheme_test.cu @@ -78,37 +78,6 @@ __global__ void generate_cg_probing_sequence(Key key, } } -// Walks each lane's probe iterator and records, at every step, the probe distance reported for the -// slot that lane is visiting. Because the resident under test is `key` itself, the slot visited at -// step `i` is at probe distance `i` from `key`'s home — for every lane. Recording one column per -// lane lets the host check both that `probe_distance` inverts `make_iterator` and that it strips -// the per-lane intra-stride offset (so all lanes at a given step agree). -template -__global__ void generate_cg_probe_distance_sequence(Key key, - Extent upper_bound, - size_t seq_length, - OutputIt out_seq) -{ - auto constexpr cg_size = ProbingScheme::cg_size; - - auto const tid = blockIdx.x * blockDim.x + threadIdx.x; - auto probing_scheme = ProbingScheme{}; - - if (tid < cg_size) { - auto const tile = - cooperative_groups::tiled_partition( - cooperative_groups::this_thread_block()); - - auto iter = probing_scheme.template make_iterator(tile, key, upper_bound); - - for (size_t i = 0; i < seq_length; ++i) { - out_seq[i * cg_size + tile.thread_rank()] = - probing_scheme.template probe_distance(key, *iter, upper_bound); - ++iter; - } - } -} - TEMPLATE_TEST_CASE_SIG( "utility probing_scheme tests", "", @@ -142,44 +111,3 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(cuco::test::equal( scalar_seq.begin(), scalar_seq.end(), cg_seq.begin(), cuda::std::equal_to{})); } - -TEMPLATE_TEST_CASE_SIG( - "utility robin_hood probe_distance inverts make_iterator", - "", - ((typename Key, int32_t CGSize, int32_t BucketSize), Key, CGSize, BucketSize), - (int32_t, 1, 1), - (int32_t, 4, 1), - (int32_t, 8, 1), - (int32_t, 8, 2), - (int64_t, 4, 1), - (int64_t, 8, 2)) -{ - // Robin Hood wraps a linear probe sequence; `probe_distance` is its inverse. For `key`'s own - // probe sequence, the slot visited at step `i` must report probe distance `i`. - using probe = - cuco::robin_hood_probing>>; - - // A deliberately small capacity, so the probe sequence wraps around the table within - // `seq_length` steps — this exercises the modular-subtraction (wrap) path in `probe_distance`. - auto const upper_bound = - cuco::make_valid_extent>(cuco::extent{64}); - - // Probe distance is measured in whole probing steps and lives in `[0, num_buckets)`, where one - // step spans the full `cg_size * bucket_size` stride. Taking `seq_length` past `num_buckets` - // guarantees the walk wraps at least once. - auto const capacity = static_cast(upper_bound); - auto const num_buckets = capacity / (CGSize * BucketSize); - auto const seq_length = num_buckets + 3; - constexpr Key key{42}; - - thrust::device_vector distances(seq_length * CGSize); - generate_cg_probe_distance_sequence - <<<1, CGSize>>>(key, upper_bound, seq_length, distances.begin()); - - // Under wrap, the slot visited at step `i` sits at probe distance `i mod num_buckets`. - for (std::size_t i = 0; i < seq_length; ++i) { - for (std::int32_t r = 0; r < CGSize; ++r) { - REQUIRE(distances[i * CGSize + r] == i % num_buckets); - } - } -} From 10af0f963646ea88437f2aa2898866964d04951b Mon Sep 17 00:00:00 2001 From: aterenin Date: Thu, 25 Jun 2026 14:51:19 -0400 Subject: [PATCH 13/15] Copy baseline implementation. --- .../robin_hood/open_addressing_ref_impl.cuh | 1962 +++++++++++++++++ 1 file changed, 1962 insertions(+) create mode 100644 include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh diff --git a/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh new file mode 100644 index 000000000..1eecb70f4 --- /dev/null +++ b/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh @@ -0,0 +1,1962 @@ +/* + * Copyright (c) 2023-2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CUCO_HAS_CUDA_BARRIER) +#include +#endif + +#include + +namespace cuco { +namespace detail { +namespace robin_hood { + +/// Three-way insert result enum +enum class insert_result : cuda::std::int8_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; + +/** + * @brief Helper struct to store intermediate bucket probing results. + */ +struct bucket_probing_results { + detail::equal_result state_; ///< Equal result + cuda::std::int32_t intra_bucket_index_; ///< Intra-bucket index + + /** + * @brief Constructs bucket_probing_results. + * + * @param state The three way equality result + * @param index Intra-bucket index + */ + __device__ explicit constexpr bucket_probing_results(detail::equal_result state, + cuda::std::int32_t index) noexcept + : state_{state}, intra_bucket_index_{index} + { + } +}; + +/** + * @brief Common device non-owning "ref" implementation class. + * + * @note This class should NOT be used directly. + * + * @throw If the size of the given key type is larger than `cuco::open_addressing_max_key_size` + * @throw If the size of the given slot type is larger than `cuco::open_addressing_max_slot_size` + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::is_bitwise_comparable_v == false` + * @throw If the given payload type doesn't have unique object representations, i.e., + * `cuco::is_bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `sizeof(Key) <= cuco::open_addressing_max_key_size` and + * `cuco::is_bitwise_comparable_v` + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type. Its `value_type` must fit in + * `cuco::open_addressing_max_slot_size`; + * payloads, if present, must be 4 or 8 bytes (or 16 with sm_90+) and satisfy + * `cuco::is_bitwise_comparable_v` + * @tparam AllowsDuplicates Flag indicating whether duplicate keys are allowed or not + */ +template +class open_addressing_ref_impl + : private open_addressing_compatible { + using storage_value_type = typename StorageRef::value_type; + + /// Determines if the container is a key/value or key-only store + static constexpr auto has_payload = not cuda::std::is_same_v; + + /// Flag indicating whether duplicate keys are allowed or not + static constexpr auto allows_duplicates = AllowsDuplicates; + + // TODO: how to re-enable this check? + // static_assert(is_bucket_extent_v, + // "Extent is not a valid cuco::bucket_extent"); + + public: + using key_type = Key; ///< Key type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type + using storage_ref_type = StorageRef; ///< Type of storage ref + using bucket_type = typename storage_ref_type::bucket_type; ///< Bucket type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto bucket_size = + storage_ref_type::bucket_size; ///< Number of elements handled per bucket + static constexpr auto thread_scope = Scope; ///< CUDA thread scope + + /** + * @brief Constructs open_addressing_ref_impl. + * + * @param empty_slot_sentinel Sentinel indicating an empty slot + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr open_addressing_ref_impl( + value_type empty_slot_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept + : empty_slot_sentinel_{empty_slot_sentinel}, + predicate_{ + this->extract_key(empty_slot_sentinel), this->extract_key(empty_slot_sentinel), predicate}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} + { + } + + /** + * @brief Constructs open_addressing_ref_impl. + * + * @param empty_slot_sentinel Sentinel indicating an empty slot + * @param erased_key_sentinel Sentinel indicating an erased key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr open_addressing_ref_impl( + value_type empty_slot_sentinel, + key_type erased_key_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept + : empty_slot_sentinel_{empty_slot_sentinel}, + predicate_{this->extract_key(empty_slot_sentinel), erased_key_sentinel, predicate}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} + { + } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept + { + return this->predicate_.empty_sentinel_; + } + + /** + * @brief Gets the sentinel value used to represent an empty payload slot. + * + * @return The sentinel value used to represent an empty payload slot + */ + template > + [[nodiscard]] __host__ __device__ constexpr auto empty_value_sentinel() const noexcept + { + return this->extract_payload(this->empty_slot_sentinel()); + } + + /** + * @brief Gets the sentinel value used to represent an erased key slot. + * + * @return The sentinel value used to represent an erased key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type erased_key_sentinel() const noexcept + { + return this->predicate_.erased_sentinel_; + } + + /** + * @brief Gets the sentinel used to represent an empty slot. + * + * @return The sentinel value used to represent an empty slot + */ + [[nodiscard]] __host__ __device__ constexpr value_type empty_slot_sentinel() const noexcept + { + return empty_slot_sentinel_; + } + + /** + * @brief Returns the function that compares keys for equality. + * + * @return The key equality predicate + */ + [[nodiscard]] __host__ + __device__ constexpr detail::equal_wrapper + predicate() const noexcept + { + return this->predicate_; + } + + /** + * @brief Gets the key comparator. + * + * @return The comparator used to compare keys + */ + [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept + { + return this->predicate().equal_; + } + + /** + * @brief Gets the probing scheme. + * + * @return The probing scheme used for the container + */ + [[nodiscard]] __host__ __device__ constexpr probing_scheme_type probing_scheme() const noexcept + { + return probing_scheme_; + } + + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept + { + return this->probing_scheme().hash_function(); + } + + /** + * @brief Gets the non-owning storage ref. + * + * @return The non-owning storage ref of the container + */ + [[nodiscard]] __host__ __device__ constexpr storage_ref_type storage_ref() const noexcept + { + return storage_ref_; + } + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + { + return storage_ref_.capacity(); + } + + /** + * @brief Gets the bucket extent of the current storage. + * + * @return The bucket extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept + { + return storage_ref_.extent(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() const noexcept + { + return storage_ref_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept { return storage_ref_.end(); } + + /** + * @brief Makes a copy of the current device reference using non-owned memory. + * + * This function is intended to be used to create shared memory copies of small static data + * structures, although global memory can be used as well. + * + * @tparam CG The type of the cooperative thread group + * + * @param g The cooperative thread group used to copy the data structure + * @param memory_to_use Array large enough to support `capacity` elements. Object does not take + * the ownership of the memory + */ + template + __device__ void make_copy(CG g, value_type* const memory_to_use) const noexcept + { + auto const num_slots = this->capacity(); +#if defined(CUCO_HAS_CUDA_BARRIER) +#pragma nv_diagnostic push +// Disables `barrier` initialization warning. +#pragma nv_diag_suppress static_var_with_dynamic_init + __shared__ cuda::barrier barrier; +#pragma nv_diagnostic pop + if (g.thread_rank() == 0) { init(&barrier, g.size()); } + g.sync(); + + cuda::memcpy_async( + g, memory_to_use, this->storage_ref().data(), sizeof(value_type) * num_slots, barrier); + + barrier.arrive_and_wait(); +#else + value_type const* const slots_ptr = this->storage_ref().data(); + for (size_type i = g.thread_rank(); i < num_slots; i += g.size()) { + memory_to_use[i] = slots_ptr[i]; + } + g.sync(); +#endif + } + + /** + * @brief Initializes the container storage. + * + * @note This function synchronizes the group `tile`. + * + * @tparam CG The type of the cooperative thread group + * + * @param tile The cooperative thread group used to initialize the container + */ + template + __device__ constexpr void initialize(CG tile) noexcept + { + auto tid = tile.thread_rank(); + auto const extent = static_cast(this->extent()); + + auto* const slots_ptr = this->storage_ref().data(); + while (tid < extent) { + slots_ptr[tid] = this->empty_slot_sentinel(); + tid += tile.size(); + } + + tile.sync(); + } + + /** + * @brief Inserts an element. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(Value value) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); + + auto probing_iter = + probing_scheme_.template make_iterator(key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : bucket_slots) { + auto const eq_res = this->predicate_.template operator()( + key, this->extract_key(slot_content)); + + if constexpr (not allows_duplicates) { + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + } + if (eq_res == detail::equal_result::AVAILABLE) { + auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); + switch (attempt_insert( + this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val)) { + case insert_result::DUPLICATE: { + if constexpr (allows_duplicates) { + [[fallthrough]]; + } else { + return false; + } + } + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return true; + } + } + } + ++probing_iter; + if (*probing_iter == init_idx) { return false; } + } + } + + /** + * @brief Inserts an element. + * + * @tparam Value Input type which is convertible to 'value_type' + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(cooperative_groups::thread_block_tile group, + Value value) noexcept + { + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_bucket_index] = [&]() { + bucket_probing_results result{detail::equal_result::UNEQUAL, -1}; + cuda::static_for([&] __device__(auto i) { + if (result.state_ == detail::equal_result::UNEQUAL) { + switch (this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i()]))) { + case detail::equal_result::AVAILABLE: + result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + break; + case detail::equal_result::EQUAL: { + if constexpr (!allows_duplicates) { + result = bucket_probing_results{detail::equal_result::EQUAL, i()}; + } + break; + } + default: break; + } + } + }); + return result; + }(); + + if constexpr (not allows_duplicates) { + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + } + + auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); + if (group_contains_available) { + auto const src_lane = __ffs(group_contains_available) - 1; + auto status = insert_result::CONTINUE; + if (group.thread_rank() == src_lane) { + if constexpr (SupportsErase) { + status = attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), + bucket_slots[intra_bucket_index], + val); + } else { + status = attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), + this->empty_slot_sentinel(), + val); + } + } + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: { + if constexpr (allows_duplicates) { + [[fallthrough]]; + } else { + return false; + } + } + default: continue; + } + } else { + ++probing_iter; + if (*probing_iter == init_idx) { return false; } + } + } + } + + /** + * @brief Inserts the given element into the container. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + template + __device__ cuda::std::pair insert_and_find(Value value) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); +#if __CUDA_ARCH__ < 700 + // Spinning to ensure that the write to the value part took place requires + // independent thread scheduling introduced with the Volta architecture. + static_assert(sizeof(value_type) <= 8, + "insert_and_find is not supported for slot types larger than 8 bytes on " + "pre-Volta GPUs."); +#endif + + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); + auto probing_iter = + probing_scheme_.template make_iterator(key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < bucket_size; ++i) { + auto const eq_res = this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i])); + auto* slot_ptr = this->get_slot_ptr(*probing_iter, i); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { + this->maybe_wait_for_payload(slot_ptr); + return {iterator{slot_ptr}, false}; + } + if (eq_res == detail::equal_result::AVAILABLE) { + switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { + case insert_result::SUCCESS: { + this->maybe_wait_for_payload(slot_ptr); + return {iterator{slot_ptr}, true}; + } + case insert_result::DUPLICATE: { + this->maybe_wait_for_payload(slot_ptr); + return {iterator{slot_ptr}, false}; + } + default: continue; + } + } + } + ++probing_iter; + if (*probing_iter == init_idx) { return {this->end(), false}; } + }; + } + + /** + * @brief Inserts the given element into the container. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @tparam Value Input type which is convertible to 'value_type' + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + template + __device__ cuda::std::pair insert_and_find( + cooperative_groups::thread_block_tile group, Value value) noexcept + { +#if __CUDA_ARCH__ < 700 + // Spinning to ensure that the write to the value part took place requires + // independent thread scheduling introduced with the Volta architecture. + static_assert(sizeof(value_type) <= 8, + "insert_and_find is not supported for slot types larger than 8 bytes on " + "pre-Volta GPUs."); +#endif + + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_bucket_index] = [&]() { + bucket_probing_results result{detail::equal_result::UNEQUAL, -1}; + cuda::static_for([&] __device__(auto i) { + if (result.state_ == detail::equal_result::UNEQUAL) { + auto res = this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i()])); + if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } + } + }); + return result; + }(); + + auto* slot_ptr = this->get_slot_ptr(*probing_iter, intra_bucket_index); + + // If the key is already in the container, return false + auto const group_finds_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_equal) { + auto const src_lane = __ffs(group_finds_equal) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } + group.sync(); + return {iterator{reinterpret_cast(res)}, false}; + } + + auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); + if (group_contains_available) { + auto const src_lane = __ffs(group_contains_available) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + auto const status = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } + return this->attempt_insert_stable(slot_ptr, bucket_slots[target_idx], val); + }(); + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: { + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } + group.sync(); + return {iterator{reinterpret_cast(res)}, true}; + } + case insert_result::DUPLICATE: { + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } + group.sync(); + return {iterator{reinterpret_cast(res)}, false}; + } + default: continue; + } + } else { + ++probing_iter; + if (*probing_iter == init_idx) { return {this->end(), false}; } + } + } + } + + /** + * @brief Erases an element. + * + * @tparam ProbeKey Input type which is convertible to 'key_type' + * + * @param key The element to erase + * + * @return True if the given element is successfully erased + */ + template + __device__ bool erase(ProbeKey key) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto probing_iter = + probing_scheme_.template make_iterator(key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : bucket_slots) { + auto const eq_res = + this->predicate_.template operator()(key, this->extract_key(slot_content)); + + // Key doesn't exist, return false + if (eq_res == detail::equal_result::EMPTY) { return false; } + // Key exists, return true if successfully deleted + if (eq_res == detail::equal_result::EQUAL) { + auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); + switch (attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + slot_content, + this->erased_slot_sentinel())) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } + } + ++probing_iter; + if (*probing_iter == init_idx) { return false; } + } + } + + /** + * @brief Erases an element. + * + * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform group erase + * @param key The element to erase + * + * @return True if the given element is successfully erased + */ + template + __device__ bool erase(cooperative_groups::thread_block_tile group, + ProbeKey key) noexcept + { + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_bucket_index] = [&]() { + bucket_probing_results result{detail::equal_result::UNEQUAL, -1}; + cuda::static_for([&] __device__(auto i) { + if (result.state_ == detail::equal_result::UNEQUAL) { + auto res = this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i()])); + if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } + } + }); + return result; + }(); + + auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_contains_equal) { + auto const src_lane = __ffs(group_contains_equal) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + bucket_slots[intra_bucket_index], + this->erased_slot_sentinel()) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } + + // Key doesn't exist, return false + if (group.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + if (*probing_iter == init_idx) { return false; } + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey key) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = + probing_scheme_.template make_iterator(key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const bucket_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < bucket_size; ++i) { + switch (this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i]))) { + case detail::equal_result::UNEQUAL: continue; + case detail::equal_result::EMPTY: return false; + case detail::equal_result::EQUAL: return true; + } + } + ++probing_iter; + if (*probing_iter == init_idx) { return false; } + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. + * + * @tparam ProbeKey Probe key type + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile group, ProbeKey key) const noexcept + { + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + auto const state = [&]() { + auto res = detail::equal_result::UNEQUAL; + for (auto i = 0; i < bucket_size; ++i) { + res = this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i])); + if (res != detail::equal_result::UNEQUAL) { return res; } + } + return res; + }(); + + if (group.any(state == detail::equal_result::EQUAL)) { return true; } + if (group.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + if (*probing_iter == init_idx) { return false; } + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ iterator find(ProbeKey key) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = + probing_scheme_.template make_iterator(key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const bucket_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < bucket_size; ++i) { + switch (this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i]))) { + case detail::equal_result::EMPTY: { + return this->end(); + } + case detail::equal_result::EQUAL: { + return iterator{this->get_slot_ptr(*probing_iter, i)}; + } + default: continue; + } + } + ++probing_iter; + if (*probing_iter == init_idx) { return this->end(); } + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ iterator + find(cooperative_groups::thread_block_tile group, ProbeKey key) const noexcept + { + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_bucket_index] = [&]() { + bucket_probing_results result{detail::equal_result::UNEQUAL, -1}; + cuda::static_for([&] __device__(auto i) { + if (result.state_ == detail::equal_result::UNEQUAL) { + auto res = this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i()])); + if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } + } + }); + return result; + }(); + + // Find a match for the probe key, thus return an iterator to the entry + auto const group_finds_match = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_match) { + auto const src_lane = __ffs(group_finds_match) - 1; + auto const res = group.shfl( + reinterpret_cast(this->get_slot_ptr(*probing_iter, intra_bucket_index)), + src_lane); + return iterator{reinterpret_cast(res)}; + } + + // Find an empty slot, meaning that the probe key isn't present in the container + if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + + ++probing_iter; + if (*probing_iter == init_idx) { return this->end(); } + } + } + + /** + * @brief Counts the occurrence of a given key contained in the container + * + * @tparam ProbeKey Probe key type + * + * @param key The key to count for + * + * @return Number of occurrences found by the current thread + */ + template + [[nodiscard]] __device__ size_type count(ProbeKey key) const noexcept + { + if constexpr (not allows_duplicates) { + return static_cast(this->contains(key)); + } else { + auto probing_iter = + probing_scheme_.template make_iterator(key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + size_type count = 0; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + cuda::std::int32_t equals[bucket_size] = {0}; + bool empty_found = false; + + cuda::static_for([&] __device__(auto i) { + auto const result = predicate_.template operator()( + key, this->extract_key(bucket_slots[i()])); + equals[i()] = (result == detail::equal_result::EQUAL); + if (result == detail::equal_result::EMPTY) { empty_found = true; } + }); + + count += thrust::reduce(thrust::seq, equals, equals + bucket_size); + + if (empty_found) { return count; } + + ++probing_iter; + if (*probing_iter == init_idx) { return count; } + } + } + } + + /** + * @brief Counts the occurrence of a given key contained in the container + * + * @tparam ProbeKey Probe key type + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform group count + * @param key The key to count for + * + * @return Number of occurrences found by the current thread + */ + template + [[nodiscard]] __device__ size_type + count(cooperative_groups::thread_block_tile group, ProbeKey key) const noexcept + { + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + size_type count = 0; + + while (true) { + auto const bucket_slots = storage_ref_[*probing_iter]; + cuda::std::int32_t equals[bucket_size] = {0}; + bool empty_found = false; + + cuda::static_for([&] __device__(auto i) { + auto const result = + predicate_.template operator()(key, this->extract_key(bucket_slots[i()])); + equals[i()] = (result == detail::equal_result::EQUAL); + if (result == detail::equal_result::EMPTY) { empty_found = true; } + }); + + count += thrust::reduce(thrust::seq, equals, equals + bucket_size); + + if (group.any(empty_found)) { return count; } + + ++probing_iter; + if (*probing_iter == init_idx) { return count; } + } + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic counter type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Atomic object of integral type that is used to count the + * number of output elements + */ + template + __device__ void retrieve(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto constexpr is_outer = false; + auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); + auto const always_true_stencil = cuda::constant_iterator(true); + auto const identity_predicate = cuda::std::identity{}; + this->retrieve_impl(block, + input_probe_begin, + n, + always_true_stencil, + identity_predicate, + output_probe, + output_match, + atomic_counter); + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * If a key `k` has no matches in the container, then `{key, empty_slot_sentinel}` will be added + * to the output sequence. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic counter type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Atomic object of integral type that is used to count the + * number of output elements + */ + template + __device__ void retrieve_outer(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto constexpr is_outer = true; + auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); + auto const always_true_stencil = cuda::constant_iterator(true); + auto const identity_predicate = cuda::std::identity{}; + this->retrieve_impl(block, + input_probe_begin, + n, + always_true_stencil, + identity_predicate, + output_probe, + output_match, + atomic_counter); + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)` if `pred` of the corresponding stencil returns true. + * + * If key `k = *(first + i)` exists in the container and `pred( *(stencil + i) )` returns true, + * copies `k` to `output_probe` and associated slot contents to `output_match`, + * respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic counter type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Atomic object of integral type that is used to count the + * number of output elements + */ + template + __device__ void retrieve_if(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + StencilIt stencil, + Predicate pred, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto constexpr is_outer = false; + auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); + this->retrieve_impl( + block, input_probe_begin, n, stencil, pred, output_probe, output_match, atomic_counter); + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * If `IsOuter == true` and a key `k` has no matches in the container, then `{key, + * empty_slot_sentinel}` will be added to the output sequence. + * + * @tparam IsOuter Flag indicating if an inner or outer retrieve operation should be performed + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe Beginning of the input sequence of keys + * @param n Number of input keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Atomic object of integral type that is used to count the + * number of output elements + */ + template + __device__ void retrieve_impl(cooperative_groups::thread_block const& block, + InputProbeIt input_probe, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + namespace cg = cooperative_groups; + + if (n == 0) { return; } + + using probe_type = typename cuda::std::iterator_traits::value_type; + + // tuning parameter + auto constexpr buffer_multiplier = 1; + static_assert(buffer_multiplier > 0); + + auto constexpr probing_tile_size = cg_size; + auto constexpr flushing_tile_size = cuco::detail::warp_size(); + static_assert(flushing_tile_size >= probing_tile_size); + + auto constexpr num_flushing_tiles = BlockSize / flushing_tile_size; + auto constexpr max_matches_per_step = flushing_tile_size * bucket_size; + auto constexpr buffer_size = buffer_multiplier * max_matches_per_step + flushing_tile_size; + + auto const flushing_tile = cg::tiled_partition(block); + auto const probing_tile = cg::tiled_partition(block); + + auto const flushing_tile_id = flushing_tile.meta_group_rank(); + auto const stride = probing_tile.meta_group_size(); + auto idx = probing_tile.meta_group_rank(); + + __shared__ cuco::pair buffers[num_flushing_tiles][buffer_size]; + __shared__ cuda::std::int32_t counters[num_flushing_tiles]; + + if (flushing_tile.thread_rank() == 0) { counters[flushing_tile_id] = 0; } + flushing_tile.sync(); + + auto flush_buffers = [&](auto tile) { + size_type offset = 0; + auto const count = counters[flushing_tile_id]; + auto const rank = tile.thread_rank(); + if (rank == 0) { offset = atomic_counter.fetch_add(count, cuda::memory_order_relaxed); } + offset = tile.shfl(offset, 0); + + // flush_buffers + for (auto i = rank; i < count; i += tile.size()) { + *(output_probe + offset + i) = buffers[flushing_tile_id][i].first; + *(output_match + offset + i) = buffers[flushing_tile_id][i].second; + } + }; + + while (flushing_tile.any(idx < n)) { + bool active_flag = idx < n and pred(*(stencil + idx)); + auto const active_flushing_tile = + cg::binary_partition(flushing_tile, active_flag); + + if (active_flag) { + // perform probing + // make sure the flushing_tile is converged at this point to get a coalesced load + auto const probe_key = *(input_probe + idx); + + auto probing_iter = probing_scheme_.template make_iterator( + probing_tile, probe_key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + bool running = true; + [[maybe_unused]] bool found_match = false; + + bool equals[bucket_size]; + cuda::std::uint32_t exists[bucket_size]; + + while (active_flushing_tile.any(running)) { + if (running) { + // TODO atomic_ref::load if insert operator is present + auto const bucket_slots = this->storage_ref_[*probing_iter]; + + cuda::static_for([&] __device__(auto i) { + equals[i()] = false; + if (running) { + // inspect slot content + switch (this->predicate_.template operator()( + probe_key, this->extract_key(bucket_slots[i()]))) { + case detail::equal_result::EMPTY: { + running = false; + break; + } + case detail::equal_result::EQUAL: { + if constexpr (!AllowsDuplicates) { running = false; } + equals[i()] = true; + break; + } + default: { + break; + } + } + } + }); + + probing_tile.sync(); + running = probing_tile.all(running); + cuda::static_for( + [&](auto i) { exists[i()] = probing_tile.ballot(equals[i()]); }); + + // Fill the buffer if any matching keys are found + auto const lane_id = probing_tile.thread_rank(); + if (thrust::any_of(thrust::seq, exists, exists + bucket_size, cuda::std::identity{})) { + if constexpr (IsOuter) { found_match = true; } + + cuda::std::int32_t num_matches[bucket_size]; + + cuda::static_for( + [&](auto i) { num_matches[i()] = __popc(exists[i()]); }); + + cuda::std::int32_t output_idx; + if (lane_id == 0) { + auto const total_matches = + thrust::reduce(thrust::seq, num_matches, num_matches + bucket_size); + auto ref = cuda::atomic_ref{ + counters[flushing_tile_id]}; + output_idx = ref.fetch_add(total_matches, cuda::memory_order_relaxed); + } + output_idx = probing_tile.shfl(output_idx, 0); + + cuda::std::int32_t matches_offset = 0; + cuda::static_for([&] __device__(auto i) { + if (equals[i()]) { + auto const lane_offset = + detail::count_least_significant_bits(exists[i()], lane_id); + buffers[flushing_tile_id][output_idx + matches_offset + lane_offset] = { + probe_key, bucket_slots[i()]}; + } + matches_offset += num_matches[i()]; + }); + } + // Special handling for outer cases where no match is found + if constexpr (IsOuter) { + if (!running) { + if (!found_match and lane_id == 0) { + auto ref = cuda::atomic_ref{ + counters[flushing_tile_id]}; + auto const output_idx = ref.fetch_add(1, cuda::memory_order_relaxed); + buffers[flushing_tile_id][output_idx] = {probe_key, this->empty_slot_sentinel()}; + } + } + } + } // if running + + active_flushing_tile.sync(); + // if the buffer has not enough empty slots for the next iteration + if (counters[flushing_tile_id] > (buffer_size - max_matches_per_step)) { + flush_buffers(active_flushing_tile); + active_flushing_tile.sync(); + + // reset buffer counter + if (active_flushing_tile.thread_rank() == 0) { counters[flushing_tile_id] = 0; } + active_flushing_tile.sync(); + } + + // onto the next probing bucket + ++probing_iter; + if (*probing_iter == init_idx) { running = false; } + } // while running + } // if active_flag + + // onto the next key + idx += stride; + } + + flushing_tile.sync(); + // entire flusing_tile has finished; flush remaining elements + if (counters[flushing_tile_id] > 0) { flush_buffers(flushing_tile); } + } + + /** + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * + * @param key The key to search for + * @param callback_op Function to apply to every matched slot + */ + template + __device__ void for_each(ProbeKey key, CallbackOp&& callback_op) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = + probing_scheme_.template make_iterator(key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const bucket_slots = this->storage_ref_[*probing_iter]; + + bool should_return = false; + cuda::static_for([&] __device__(auto i) { + if (!should_return) { + switch (this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i()]))) { + case detail::equal_result::EMPTY: { + should_return = true; + break; + } + case detail::equal_result::EQUAL: { + callback_op(bucket_slots[i()]); + break; + } + default: break; + } + } + }); + if (should_return) { return; } + ++probing_iter; + if (*probing_iter == init_idx) { return; } + } + } + + /** + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. + * + * @note This function uses cooperative group semantics, meaning that any thread may call the + * callback if it finds a matching element. If multiple elements are found within the same group, + * each thread with a match will call the callback with its associated element. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @note Synchronizing `group` within `callback_op` is undefined behavior. + * + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param callback_op Function to apply to every matched slot + */ + template + __device__ void for_each(cooperative_groups::thread_block_tile group, + ProbeKey key, + CallbackOp&& callback_op) const noexcept + { + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + bool empty = false; + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const bucket_slots = this->storage_ref_[*probing_iter]; + + for (cuda::std::int32_t i = 0; i < bucket_size and !empty; ++i) { + switch (this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i]))) { + case detail::equal_result::EMPTY: { + empty = true; + continue; + } + case detail::equal_result::EQUAL: { + callback_op(bucket_slots[i]); + continue; + } + default: { + continue; + } + } + } + if (group.any(empty)) { return; } + + ++probing_iter; + if (*probing_iter == init_idx) { return; } + } + } + + /** + * @brief Applies the function object `callback_op` to the copy of every slot in the container + * with key equivalent to the probe key and can additionally perform work that requires + * synchronizing the Cooperative Group performing this operation. + * + * @note This function uses cooperative group semantics, meaning that any thread may call the + * callback if it finds a matching element. If multiple elements are found within the same group, + * each thread with a match will call the callback with its associated element. + * + * @note Synchronizing `group` within `callback_op` is undefined behavior. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @note The `sync_op` function can be used to perform work that requires synchronizing threads in + * `group` inbetween probing steps, where the number of probing steps performed between + * synchronization points is capped by `bucket_size * cg_size`. The functor will be called right + * after the current probing bucket has been traversed. + * + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * @tparam SyncOp Type of function object which accepts the current `group` object + * @tparam ParentCG Type of parent Cooperative Group + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param callback_op Function to apply to every matched slot + * @param sync_op Function that is allowed to synchronize `group` inbetween probing buckets + */ + template + __device__ void for_each(cooperative_groups::thread_block_tile group, + ProbeKey key, + CallbackOp&& callback_op, + SyncOp&& sync_op) const noexcept + { + auto probing_iter = + probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); + auto const init_idx = *probing_iter; + bool empty = false; + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const bucket_slots = this->storage_ref_[*probing_iter]; + + for (cuda::std::int32_t i = 0; i < bucket_size and !empty; ++i) { + switch (this->predicate_.template operator()( + key, this->extract_key(bucket_slots[i]))) { + case detail::equal_result::EMPTY: { + empty = true; + continue; + } + case detail::equal_result::EQUAL: { + callback_op(bucket_slots[i]); + continue; + } + default: { + continue; + } + } + } + sync_op(group); + if (group.any(empty)) { return; } + + ++probing_iter; + if (*probing_iter == init_idx) { return; } + } + } + + /** + * @brief Gets a pointer to the slot at the given probing index and intra-bucket index. + * + * @param probing_idx The current probing index + * @param intra_bucket_idx The index within the bucket (0 for flat storage) + * @return Pointer to the slot + */ + __device__ value_type* get_slot_ptr(size_type probing_idx, + cuda::std::int32_t intra_bucket_idx) const noexcept + { + return storage_ref_.data() + probing_idx + intra_bucket_idx; + } + + /** + * @brief Extracts the key from a given value type. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param value The input value + * + * @return The key + */ + template + [[nodiscard]] __host__ __device__ constexpr auto extract_key(Value value) const noexcept + { + if constexpr (has_payload) { + return thrust::raw_reference_cast(value).first; + } else { + return thrust::raw_reference_cast(value); + } + } + + /** + * @brief Extracts the payload from a given value type. + * + * @note This function is only available if `this->has_payload == true` + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param value The input value + * + * @return The payload + */ + template > + [[nodiscard]] __host__ __device__ constexpr auto extract_payload(Value value) const noexcept + { + return thrust::raw_reference_cast(value).second; + } + + /** + * @brief Converts the given type to the container's native `value_type`. + * + * @tparam T Input type which is convertible to 'value_type' + * + * @param value The input value + * + * @return The converted object + */ + template + [[nodiscard]] __device__ constexpr value_type native_value(T value) const noexcept + { + if constexpr (has_payload) { + return {static_cast(this->extract_key(value)), this->extract_payload(value)}; + } else { + return static_cast(value); + } + } + + /** + * @brief Converts the given type to the container's native `value_type` while maintaining the + * heterogeneous key type. + * + * @tparam T Input type which is convertible to 'value_type' + * + * @param value The input value + * + * @return The converted object + */ + template + [[nodiscard]] __device__ constexpr auto heterogeneous_value(T value) const noexcept + { + if constexpr (has_payload and not cuda::std::is_same_v) { + using mapped_type = decltype(this->empty_value_sentinel()); + if constexpr (cuco::detail::is_cuda_std_pair_like::value) { + return cuco::pair{cuda::std::get<0>(value), + static_cast(cuda::std::get<1>(value))}; + } else { + // hail mary (convert using .first/.second members) + return cuco::pair{thrust::raw_reference_cast(value.first), + static_cast(value.second)}; + } + } else { + return thrust::raw_reference_cast(value); + } + } + + /** + * @brief Gets the sentinel used to represent an erased slot. + * + * @return The sentinel value used to represent an erased slot + */ + [[nodiscard]] __device__ constexpr value_type erased_slot_sentinel() const noexcept + { + if constexpr (has_payload) { + return cuco::pair{this->erased_key_sentinel(), this->empty_value_sentinel()}; + } else { + return this->erased_key_sentinel(); + } + } + + /** + * @brief Inserts the specified element with one single CAS operation. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param address Pointer to the slot in memory + * @param expected Element to compare against + * @param desired Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result packed_cas(value_type* address, + value_type expected, + Value desired) noexcept + { + using packed_type = cuco::detail::packed_t; + + auto* slot_ptr = reinterpret_cast(address); + auto* expected_ptr = reinterpret_cast(&expected); + auto* desired_ptr = reinterpret_cast(&desired); + + auto slot_ref = cuda::atomic_ref{*slot_ptr}; + + auto const success = + slot_ref.compare_exchange_strong(*expected_ptr, *desired_ptr, cuda::memory_order_relaxed); + + if (success) { + return insert_result::SUCCESS; + } else { + return this->predicate_.equal_to(this->extract_key(desired), this->extract_key(expected)) == + detail::equal_result::EQUAL + ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } + } + + /** + * @brief Inserts the specified element with two back-to-back CAS operations. + * + * @note This CAS can be used exclusively for `cuco::op::insert` operations. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param address Pointer to the slot in memory + * @param expected Element to compare against + * @param desired Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result back_to_back_cas(value_type* address, + value_type expected, + Value desired) noexcept + { + using mapped_type = cuda::std::decay_tempty_value_sentinel())>; + + auto expected_key = expected.first; + auto expected_payload = this->empty_value_sentinel(); + + cuda::atomic_ref key_ref(address->first); + cuda::atomic_ref payload_ref(address->second); + + auto const key_cas_success = key_ref.compare_exchange_strong( + expected_key, static_cast(desired.first), cuda::memory_order_relaxed); + auto payload_cas_success = payload_ref.compare_exchange_strong( + expected_payload, desired.second, cuda::memory_order_relaxed); + + // if key success + if (key_cas_success) { + while (not payload_cas_success) { + payload_cas_success = + payload_ref.compare_exchange_strong(expected_payload = this->empty_value_sentinel(), + desired.second, + cuda::memory_order_relaxed); + } + return insert_result::SUCCESS; + } else if (payload_cas_success) { + // This is insert-specific, cannot for `erase` operations + payload_ref.store(this->empty_value_sentinel(), cuda::memory_order_relaxed); + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + if (this->predicate_.equal_to(desired.first, expected_key) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Inserts the specified element with CAS-dependent write operations. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param address Pointer to the slot in memory + * @param expected Element to compare against + * @param desired Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result cas_dependent_write(value_type* address, + value_type expected, + Value desired) noexcept + { + using mapped_type = cuda::std::decay_tempty_value_sentinel())>; + + cuda::atomic_ref key_ref(address->first); + auto expected_key = expected.first; + auto const success = key_ref.compare_exchange_strong( + expected_key, static_cast(desired.first), cuda::memory_order_relaxed); + + // if key success + if (success) { + cuda::atomic_ref payload_ref(address->second); + payload_ref.store(desired.second, cuda::memory_order_relaxed); + return insert_result::SUCCESS; + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + if (this->predicate_.equal_to(desired.first, expected_key) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Attempts to insert an element into a slot. + * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param address Pointer to the slot in memory + * @param expected Element to compare against + * @param desired Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ insert_result attempt_insert(value_type* address, + value_type expected, + Value desired) noexcept + { + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(address, expected, desired); + } +#if (__CUDA_ARCH__ >= 900) + else if constexpr (cuco::detail::is_packable()) { + return packed_cas(address, expected, desired); + } +#endif + else if constexpr (has_payload) { +#if (__CUDA_ARCH__ < 700) + return cas_dependent_write(address, expected, desired); +#else + return back_to_back_cas(address, expected, desired); +#endif + } else { + static_assert(cuco::dependent_false, + "No valid atomic CAS path: 16-byte key in a key-only container must be " + "packable (have unique object representations) and target sm_90+."); + } + } + + /** + * @brief Attempts to insert an element into a slot. + * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. + * + * @note `stable` indicates that the payload will only be updated once from the sentinel value to + * the desired value, meaning there can be no ABA situations. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param address Pointer to the slot in memory + * @param expected Element to compare against + * @param desired Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ insert_result attempt_insert_stable(value_type* address, + value_type expected, + Value desired) noexcept + { + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(address, expected, desired); + } +#if (__CUDA_ARCH__ >= 900) + else if constexpr (cuco::detail::is_packable()) { + return packed_cas(address, expected, desired); + } +#endif + else if constexpr (has_payload) { + return cas_dependent_write(address, expected, desired); + } else { + static_assert(cuco::dependent_false, + "No valid atomic CAS path: 16-byte key in a key-only container must be " + "packable (have unique object representations) and target sm_90+."); + } + } + + /** + * @brief Waits until the slot payload has been updated + * + * @note The function will return once the slot payload is no longer equal to the sentinel + * value. + * + * @tparam T Map slot type + * + * @param slot The target slot to check payload with + * @param sentinel The slot sentinel value + */ + template + __device__ void wait_for_payload(T& slot, T sentinel) const noexcept + { + auto ref = cuda::atomic_ref{slot}; + T current; + // TODO exponential backoff strategy + do { + current = ref.load(cuda::std::memory_order_relaxed); + } while (cuco::detail::bitwise_compare(current, sentinel)); + } + + /** + * @brief Conditionally spin-waits for the payload of a non-atomically inserted slot to become + * visible. + * + * For containers where the key and value are inserted by separate instructions + * (`cas_dependent_write` / `back_to_back_cas`), an observer thread may see the key before the + * payload. This helper spins until the payload is visible. For atomic single-CAS paths (slot + * size <= 8 bytes, or a packable slot on sm_90+ via `atom.cas.b128`), the payload is already + * visible and this is a no-op. + * + * @tparam SlotPtr Pointer-like type to a slot holding a `.second` payload member + * + * @param slot_ptr Pointer to the slot whose payload may need waiting on + */ + template + __device__ void maybe_wait_for_payload(SlotPtr slot_ptr) noexcept + { + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif + } + } + + // TODO: Clean up the sentinel handling since it's duplicated in ref and equal wrapper + value_type empty_slot_sentinel_; ///< Sentinel value indicating an empty slot + detail::equal_wrapper + predicate_; ///< Key equality binary callable + probing_scheme_type probing_scheme_; ///< Probing scheme + storage_ref_type storage_ref_; ///< Slot storage ref +}; + +} // namespace robin_hood +} // namespace detail +} // namespace cuco From 1fb7cf37f47c7e7005c6790501ad6f15d3ae7e8f Mon Sep 17 00:00:00 2001 From: aterenin Date: Thu, 25 Jun 2026 14:51:47 -0400 Subject: [PATCH 14/15] Add Robin Hood get and insert. --- .../robin_hood/open_addressing_ref_impl.cuh | 472 +++++++++++++++++- include/cuco/static_map_ref.cuh | 8 +- 2 files changed, 453 insertions(+), 27 deletions(-) diff --git a/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh index 1eecb70f4..841e71f4d 100644 --- a/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include #include @@ -68,6 +70,52 @@ struct bucket_probing_results { } }; +/** + * @brief Robin Hood inverse primitive for the linear probing sequence. + * + * @note Recovers a resident's probe distance ("age") from the slot it occupies: how many probing + * steps the resident sits from its own home bucket. For the linear sequence this is a single + * subtract — `(slot_base - resident_home) / stride mod num_buckets`. This is the linear-only + * overload; a `double_hashing` variant would add its own overload here (a modular inverse of the + * resident's per-key step, or a stored age), which is the single place that change lands. + * + * @tparam BucketSize Size of the bucket + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param scheme The underlying linear probing scheme (supplies the hash function) + * @param resident_key The key currently residing in the slot + * @param slot_index The slot index at which `resident_key` resides + * @param upper_bound Upper bound of the iteration + * @return The resident's probe distance, in probing steps + */ +template +[[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( + linear_probing const& scheme, + ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) noexcept +{ + using size_type = typename Extent::value_type; + size_type constexpr stride = CGSize * BucketSize; + auto const bound = static_cast(upper_bound); + auto const hash = scheme.hash_function(); + + // Home bucket base of the resident, using the same alignment as `make_iterator`. + size_type const resident_home = + cuco::detail::sanitize_hash(hash(resident_key)) % (bound / stride) * stride; + + // Bucket-strided base of the slot the resident currently occupies. The per-lane `thread_rank` + // offset (which is < stride) is stripped by the floor division so that the distance is measured + // in whole probing steps, consistent with the forward sequence. + size_type const slot_base = (slot_index / stride) * stride; + + // (slot_base - resident_home) mod capacity, expressed in probing steps. + return static_cast((slot_base + bound - resident_home) % bound) / stride; +} + /** * @brief Common device non-owning "ref" implementation class. * @@ -130,6 +178,21 @@ class open_addressing_ref_impl storage_ref_type::bucket_size; ///< Number of elements handled per bucket static constexpr auto thread_scope = Scope; ///< CUDA thread scope + // Robin Hood displacement swaps the in-flight pair into an *occupied* slot, which needs a single + // atomic CAS of the whole slot. That requires a packable slot: <= 8 bytes (atom.cas.b64), or + // padding-free and <= 16 bytes on an sm_90+ build (atom.cas.b128). A non-packable slot (e.g. a + // padded `pair`) would fall back to a split key/value CAS, which cannot move an + // occupied slot -- displacement would livelock. Reject it at compile time rather than hang. + static constexpr bool robin_hood_slot_is_single_cas = sizeof(value_type) <= 8 +#if defined(CUCO_HAS_128BIT_ATOMICS) + or cuco::detail::is_packable() +#endif + ; + static_assert(robin_hood_slot_is_single_cas, + "Robin Hood probing requires a single-CAS slot: the key+value must fit in 8 bytes, " + "or be packable (padding-free) and <= 16 bytes on an sm_90+ build. A padded slot " + "(e.g. pair) is unsupported -- displacement would livelock."); + /** * @brief Constructs open_addressing_ref_impl. * @@ -379,16 +442,19 @@ class open_addressing_ref_impl { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(slot_content)); @@ -397,7 +463,10 @@ class open_addressing_ref_impl // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { return false; } } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone carries an age and is handled as a + // resident by the displacement test below. Skipping it must gate the CAS (once claimed it + // is already consumed), so it is folded into this condition. + if (eq_res == detail::equal_result::AVAILABLE and not this->is_erased(slot_content)) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); switch (attempt_insert( this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val)) { @@ -408,11 +477,58 @@ class open_addressing_ref_impl return false; } } - case insert_result::CONTINUE: continue; + case insert_result::CONTINUE: { + // Retry on a lost CAS. Plain probing keeps scanning this (now stale) bucket; Robin + // Hood must re-read it instead, so the in-flight pair is re-evaluated against the new + // occupants -- otherwise it could be placed past a slot it should have displaced, + // breaking the invariant (and therefore lookups). + retry = true; + break; + } case insert_result::SUCCESS: return true; } + if (retry) { break; } // leave the scan to re-read the bucket + } + + // Robin Hood swap test. A resident "richer" than the in-flight pair (a smaller probe + // distance than our current probe step) is displaced: we swap our pair into its slot, adopt + // the evicted resident, and re-probe forward. A tombstone is treated as a resident too -- + // its age comes from its payload (`robin_hood_age`) -- but picking one up *consumes* it: we + // take the slot and are done, since there is nothing to carry forward. + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(slot_content)) { + auto const intra_bucket_index = + cuda::std::distance(bucket_slots.begin(), &slot_content); + auto const evicted_age = this->robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + if (evicted_age < probe_step) { + if (this->attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), + slot_content, + val) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (this->is_erased(slot_content)) { return true; } + // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is + // `evicted_age`, and it may belong in another slot of the same bucket: an empty + // one, or one holding an even-richer resident it can displace in turn. Re-reading + // the bucket (rather than advancing past it) is the within-bucket linear probe, + // i.e. the combined bucket+slot distance that makes displacement correct for + // bucket_size > 1. The `slot_distance` term cancels in every comparison, so it + // never appears here; it shows up only as this slot-by-slot continuation. + // `bit_cast` keeps the adoption valid for heterogeneous insert types + // (layout-compatible by contract; identity in the common case). + val = cuda::std::bit_cast(slot_content); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = + true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS + break; + } } } + + if (retry) { continue; } // re-probe (re-read this bucket, or move on after displacement) + ++probe_step; + ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -433,11 +549,12 @@ class open_addressing_ref_impl __device__ bool insert(cooperative_groups::thread_block_tile group, Value value) noexcept { - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -448,9 +565,15 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { switch (this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()]))) { - case detail::equal_result::AVAILABLE: - result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + case detail::equal_result::AVAILABLE: { + // Robin Hood: only a true empty is AVAILABLE; a tombstone is a resident handled by + // the displacement scan below, so leave it UNEQUAL here. + bool empty_slot = not this->is_erased(bucket_slots[i()]); + if (empty_slot) { + result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + } break; + } case detail::equal_result::EQUAL: { if constexpr (!allows_duplicates) { result = bucket_probing_results{detail::equal_result::EQUAL, i()}; @@ -497,6 +620,60 @@ class open_addressing_ref_impl default: continue; } } else { + // Robin Hood displacement: no match, no empty slot in this bucket. Displace the first + // resident in probe (lane) order that is richer than the in-flight pair, adopt it, and + // re-probe THIS bucket -- the victim may belong in another slot of it. The within-bucket + // linear probe (combined bucket+slot distance) is identical to the scalar path; the + // `slot_distance` term cancels, so the test is again `resident distance < probe_step`. + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + // Only `src_lane` reads `evicted` meaningfully; other lanes just need a valid value to + // feed the broadcast `shfl` below, so seed it with the empty-slot sentinel. + value_type evicted = this->empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = + attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (group.shfl(this->is_erased(evicted), src_lane)) { return true; } + // Broadcast the evicted pair and its probe distance from the winning lane, and adopt + // it on every lane (all lanes need the new in-flight pair for the next scan). + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; // success: re-probe this bucket with the victim; lost CAS: re-read it + } + // No displaceable resident: fall through to the shared advance below. + ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -529,15 +706,21 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember the slot it landed + // in so we return an iterator to it (not to a later victim's slot). + value_type* placed_ptr = nullptr; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + bool retry = false; + for (auto i = 0; i < bucket_size; ++i) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i])); @@ -548,20 +731,58 @@ class open_addressing_ref_impl this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the + // displacement test below (see `insert`). + if (eq_res == detail::equal_result::AVAILABLE and not this->is_erased(bucket_slots[i])) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - this->maybe_wait_for_payload(slot_ptr); - return {iterator{slot_ptr}, true}; + // The in-flight pair is placed in an empty slot, ending any displacement chain. The + // iterator to return is the original key's slot (captured on its first placement). + auto* result_ptr = slot_ptr; + if (placed_ptr != nullptr) { result_ptr = placed_ptr; } + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; } case insert_result::DUPLICATE: { this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - default: continue; + case insert_result::CONTINUE: { + retry = true; + break; + } + } + if (retry) { break; } + } + + // Robin Hood swap test (see `insert` for the full rationale). A tombstone is a resident too + // (age from its payload); picking one up consumes it -- the in-flight pair lands there and + // we are done. + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(bucket_slots[i])) { + auto const evicted_age = + this->robin_hood_age(bucket_slots[i], static_cast(*probing_iter + i)); + if (evicted_age < probe_step) { + if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == insert_result::SUCCESS) { + if (this->is_erased(bucket_slots[i])) { + // Consumed a tombstone: the in-flight pair is placed here; return the original + // key's slot (this one if it was never displaced). + auto* result_ptr = (placed_ptr != nullptr) ? placed_ptr : slot_ptr; + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; + } + if (placed_ptr == nullptr) { placed_ptr = slot_ptr; } // original key's slot + val = cuda::std::bit_cast(bucket_slots[i]); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; } } } + + if (retry) { continue; } + ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } }; @@ -595,11 +816,15 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember (broadcast) the slot + // it first landed in so we return an iterator to it. 0 means "not yet placed". + intptr_t placed_ptr = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -610,6 +835,11 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { auto res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()])); + // Robin Hood: a tombstone is a resident handled by the displacement scan below, not + // AVAILABLE, so leave it UNEQUAL here. + if (res == detail::equal_result::AVAILABLE and this->is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } } }); @@ -639,9 +869,13 @@ class open_addressing_ref_impl switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { + // The in-flight pair is placed in an empty slot, ending any displacement chain. Return + // the original key's slot (the first placement) if it was displaced earlier. + auto result = res; + if (placed_ptr != 0) { result = placed_ptr; } if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); - return {iterator{reinterpret_cast(res)}, true}; + return {iterator{reinterpret_cast(result)}, true}; } case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } @@ -651,6 +885,62 @@ class open_addressing_ref_impl default: continue; } } else { + // Robin Hood displacement (see CG `insert` for the full rationale). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + value_type evicted = this->empty_slot_sentinel(); + intptr_t displaced = 0; + if (group.thread_rank() == src_lane) { + auto* dptr = this->get_slot_ptr(*probing_iter, displace_idx); + evicted = bucket_slots[displace_idx]; + status = attempt_insert(dptr, evicted, val); + displaced = reinterpret_cast(dptr); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } + // Consumed a tombstone: the in-flight pair is placed in its slot; we are done. Return + // the original key's slot (`placed_ptr`, which is this slot if it was never + // displaced). + if (group.shfl(this->is_erased(evicted), src_lane)) { + if (group.thread_rank() == src_lane) { + this->maybe_wait_for_payload(reinterpret_cast(displaced)); + } + group.sync(); + return {iterator{reinterpret_cast(placed_ptr)}, true}; + } + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } } @@ -687,9 +977,12 @@ class open_addressing_ref_impl // Key exists, return true if successfully deleted if (eq_res == detail::equal_result::EQUAL) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - switch (attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - slot_content, - this->erased_slot_sentinel())) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. + value_type erased = this->robin_hood_erased_sentinel( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + switch (attempt_insert_stable( + this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, erased)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: return false; default: continue; @@ -738,12 +1031,17 @@ class open_addressing_ref_impl auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - auto const status = - (group.thread_rank() == src_lane) - ? attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - bucket_slots[intra_bucket_index], - this->erased_slot_sentinel()) - : insert_result::CONTINUE; + auto status = insert_result::CONTINUE; + if (group.thread_rank() == src_lane) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. + value_type erased = this->robin_hood_erased_sentinel( + bucket_slots[intra_bucket_index], + static_cast(*probing_iter + intra_bucket_index)); + status = attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + bucket_slots[intra_bucket_index], + erased); + } switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; @@ -779,6 +1077,7 @@ class open_addressing_ref_impl auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -792,6 +1091,11 @@ class open_addressing_ref_impl case detail::equal_result::EQUAL: return true; } } + // Robin Hood: a resident richer than us proves the key is absent. + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return false; + } + ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -818,6 +1122,7 @@ class open_addressing_ref_impl auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -835,6 +1140,11 @@ class open_addressing_ref_impl if (group.any(state == detail::equal_result::EQUAL)) { return true; } if (group.any(state == detail::equal_result::EMPTY)) { return false; } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return false; + } + ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -859,6 +1169,7 @@ class open_addressing_ref_impl auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -876,6 +1187,11 @@ class open_addressing_ref_impl default: continue; } } + // Robin Hood: a resident richer than us proves the key is absent. + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return this->end(); + } + ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -902,6 +1218,7 @@ class open_addressing_ref_impl auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); auto const init_idx = *probing_iter; + size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -931,6 +1248,11 @@ class open_addressing_ref_impl // Find an empty slot, meaning that the probe key isn't present in the container if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return this->end(); + } + ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -1593,6 +1915,104 @@ class open_addressing_ref_impl return storage_ref_.data() + probing_idx + intra_bucket_idx; } + /** + * @brief Determines whether the Robin Hood invariant proves the probe key absent at the current + * probe step. + * + * @note Only meaningful for Robin Hood probing. The key is proven absent when the bucket holds a + * resident that is "richer" than the probe key — i.e. whose own probe distance is smaller than + * the probe key's probe distance at the current step (`probe_step`). Such a resident would have + * been displaced on insertion if the probe key lived here, so the probe key cannot be present. + * + * @note Behavior is only well-defined when every slot in the bucket is occupied (the callers + * reach this check only after ruling out empty and matching slots), since probe distance is + * meaningless for an empty slot. + * + * @tparam BucketSlots Bucket slot array type + * + * @param bucket_slots The slots of the bucket currently being probed + * @param bucket_base The slot index of the first slot in the bucket + * @param probe_step The probe key's own probe distance at the current step + * + * @return True if some resident in the bucket is richer than the probe key + */ + template + [[nodiscard]] __device__ bool robin_hood_proves_absent(BucketSlots const& bucket_slots, + size_type bucket_base, + size_type probe_step) const noexcept + { + bool richer = false; + cuda::static_for([&](auto i) { + auto const resident_age = + this->robin_hood_age(bucket_slots[i()], static_cast(bucket_base + i())); + if (resident_age < probe_step) { richer = true; } + }); + return richer; + } + + /** + * @brief Whether `slot` holds a tombstone (erased marker). + * + * @note Returns false when erase is disabled (the erased and empty sentinels coincide, so no slot + * is a tombstone) -- this keeps the test correct even for empty slots. + * + * @param slot The slot to test + * + * @return True if `slot` is an erased tombstone + */ + [[nodiscard]] __device__ bool is_erased(value_type const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(this->erased_key_sentinel(), + this->empty_key_sentinel()) and + cuco::detail::bitwise_compare(this->extract_key(slot), this->erased_key_sentinel()); + } + + /** + * @brief Robin Hood probe distance ("age") of an occupied slot. + * + * A live key's age is its `probe_distance`. A Robin Hood tombstone keeps the age of the key it + * replaced in its payload (the original key is gone and cannot be rehashed; see `erase`), so it + * is read back here -- a tombstone then participates in every Robin Hood comparison exactly like + * the resident it stood in for. + * + * @param slot The (occupied) slot + * @param slot_index The slot's index + * + * @return The slot's probe distance + */ + [[nodiscard]] __device__ size_type robin_hood_age(value_type const& slot, + size_type slot_index) const noexcept + { + if constexpr (has_payload) { + if (this->is_erased(slot)) { return static_cast(this->extract_payload(slot)); } + } + return robin_hood::probe_distance( + probing_scheme_, this->extract_key(slot), slot_index, storage_ref_.extent()); + } + + /** + * @brief The Robin Hood tombstone for erasing the live key currently in `slot` at `slot_index`. + * + * The erased key's age is stashed in the payload (1a) so the tombstone keeps its place in the + * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes + * use the plain `erased_slot_sentinel()` and never call this. + * + * @param slot The slot's current (live) contents + * @param slot_index The slot's index + * + * @return The value to CAS into the slot to erase it + */ + [[nodiscard]] __device__ value_type + robin_hood_erased_sentinel(value_type const& slot, size_type slot_index) const noexcept + { + static_assert(has_payload, + "Robin Hood erase requires a mapped payload to store the tombstone age"); + auto const age = robin_hood::probe_distance( + probing_scheme_, this->extract_key(slot), slot_index, storage_ref_.extent()); + return cuco::pair{this->erased_key_sentinel(), + static_castempty_value_sentinel())>(age)}; + } + /** * @brief Extracts the key from a given value type. * diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 660a7fcbd..1c9691d2b 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -74,7 +75,12 @@ class static_map_ref static constexpr auto allows_duplicates = false; /// Implementation type - using impl_type = detail:: + // + // HARD-WIRE (experimental Robin Hood PR): static_map is routed through the Robin Hood engine + // instead of the generic open-addressing engine. This single line is what makes static_map use + // Robin Hood probing; downstream front-end work replaces it with a proper backend-selection + // mechanism. See robin_hood_refactor_plan.md. + using impl_type = detail::robin_hood:: open_addressing_ref_impl; public: From 53c884c675bce9e3347d6761f39873edd0b79c58 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jun 2026 21:55:21 +0000 Subject: [PATCH 15/15] [pre-commit.ci] auto code formatting --- .../robin_hood/open_addressing_ref_impl.cuh | 36 +++++++++---------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh index 841e71f4d..5bbbffc56 100644 --- a/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/robin_hood/open_addressing_ref_impl.cuh @@ -447,7 +447,7 @@ class open_addressing_ref_impl auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; while (true) { @@ -496,9 +496,8 @@ class open_addressing_ref_impl // its age comes from its payload (`robin_hood_age`) -- but picking one up *consumes* it: we // take the slot and are done, since there is nothing to carry forward. if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(slot_content)) { - auto const intra_bucket_index = - cuda::std::distance(bucket_slots.begin(), &slot_content); - auto const evicted_age = this->robin_hood_age( + auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); + auto const evicted_age = this->robin_hood_age( slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { if (this->attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), @@ -553,7 +552,7 @@ class open_addressing_ref_impl auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; while (true) { @@ -631,8 +630,8 @@ class open_addressing_ref_impl if (displace_idx < 0) { // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = this->robin_hood_age(bucket_slots[i()], - static_cast(*probing_iter + i())); + auto const age = + this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -649,8 +648,7 @@ class open_addressing_ref_impl value_type evicted = this->empty_slot_sentinel(); if (group.thread_rank() == src_lane) { evicted = bucket_slots[displace_idx]; - status = - attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); + status = attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); } if (group.shfl(status, src_lane) == insert_result::SUCCESS) { // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. @@ -710,7 +708,7 @@ class open_addressing_ref_impl auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; // Robin Hood may displace the original key before the chain ends; remember the slot it landed // in so we return an iterator to it (not to a later victim's slot). @@ -820,7 +818,7 @@ class open_addressing_ref_impl auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; // Robin Hood may displace the original key before the chain ends; remember (broadcast) the slot // it first landed in so we return an iterator to it. 0 means "not yet placed". @@ -892,8 +890,8 @@ class open_addressing_ref_impl if (displace_idx < 0) { // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = this->robin_hood_age(bucket_slots[i()], - static_cast(*probing_iter + i())); + auto const age = + this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -1076,7 +1074,7 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; while (true) { @@ -1092,9 +1090,7 @@ class open_addressing_ref_impl } } // Robin Hood: a resident richer than us proves the key is absent. - if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { - return false; - } + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { return false; } ++probe_step; ++probing_iter; if (*probing_iter == init_idx) { return false; } @@ -1121,7 +1117,7 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; while (true) { @@ -1168,7 +1164,7 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; while (true) { @@ -1217,7 +1213,7 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; size_type probe_step = 0; while (true) {