xarray-contrib · brendancol · Jun 2, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/xrspatial/focal.py b/xrspatial/focal.py
@@ -905,34 +905,56 @@ def _focal_variety_cuda(data, kernel, out):
     dr = kernel.shape[0] // 2
     dc = kernel.shape[1] // 2
 
-    # Local buffer for up to 25 unique values (covers kernels up to 5x5).
-    # For larger kernels the buffer simply fills and stops counting,
-    # which is an acceptable trade-off for GPU register pressure.
-    MAX_UNIQ = 25
-    buf = cuda.local.array(MAX_UNIQ, nb.float32)
+    krows = kernel.shape[0]
+    kcols = kernel.shape[1]
+
+    # Count distinct non-NaN values without a scratch buffer: for each valid
+    # cell, scan only the earlier cells in the same window (row-major order).
+    # If none of them equals the current value, this is its first occurrence.
+    # This is O(window^2) per pixel but needs no cuda.local.array, so it works
+    # for arbitrary kernel sizes and matches the CPU implementation exactly.
     count = 0
 
-    for k in range(kernel.shape[0]):
-        for h in range(kernel.shape[1]):
+    for k in range(krows):
+        for h in range(kcols):
             if kernel[k, h] == 0:
                 continue
 
             ii = i + k - dr
             jj = j + h - dc
 
-            if 0 <= ii < rows and 0 <= jj < cols:
-                v = data[ii, jj]
-                if v != v:  # NaN check (NaN != NaN)
-                    continue
-                # check if already in buffer
-                found = False
-                for u in range(count):
-                    if buf[u] == v:
-                        found = True
+            if not (0 <= ii < rows and 0 <= jj < cols):
+                continue
+            v = data[ii, jj]
+            if v != v:  # NaN check (NaN != NaN)
+                continue
+
+            # Scan earlier kernel cells (flattened index < target).
+            target = k * kcols + h
+            first = True
+            for pk in range(krows):
+                if pk * kcols >= target:
+                    break
+                for ph in range(kcols):
+                    if pk * kcols + ph >= target:
                         break
-                if not found and count < MAX_UNIQ:
-                    buf[count] = v
-                    count += 1
+                    if kernel[pk, ph] == 0:
+                        continue
+                    pii = i + pk - dr
+                    pjj = j + ph - dc
+                    if not (0 <= pii < rows and 0 <= pjj < cols):
+                        continue
+                    pv = data[pii, pjj]
+                    if pv != pv:
+                        continue
+                    if pv == v:
+                        first = False
+                        break
+                if not first:
+                    break
+
+            if first:
+                count += 1
 
     if count == 0:
         out[i, j] = math.nan

diff --git a/xrspatial/tests/test_focal.py b/xrspatial/tests/test_focal.py
@@ -736,6 +736,47 @@ def test_variety_single_cell():
     assert result.sel(stats='variety').values.item() == 1.0
 
 
+def _all_unique_window(n):
+    """(n x n) raster of all-distinct values with an (n x n) all-ones kernel.
+
+    The center pixel's window is the whole raster, so its variety equals n*n.
+    """
+    data = np.arange(n * n, dtype=np.float64).reshape(n, n)
+    kernel = np.ones((n, n))
+    return data, kernel
+
+
+@pytest.mark.parametrize("n", [7, 9])
+def test_variety_large_kernel_numpy(n):
+    """Variety must not cap below the true distinct count for large kernels."""
+    data, kernel = _all_unique_window(n)
+    agg = create_test_raster(data)
+    result = focal_stats(agg, kernel, stats_funcs=['variety'])
+    vals = result.sel(stats='variety').values
+    center = n // 2
+    assert vals[center, center] == float(n * n)
+
+
+@cuda_and_cupy_available
+@pytest.mark.parametrize("n", [7, 9])
+def test_variety_gpu_large_kernel_parity(n):
+    """GPU variety must match numpy for kernels larger than 5x5 (#2775).
+
+    The old CUDA kernel capped unique counts at 25, so a 7x7 all-unique
+    window returned 25 on GPU vs 49 on CPU. The center pixel's window covers
+    the whole raster, so its variety equals n*n (49 for 7x7, 81 for 9x9).
+    """
+    data, kernel = _all_unique_window(n)
+    np_agg = create_test_raster(data)
+    cupy_agg = create_test_raster(data, backend='cupy')
+    np_result = focal_stats(np_agg, kernel, stats_funcs=['variety'])
+    cupy_result = focal_stats(cupy_agg, kernel, stats_funcs=['variety'])
+    center = n // 2
+    assert cupy_result.sel(stats='variety').data.get()[center, center] == float(n * n)
+    np.testing.assert_allclose(
+        np_result.values, cupy_result.data.get(), equal_nan=True)
+
+
 @pytest.fixture
 def data_hotspots():
     data = np.asarray([