From db0ee82861bb6f337e7e0a3153f02d54d02cc854 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 1 Jun 2026 10:44:41 -0700 Subject: [PATCH 1/2] Fix GPU variety undercount for kernels larger than 5x5 (#2775) The CUDA variety kernel used a fixed 25-element cuda.local.array, so it silently capped unique-value counts at 25. A 7x7 all-unique window returned 25 on GPU versus 49 on CPU. Rewrite the kernel to count distinct values without a scratch buffer: for each valid non-NaN cell, scan only the earlier cells in the same window and increment the count when no earlier cell matches. This drops the cap and the register-pressure concern, and matches the CPU implementation for arbitrary kernel sizes. Add test_variety_gpu_large_kernel_parity asserting cupy matches numpy on 7x7 and 9x9 all-unique windows, plus a numpy-only large-kernel test that runs without a GPU. --- xrspatial/focal.py | 57 +++++++++++++++++++++++------------ xrspatial/tests/test_focal.py | 41 +++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/xrspatial/focal.py b/xrspatial/focal.py index 2cb276fb..9f9ac55d 100644 --- a/xrspatial/focal.py +++ b/xrspatial/focal.py @@ -905,34 +905,53 @@ def _focal_variety_cuda(data, kernel, out): dr = kernel.shape[0] // 2 dc = kernel.shape[1] // 2 - # Local buffer for up to 25 unique values (covers kernels up to 5x5). - # For larger kernels the buffer simply fills and stops counting, - # which is an acceptable trade-off for GPU register pressure. - MAX_UNIQ = 25 - buf = cuda.local.array(MAX_UNIQ, nb.float32) + krows = kernel.shape[0] + kcols = kernel.shape[1] + + # Count distinct non-NaN values without a scratch buffer: for each valid + # cell, scan only the earlier cells in the same window (row-major order). + # If none of them equals the current value, this is its first occurrence. + # This is O(window^2) per pixel but needs no cuda.local.array, so it works + # for arbitrary kernel sizes and matches the CPU implementation exactly. count = 0 - for k in range(kernel.shape[0]): - for h in range(kernel.shape[1]): + for k in range(krows): + for h in range(kcols): if kernel[k, h] == 0: continue ii = i + k - dr jj = j + h - dc - if 0 <= ii < rows and 0 <= jj < cols: - v = data[ii, jj] - if v != v: # NaN check (NaN != NaN) - continue - # check if already in buffer - found = False - for u in range(count): - if buf[u] == v: - found = True + if not (0 <= ii < rows and 0 <= jj < cols): + continue + v = data[ii, jj] + if v != v: # NaN check (NaN != NaN) + continue + + # Scan earlier kernel cells (flattened index < k * kcols + h). + first = True + for pk in range(krows): + for ph in range(kcols): + if pk * kcols + ph >= k * kcols + h: + break + if kernel[pk, ph] == 0: + continue + pii = i + pk - dr + pjj = j + ph - dc + if not (0 <= pii < rows and 0 <= pjj < cols): + continue + pv = data[pii, pjj] + if pv != pv: + continue + if pv == v: + first = False break - if not found and count < MAX_UNIQ: - buf[count] = v - count += 1 + if not first: + break + + if first: + count += 1 if count == 0: out[i, j] = math.nan diff --git a/xrspatial/tests/test_focal.py b/xrspatial/tests/test_focal.py index fdb7c67e..d3b902b3 100644 --- a/xrspatial/tests/test_focal.py +++ b/xrspatial/tests/test_focal.py @@ -736,6 +736,47 @@ def test_variety_single_cell(): assert result.sel(stats='variety').values.item() == 1.0 +def _all_unique_window(n): + """(n x n) raster of all-distinct values with an (n x n) all-ones kernel. + + The center pixel's window is the whole raster, so its variety equals n*n. + """ + data = np.arange(n * n, dtype=np.float64).reshape(n, n) + kernel = np.ones((n, n)) + return data, kernel + + +@pytest.mark.parametrize("n", [7, 9]) +def test_variety_large_kernel_numpy(n): + """Variety must not cap below the true distinct count for large kernels.""" + data, kernel = _all_unique_window(n) + agg = create_test_raster(data) + result = focal_stats(agg, kernel, stats_funcs=['variety']) + vals = result.sel(stats='variety').values + center = n // 2 + assert vals[center, center] == float(n * n) + + +@cuda_and_cupy_available +@pytest.mark.parametrize("n", [7, 9]) +def test_variety_gpu_large_kernel_parity(n): + """GPU variety must match numpy for kernels larger than 5x5 (#2775). + + The old CUDA kernel capped unique counts at 25, so a 7x7 all-unique + window returned 25 on GPU vs 49 on CPU. The center pixel's window covers + the whole raster, so its variety equals n*n (49 for 7x7, 81 for 9x9). + """ + data, kernel = _all_unique_window(n) + np_agg = create_test_raster(data) + cupy_agg = create_test_raster(data, backend='cupy') + np_result = focal_stats(np_agg, kernel, stats_funcs=['variety']) + cupy_result = focal_stats(cupy_agg, kernel, stats_funcs=['variety']) + center = n // 2 + assert cupy_result.sel(stats='variety').data.get()[center, center] == float(n * n) + np.testing.assert_allclose( + np_result.values, cupy_result.data.get(), equal_nan=True) + + @pytest.fixture def data_hotspots(): data = np.asarray([ From 871001680657b64323d019f168162ff8c0c4ae4e Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 1 Jun 2026 10:46:26 -0700 Subject: [PATCH 2/2] Address review nit: exit prior-cell scan early (#2775) Break the outer pk loop once pk*kcols reaches the target flat index so the scan stops at the row boundary instead of re-breaking on the first cell of every later row. No behaviour change; verified by the variety tests including the GPU parity cases. --- xrspatial/focal.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/xrspatial/focal.py b/xrspatial/focal.py index 9f9ac55d..4c4b325b 100644 --- a/xrspatial/focal.py +++ b/xrspatial/focal.py @@ -929,11 +929,14 @@ def _focal_variety_cuda(data, kernel, out): if v != v: # NaN check (NaN != NaN) continue - # Scan earlier kernel cells (flattened index < k * kcols + h). + # Scan earlier kernel cells (flattened index < target). + target = k * kcols + h first = True for pk in range(krows): + if pk * kcols >= target: + break for ph in range(kcols): - if pk * kcols + ph >= k * kcols + h: + if pk * kcols + ph >= target: break if kernel[pk, ph] == 0: continue