diff --git a/xrspatial/focal.py b/xrspatial/focal.py index 2cb276fb..4c4b325b 100644 --- a/xrspatial/focal.py +++ b/xrspatial/focal.py @@ -905,34 +905,56 @@ def _focal_variety_cuda(data, kernel, out): dr = kernel.shape[0] // 2 dc = kernel.shape[1] // 2 - # Local buffer for up to 25 unique values (covers kernels up to 5x5). - # For larger kernels the buffer simply fills and stops counting, - # which is an acceptable trade-off for GPU register pressure. - MAX_UNIQ = 25 - buf = cuda.local.array(MAX_UNIQ, nb.float32) + krows = kernel.shape[0] + kcols = kernel.shape[1] + + # Count distinct non-NaN values without a scratch buffer: for each valid + # cell, scan only the earlier cells in the same window (row-major order). + # If none of them equals the current value, this is its first occurrence. + # This is O(window^2) per pixel but needs no cuda.local.array, so it works + # for arbitrary kernel sizes and matches the CPU implementation exactly. count = 0 - for k in range(kernel.shape[0]): - for h in range(kernel.shape[1]): + for k in range(krows): + for h in range(kcols): if kernel[k, h] == 0: continue ii = i + k - dr jj = j + h - dc - if 0 <= ii < rows and 0 <= jj < cols: - v = data[ii, jj] - if v != v: # NaN check (NaN != NaN) - continue - # check if already in buffer - found = False - for u in range(count): - if buf[u] == v: - found = True + if not (0 <= ii < rows and 0 <= jj < cols): + continue + v = data[ii, jj] + if v != v: # NaN check (NaN != NaN) + continue + + # Scan earlier kernel cells (flattened index < target). + target = k * kcols + h + first = True + for pk in range(krows): + if pk * kcols >= target: + break + for ph in range(kcols): + if pk * kcols + ph >= target: break - if not found and count < MAX_UNIQ: - buf[count] = v - count += 1 + if kernel[pk, ph] == 0: + continue + pii = i + pk - dr + pjj = j + ph - dc + if not (0 <= pii < rows and 0 <= pjj < cols): + continue + pv = data[pii, pjj] + if pv != pv: + continue + if pv == v: + first = False + break + if not first: + break + + if first: + count += 1 if count == 0: out[i, j] = math.nan diff --git a/xrspatial/tests/test_focal.py b/xrspatial/tests/test_focal.py index fdb7c67e..d3b902b3 100644 --- a/xrspatial/tests/test_focal.py +++ b/xrspatial/tests/test_focal.py @@ -736,6 +736,47 @@ def test_variety_single_cell(): assert result.sel(stats='variety').values.item() == 1.0 +def _all_unique_window(n): + """(n x n) raster of all-distinct values with an (n x n) all-ones kernel. + + The center pixel's window is the whole raster, so its variety equals n*n. + """ + data = np.arange(n * n, dtype=np.float64).reshape(n, n) + kernel = np.ones((n, n)) + return data, kernel + + +@pytest.mark.parametrize("n", [7, 9]) +def test_variety_large_kernel_numpy(n): + """Variety must not cap below the true distinct count for large kernels.""" + data, kernel = _all_unique_window(n) + agg = create_test_raster(data) + result = focal_stats(agg, kernel, stats_funcs=['variety']) + vals = result.sel(stats='variety').values + center = n // 2 + assert vals[center, center] == float(n * n) + + +@cuda_and_cupy_available +@pytest.mark.parametrize("n", [7, 9]) +def test_variety_gpu_large_kernel_parity(n): + """GPU variety must match numpy for kernels larger than 5x5 (#2775). + + The old CUDA kernel capped unique counts at 25, so a 7x7 all-unique + window returned 25 on GPU vs 49 on CPU. The center pixel's window covers + the whole raster, so its variety equals n*n (49 for 7x7, 81 for 9x9). + """ + data, kernel = _all_unique_window(n) + np_agg = create_test_raster(data) + cupy_agg = create_test_raster(data, backend='cupy') + np_result = focal_stats(np_agg, kernel, stats_funcs=['variety']) + cupy_result = focal_stats(cupy_agg, kernel, stats_funcs=['variety']) + center = n // 2 + assert cupy_result.sel(stats='variety').data.get()[center, center] == float(n * n) + np.testing.assert_allclose( + np_result.values, cupy_result.data.get(), equal_nan=True) + + @pytest.fixture def data_hotspots(): data = np.asarray([