Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 41 additions & 19 deletions xrspatial/focal.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,34 +905,56 @@ def _focal_variety_cuda(data, kernel, out):
dr = kernel.shape[0] // 2
dc = kernel.shape[1] // 2

# Local buffer for up to 25 unique values (covers kernels up to 5x5).
# For larger kernels the buffer simply fills and stops counting,
# which is an acceptable trade-off for GPU register pressure.
MAX_UNIQ = 25
buf = cuda.local.array(MAX_UNIQ, nb.float32)
krows = kernel.shape[0]
kcols = kernel.shape[1]

# Count distinct non-NaN values without a scratch buffer: for each valid
# cell, scan only the earlier cells in the same window (row-major order).
# If none of them equals the current value, this is its first occurrence.
# This is O(window^2) per pixel but needs no cuda.local.array, so it works
# for arbitrary kernel sizes and matches the CPU implementation exactly.
count = 0

for k in range(kernel.shape[0]):
for h in range(kernel.shape[1]):
for k in range(krows):
for h in range(kcols):
if kernel[k, h] == 0:
continue

ii = i + k - dr
jj = j + h - dc

if 0 <= ii < rows and 0 <= jj < cols:
v = data[ii, jj]
if v != v: # NaN check (NaN != NaN)
continue
# check if already in buffer
found = False
for u in range(count):
if buf[u] == v:
found = True
if not (0 <= ii < rows and 0 <= jj < cols):
continue
v = data[ii, jj]
if v != v: # NaN check (NaN != NaN)
continue

# Scan earlier kernel cells (flattened index < target).
target = k * kcols + h
first = True
for pk in range(krows):
if pk * kcols >= target:
break
for ph in range(kcols):
if pk * kcols + ph >= target:
break
if not found and count < MAX_UNIQ:
buf[count] = v
count += 1
if kernel[pk, ph] == 0:
continue
pii = i + pk - dr
pjj = j + ph - dc
if not (0 <= pii < rows and 0 <= pjj < cols):
continue
pv = data[pii, pjj]
if pv != pv:
continue
if pv == v:
first = False
break
if not first:
break

if first:
count += 1

if count == 0:
out[i, j] = math.nan
Expand Down
41 changes: 41 additions & 0 deletions xrspatial/tests/test_focal.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,6 +736,47 @@ def test_variety_single_cell():
assert result.sel(stats='variety').values.item() == 1.0


def _all_unique_window(n):
"""(n x n) raster of all-distinct values with an (n x n) all-ones kernel.

The center pixel's window is the whole raster, so its variety equals n*n.
"""
data = np.arange(n * n, dtype=np.float64).reshape(n, n)
kernel = np.ones((n, n))
return data, kernel


@pytest.mark.parametrize("n", [7, 9])
def test_variety_large_kernel_numpy(n):
"""Variety must not cap below the true distinct count for large kernels."""
data, kernel = _all_unique_window(n)
agg = create_test_raster(data)
result = focal_stats(agg, kernel, stats_funcs=['variety'])
vals = result.sel(stats='variety').values
center = n // 2
assert vals[center, center] == float(n * n)


@cuda_and_cupy_available
@pytest.mark.parametrize("n", [7, 9])
def test_variety_gpu_large_kernel_parity(n):
"""GPU variety must match numpy for kernels larger than 5x5 (#2775).

The old CUDA kernel capped unique counts at 25, so a 7x7 all-unique
window returned 25 on GPU vs 49 on CPU. The center pixel's window covers
the whole raster, so its variety equals n*n (49 for 7x7, 81 for 9x9).
"""
data, kernel = _all_unique_window(n)
np_agg = create_test_raster(data)
cupy_agg = create_test_raster(data, backend='cupy')
np_result = focal_stats(np_agg, kernel, stats_funcs=['variety'])
cupy_result = focal_stats(cupy_agg, kernel, stats_funcs=['variety'])
center = n // 2
assert cupy_result.sel(stats='variety').data.get()[center, center] == float(n * n)
np.testing.assert_allclose(
np_result.values, cupy_result.data.get(), equal_nan=True)


@pytest.fixture
def data_hotspots():
data = np.asarray([
Expand Down
Loading