diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index 799d632d..2fd5bc3e 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -1,5 +1,5 @@ module,last_inspected,oom_verdict,bottleneck,high_count,issue,notes -aspect,2026-03-31T18:00:00Z,SAFE,compute-bound,0,1122,northness/eastness now use da.cos/sin on dask arrays. +aspect,2026-05-29,SAFE,compute-bound,1,2688,"dask+cupy geodesic densified full lat/lon on one GPU at graph build (OOM at scale); fixed via per-block map_blocks cupy conversion. planar/numpy/dask SAFE; geodesic GPU kernel ~184 regs, mitigated by 16x16 blocks." balanced_allocation,2026-04-16T12:00:00Z,WILL OOM,memory-bound,8,1114,"Re-audit 2026-04-16 after PR 1203 float32 fix. 8 HIGH found (friction.compute L339, argmin.compute in iter loop L182, double all_nan recompute L206, stacked cost_surfaces allocation). Covered by existing documented limitation on #1114. Not refiled." bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation). diff --git a/xrspatial/aspect.py b/xrspatial/aspect.py index 01f7e95b..dedadb33 100644 --- a/xrspatial/aspect.py +++ b/xrspatial/aspect.py @@ -287,11 +287,21 @@ def _run_dask_numpy_geodesic(data, lat_2d, lon_2d, a2, b2, z_factor, boundary='n return out[0] +def _to_cupy_f64(block): + # Only reached from the dask+cupy path, so `cupy` is the real module here, + # never the import-time fallback class. + return cupy.asarray(block, dtype=cupy.float64) + + def _run_dask_cupy_geodesic(data, lat_2d, lon_2d, a2, b2, z_factor, boundary='nan'): - lat_dask = da.from_array(cupy.asarray(lat_2d, dtype=cupy.float64), - chunks=data.chunksize) - lon_dask = da.from_array(cupy.asarray(lon_2d, dtype=cupy.float64), - chunks=data.chunksize) + # Keep lat/lon as dask-of-numpy on the (zero-stride) broadcast views, then + # convert each block to cupy lazily. Converting up front with + # cupy.asarray(lat_2d) would densify the full (H, W) grid onto a single GPU + # at graph-construction time and OOM on large rasters. + lat_dask = da.from_array(lat_2d, chunks=data.chunksize).map_blocks( + _to_cupy_f64, dtype=np.float64) + lon_dask = da.from_array(lon_2d, chunks=data.chunksize).map_blocks( + _to_cupy_f64, dtype=np.float64) stacked = da.stack([ data.astype(cupy.float64), lat_dask, diff --git a/xrspatial/tests/test_geodesic_aspect.py b/xrspatial/tests/test_geodesic_aspect.py index 00aff14a..44b8c09c 100644 --- a/xrspatial/tests/test_geodesic_aspect.py +++ b/xrspatial/tests/test_geodesic_aspect.py @@ -294,3 +294,38 @@ def test_numpy_equals_dask_cupy(self): np.testing.assert_allclose( a_np.values, a_dc.data.compute().get(), rtol=1e-5, equal_nan=True ) + + def test_latlon_not_materialized_on_gpu_at_graph_build(self): + """The dask+cupy geodesic path must keep lat/lon chunked. + + Building the graph (no compute) for a large raster must not densify + the full (H, W) lat/lon grids onto the GPU. Converting the broadcast + views with ``cupy.asarray`` up front would allocate ~2*H*W*8 bytes of + GPU memory at graph-construction time and OOM on large rasters. + """ + import cupy + + H = W = 2048 + elev = cupy.zeros((H, W), dtype=cupy.float64) + lat = np.linspace(40.0, 41.0, H) + lon = np.linspace(10.0, 11.0, W) + raster = xr.DataArray( + da.from_array(elev, chunks=(256, 256)), + dims=['lat', 'lon'], + coords={'lat': lat, 'lon': lon}, + ) + + pool = cupy.get_default_memory_pool() + pool.free_all_blocks() + before = pool.used_bytes() + out = aspect(raster, method='geodesic') # graph construction only + out.data.__dask_graph__() + delta = pool.used_bytes() - before + + # A single full lat or lon grid is H*W*8 bytes. If either were + # densified eagerly the delta would be at least that large. + one_full_grid = H * W * 8 + assert delta < one_full_grid, ( + f"graph construction allocated {delta} GPU bytes; expected well " + f"under one full lat/lon grid ({one_full_grid} bytes)" + )