From 5987137363bc4f53ee9ac7d84012035fd090ea8e Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Sat, 28 Feb 2026 02:13:27 +0100 Subject: [PATCH 01/32] Add initial tests for remote storage workflows with UPath --- tests/io/test_remote_storage.py | 185 ++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 tests/io/test_remote_storage.py diff --git a/tests/io/test_remote_storage.py b/tests/io/test_remote_storage.py new file mode 100644 index 000000000..c24f1bcd1 --- /dev/null +++ b/tests/io/test_remote_storage.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import pytest +from upath import UPath + +from spatialdata import SpatialData +from spatialdata.testing import assert_spatial_data_objects_are_identical + +# Azure emulator connection string (Azurite default) +# Source: https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string +AZURE_CONNECTION_STRING = ( + "DefaultEndpointsProtocol=http;" + "AccountName=devstoreaccount1;" + "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" +) + + +def _get_azure_upath(container: str = "test-container", path: str = "test.zarr") -> UPath: + """Create Azure UPath for testing with Azurite (local emulator).""" + return UPath(f"az://{container}/{path}", connection_string=AZURE_CONNECTION_STRING) + + +def _get_s3_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: + """Create S3 UPath for testing. + + Uses anon=True for public buckets. For private buckets with moto (local S3 emulator), + would use: endpoint_url="http://127.0.0.1:5555/", AWS_ACCESS_KEY_ID="testing", etc. 
+ """ + return UPath(f"s3://{container}/{path}", anon=True) + + +def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: + """Create GCS UPath for testing with fake-gcs-server (local GCS emulator).""" + return UPath(f"gs://{container}/{path}", endpoint_url="http://localhost:4443") + + +# Shared parametrization for remote storage backends (azure, s3, gcs). +GET_UPATH_PARAMS = pytest.mark.parametrize( + "get_upath", [_get_azure_upath, _get_s3_upath, _get_gcs_upath], ids=["azure", "s3", "gcs"] +) +REMOTE_STORAGE_PARAMS = pytest.mark.parametrize( + "get_upath,storage_name", + [(_get_azure_upath, "azure"), (_get_s3_upath, "s3"), (_get_gcs_upath, "gcs")], + ids=["azure", "s3", "gcs"], +) + + +def _assert_read_identical(expected: SpatialData, upath: UPath, *, check_path: bool = True) -> None: + """Read SpatialData from upath and assert it equals expected; optionally assert path.""" + sdata_read = SpatialData.read(upath) + if check_path: + assert isinstance(sdata_read.path, UPath) + assert sdata_read.path == upath + assert_spatial_data_objects_are_identical(expected, sdata_read) + + +class TestPathSetter: + """Test SpatialData.path setter with UPath objects.""" + + @GET_UPATH_PARAMS + def test_path_setter_accepts_upath(self, get_upath) -> None: + """Test that SpatialData.path setter accepts UPath for remote storage. + + This test fails, reproducing issue #441: SpatialData.path setter only accepts + None | str | Path, not UPath, preventing the use of remote storage. + """ + sdata = SpatialData() + upath = get_upath() + sdata.path = upath + assert sdata.path == upath + + @GET_UPATH_PARAMS + def test_write_with_upath_sets_path(self, get_upath) -> None: + """Test that writing to UPath sets SpatialData.path correctly. + + This test fails because SpatialData.write() rejects UPath in + _validate_can_safely_write_to_path() before it can set sdata.path. 
+ """ + sdata = SpatialData() + upath = get_upath() + sdata.write(upath) + assert isinstance(sdata.path, UPath) + + def test_path_setter_rejects_other_types(self) -> None: + """Test that SpatialData.path setter rejects other types.""" + sdata = SpatialData() + + with pytest.raises(TypeError, match="Path must be.*str.*Path"): + sdata.path = 123 + + with pytest.raises(TypeError, match="Path must be.*str.*Path"): + sdata.path = {"not": "a path"} + + +class TestRemoteStorage: + """Test end-to-end remote storage workflows with UPath. + + Note: These tests require appropriate emulators running (Azurite for Azure, + moto for S3, fake-gcs-server for GCS). Tests will fail if emulators are not available. + """ + + @REMOTE_STORAGE_PARAMS + def test_write_read_roundtrip_remote( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test writing and reading SpatialData to/from remote storage. + + This test verifies the full workflow: + 1. Write SpatialData to remote storage using UPath + 2. Read SpatialData from remote storage using UPath + 3. Verify data integrity (round-trip) + """ + upath = get_upath(container=f"test-{storage_name}", path=f"roundtrip-{id(full_sdata)}.zarr") + + full_sdata.write(upath, overwrite=True) + assert isinstance(full_sdata.path, UPath) + assert full_sdata.path == upath + + _assert_read_identical(full_sdata, upath) + + @REMOTE_STORAGE_PARAMS + def test_path_setter_with_remote_then_operations( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test setting remote path, then performing operations. + + This test verifies that after setting a remote path: + 1. Path is correctly stored + 2. Write operations work + 3. 
Read operations work + """ + upath = get_upath(container=f"test-{storage_name}", path=f"operations-{id(full_sdata)}.zarr") + + full_sdata.path = upath + assert full_sdata.path == upath + assert full_sdata.is_backed() is True + + full_sdata.write(overwrite=True) + assert full_sdata.path == upath + + _assert_read_identical(full_sdata, upath) + + @REMOTE_STORAGE_PARAMS + def test_overwrite_existing_remote_data( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test overwriting existing data in remote storage. + + Verifies that overwriting existing remote data works (path-exists handling) + and data integrity after overwrite. Round-trip is covered by + test_write_read_roundtrip_remote. + """ + upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{id(full_sdata)}.zarr") + + full_sdata.write(upath, overwrite=True) + full_sdata.write(upath, overwrite=True) + _assert_read_identical(full_sdata, upath, check_path=False) + + @REMOTE_STORAGE_PARAMS + def test_write_element_to_remote_storage( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test writing individual elements to remote storage using write_element(). + + This test verifies that: + 1. Setting path to remote UPath works + 2. write_element() works with remote storage + 3. 
Written elements can be read back correctly + """ + upath = get_upath(container=f"test-{storage_name}", path=f"write-element-{id(full_sdata)}.zarr") + + # Create empty SpatialData and write to remote storage + empty_sdata = SpatialData() + empty_sdata.write(upath, overwrite=True) + + # Set path and write individual elements + full_sdata.path = upath + assert full_sdata.path == upath + + # Write each element type individually + for element_type, element_name, _ in full_sdata.gen_elements(): + full_sdata.write_element(element_name, overwrite=True) + + _assert_read_identical(full_sdata, upath, check_path=False) From 865eb76eb2afa1a8cb7d4502a230becb00dd3afa Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:21:53 +0100 Subject: [PATCH 02/32] io: add dask.array.to_zarr compat for ome_zarr kwargs Patch da.to_zarr so ome_zarr's **kwargs are forwarded as zarr_array_kwargs, avoiding FutureWarning and keeping behavior correct. --- src/spatialdata/_io/__init__.py | 2 + src/spatialdata/_io/_dask_zarr_compat.py | 52 ++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 src/spatialdata/_io/_dask_zarr_compat.py diff --git a/src/spatialdata/_io/__init__.py b/src/spatialdata/_io/__init__.py index 38ff8c6bb..9e4b11de1 100644 --- a/src/spatialdata/_io/__init__.py +++ b/src/spatialdata/_io/__init__.py @@ -1,5 +1,7 @@ from __future__ import annotations +# Patch da.to_zarr so ome_zarr's **kwargs are passed as zarr_array_kwargs (avoids FutureWarning) +import spatialdata._io._dask_zarr_compat # noqa: F401 from spatialdata._io._utils import get_dask_backing_files from spatialdata._io.format import SpatialDataFormatType from spatialdata._io.io_points import write_points diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py new file mode 100644 index 000000000..350207056 --- /dev/null +++ b/src/spatialdata/_io/_dask_zarr_compat.py @@ -0,0 +1,52 @@ +"""Compatibility layer for dask.array.to_zarr when callers 
pass array options via **kwargs. + +ome_zarr.writer calls da.to_zarr(..., **options) with array options (compressor, dimension_names, +etc.). Dask deprecated **kwargs in favor of zarr_array_kwargs. This module patches da.to_zarr to +forward such kwargs into zarr_array_kwargs (excluding dask-internal keys like zarr_format that +zarr.Group.create_array() does not accept), avoiding the FutureWarning and keeping behavior correct. +""" + +from __future__ import annotations + +import dask.array as _da + +_orig_to_zarr = _da.to_zarr + +# Keys from ome_zarr/dask **kwargs that must not be passed to zarr.Group.create_array() +_DASK_INTERNAL_KEYS = frozenset({"zarr_format"}) + + +def _to_zarr( + arr, + url, + component=None, + storage_options=None, + region=None, + compute=True, + return_stored=False, + zarr_array_kwargs=None, + zarr_read_kwargs=None, + **kwargs, +): + """Forward deprecated **kwargs into zarr_array_kwargs, excluding _DASK_INTERNAL_KEYS.""" + if kwargs: + zarr_array_kwargs = dict(zarr_array_kwargs) if zarr_array_kwargs else {} + for k, v in kwargs.items(): + if k not in _DASK_INTERNAL_KEYS: + zarr_array_kwargs[k] = v + kwargs = {} + return _orig_to_zarr( + arr, + url, + component=component, + storage_options=storage_options, + region=region, + compute=compute, + return_stored=return_stored, + zarr_array_kwargs=zarr_array_kwargs, + zarr_read_kwargs=zarr_read_kwargs, + **kwargs, + ) + + +_da.to_zarr = _to_zarr From 2134386ca62aadcf9d6a24c132c3b8d54e1a9b5f Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:22:14 +0100 Subject: [PATCH 03/32] io: add remote storage helpers in _utils - _FsspecStoreRoot, _get_store_root for path-like store roots (local + fsspec) - _storage_options_from_fs for parquet writes to Azure/S3/GCS - _remote_zarr_store_exists, _ensure_async_fs for UPath/FsspecStore - Extend _resolve_zarr_store for UPath and _FsspecStoreRoot with async fs - _backed_elements_contained_in_path, _is_element_self_contained accept UPath --- 
src/spatialdata/_io/_utils.py | 139 ++++++++++++++++++++++++++++++++-- 1 file changed, 132 insertions(+), 7 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 6690d1118..747d8ed7b 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import filecmp +import json import os.path import re import sys @@ -23,6 +24,7 @@ from upath import UPath from upath.implementations.local import PosixUPath, WindowsUPath from xarray import DataArray, DataTree +from zarr.errors import GroupNotFoundError from zarr.storage import FsspecStore, LocalStore from spatialdata._core.spatialdata import SpatialData @@ -38,6 +40,74 @@ from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes +class _FsspecStoreRoot: + """Path-like root for FsspecStore (no .root attribute); supports __truediv__ and str() as full URL.""" + + __slots__ = ("_store", "_path") + + def __init__(self, store: FsspecStore, path: str | None = None) -> None: + self._store = store + self._path = (path or store.path).rstrip("/") + + def __truediv__(self, other: str | Path) -> _FsspecStoreRoot: + return _FsspecStoreRoot(self._store, self._path + "/" + str(other).lstrip("/")) + + def __str__(self) -> str: + protocol = getattr(self._store.fs, "protocol", None) + if isinstance(protocol, (list, tuple)): + protocol = protocol[0] if protocol else "file" + elif protocol is None: + protocol = "file" + return f"{protocol}://{self._path}" + + def __fspath__(self) -> str: + return str(self) + + +def _storage_options_from_fs(fs: Any) -> dict[str, Any]: + """Build storage_options dict from an fsspec filesystem for use with to_parquet/write_parquet. + + Ensures parquet writes to remote stores (Azure, S3, GCS) use the same credentials as the + zarr store. 
+ """ + out: dict[str, Any] = {} + name = type(fs).__name__ + if name == "AzureBlobFileSystem": + if getattr(fs, "connection_string", None): + out["connection_string"] = fs.connection_string + elif getattr(fs, "account_name", None) and getattr(fs, "account_key", None): + out["account_name"] = fs.account_name + out["account_key"] = fs.account_key + if getattr(fs, "anon", None) is not None: + out["anon"] = fs.anon + elif name in ("S3FileSystem", "MotoS3FS"): + if getattr(fs, "endpoint_url", None): + out["endpoint_url"] = fs.endpoint_url + if getattr(fs, "key", None): + out["key"] = fs.key + if getattr(fs, "secret", None): + out["secret"] = fs.secret + if getattr(fs, "anon", None) is not None: + out["anon"] = fs.anon + elif name == "GCSFileSystem": + if getattr(fs, "token", None) is not None: + out["token"] = fs.token + if getattr(fs, "_endpoint", None): + out["endpoint_url"] = fs._endpoint + if getattr(fs, "project", None): + out["project"] = fs.project + return out + + +def _get_store_root(store: LocalStore | FsspecStore) -> Path | _FsspecStoreRoot: + """Return a path-like root for the store (supports / and str()). 
Use for building paths to parquet etc.""" + if isinstance(store, LocalStore): + return Path(store.root) + if isinstance(store, FsspecStore): + return _FsspecStoreRoot(store) + raise TypeError(f"Unsupported store type: {type(store)}") + + def _get_transformations_from_ngff_dict( list_of_encoded_ngff_transformations: list[dict[str, Any]], ) -> MappingToCoordinateSystem_t: @@ -370,7 +440,9 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No files.append(os.path.realpath(parquet_file)) -def _backed_elements_contained_in_path(path: Path, object: SpatialData | SpatialElement | AnnData) -> list[bool]: +def _backed_elements_contained_in_path( + path: Path | UPath, object: SpatialData | SpatialElement | AnnData +) -> list[bool]: """ Return the list of boolean values indicating if backing files for an object are child directory of a path. @@ -390,8 +462,10 @@ def _backed_elements_contained_in_path(path: Path, object: SpatialData | Spatial If an object does not have a Dask computational graph, it will return an empty list. It is possible for a single SpatialElement to contain multiple files in their Dask computational graph. 
""" + if isinstance(path, UPath): + return [] # no local backing files are "contained" in a remote path if not isinstance(path, Path): - raise TypeError(f"Expected a Path object, got {type(path)}") + raise TypeError(f"Expected a Path or UPath object, got {type(path)}") return [_is_subfolder(parent=path, child=Path(fp)) for fp in get_dask_backing_files(object)] @@ -420,14 +494,58 @@ def _is_subfolder(parent: Path, child: Path) -> bool: def _is_element_self_contained( - element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, element_path: Path + element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, + element_path: Path | UPath, ) -> bool: + if isinstance(element_path, UPath): + return True # treat remote-backed as self-contained for this check if isinstance(element, DaskDataFrame): pass # TODO when running test_save_transformations it seems that for the same element this is called multiple times return all(_backed_elements_contained_in_path(path=element_path, object=element)) +def _is_azure_http_response_error(exc: BaseException) -> bool: + """Return True if exc is an Azure SDK HttpResponseError (e.g. emulator API mismatch).""" + t = type(exc) + return t.__name__ == "HttpResponseError" and (getattr(t, "__module__", "") or "").startswith("azure.") + + +def _remote_zarr_store_exists(store: zarr.storage.StoreLike) -> bool: + """Return True if the store contains a zarr group. Closes the store. Handles Azure emulator errors.""" + try: + zarr.open_group(store, mode="r") + return True + except (GroupNotFoundError, OSError, FileNotFoundError): + return False + except Exception as e: + if _is_azure_http_response_error(e): + return False + raise + finally: + store.close() + + +def _ensure_async_fs(fs: Any) -> Any: + """Return an async fsspec filesystem for use with zarr's FsspecStore. + + Zarr's FsspecStore expects an async filesystem. 
If the given fs is synchronous, + it is converted using fsspec's public API (async instance or AsyncFileSystemWrapper) + so that ZarrUserWarning is not raised. + """ + if getattr(fs, "asynchronous", False): + return fs + import fsspec + + if getattr(fs, "async_impl", False): + fs_dict = json.loads(fs.to_json()) + fs_dict["asynchronous"] = True + return fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + return AsyncFileSystemWrapper(fs, asynchronous=True) + + def _resolve_zarr_store( path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, **kwargs: Any ) -> zarr.storage.StoreLike: @@ -477,17 +595,24 @@ def _resolve_zarr_store( if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group - return FsspecStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) + return FsspecStore( + path.store.path + "/" + path.path, + fs=_ensure_async_fs(path.store.fs), + **kwargs, + ) if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store return path.store.store raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") + if isinstance(path, _FsspecStoreRoot): + # path-like from read_zarr that carries the same fs (preserves Azure/GCS credentials) + return FsspecStore(_ensure_async_fs(path._store.fs), path=path._path, **kwargs) + if isinstance(path, UPath): + # if input is a remote UPath, map it to an FSStore (check before StoreLike to avoid UnionType isinstance) + return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore return FsspecStore(path, **kwargs) - if isinstance(path, UPath): - # if input is a remote UPath, map it to an FSStore - return 
FsspecStore(path.path, fs=path.fs, **kwargs) raise TypeError(f"Unsupported type: {type(path)}") From eee34d8371b13fe105ea8d05a439c46c0f1e3925 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:22:27 +0100 Subject: [PATCH 04/32] core: support UPath for SpatialData.path and write() - path and _path accept Path | UPath; setter allows UPath - write() accepts file_path: str | Path | UPath | None (None uses path) - _validate_can_safely_write_to_path handles UPath and remote store existence - _write_element accepts Path | UPath; skip local subfolder checks for UPath - __repr__ and _get_groups_for_element use path without forcing Path() --- src/spatialdata/_core/spatialdata.py | 71 +++++++++++++++++++--------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 739b225fe..810713d45 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -121,7 +121,7 @@ def __init__( tables: dict[str, AnnData] | Tables | None = None, attrs: Mapping[Any, Any] | None = None, ) -> None: - self._path: Path | None = None + self._path: Path | UPath | None = None self._shared_keys: set[str | None] = set() self._images: Images = Images(shared_keys=self._shared_keys) @@ -548,16 +548,16 @@ def is_backed(self) -> bool: return self.path is not None @property - def path(self) -> Path | None: + def path(self) -> Path | UPath | None: """Path to the Zarr storage.""" return self._path @path.setter - def path(self, value: Path | None) -> None: - if value is None or isinstance(value, str | Path): + def path(self, value: Path | UPath | None) -> None: + if value is None or isinstance(value, (str, Path, UPath)): self._path = value else: - raise TypeError("Path must be `None`, a `str` or a `Path` object.") + raise TypeError("Path must be `None`, a `str`, a `Path` or a `UPath` object.") def locate_element(self, element: SpatialElement) -> list[str]: """ @@ 
-1032,18 +1032,34 @@ def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: def _validate_can_safely_write_to_path( self, - file_path: str | Path, + file_path: str | Path | UPath, overwrite: bool = False, saving_an_element: bool = False, ) -> None: - from spatialdata._io._utils import _backed_elements_contained_in_path, _is_subfolder, _resolve_zarr_store + from spatialdata._io._utils import ( + _backed_elements_contained_in_path, + _is_subfolder, + _remote_zarr_store_exists, + _resolve_zarr_store, + ) if isinstance(file_path, str): file_path = Path(file_path) - if not isinstance(file_path, Path): - raise ValueError(f"file_path must be a string or a Path object, type(file_path) = {type(file_path)}.") + if not isinstance(file_path, (Path, UPath)): + raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") + + if isinstance(file_path, UPath): + store = _resolve_zarr_store(file_path) + if _remote_zarr_store_exists(store) and not overwrite: + raise ValueError( + "The Zarr store already exists. Use `overwrite=True` to try overwriting the store. " + "Please note that only Zarr stores not currently in use by the current SpatialData object can be " + "overwritten." + ) + return + # Local Path: existing logic # TODO: add test for this if os.path.exists(file_path): store = _resolve_zarr_store(file_path) @@ -1072,8 +1088,13 @@ def _validate_can_safely_write_to_path( ERROR_MSG + "\nDetails: the target path contains one or more files that Dask use for " "backing elements in the SpatialData object." 
+ WORKAROUND ) - if self.path is not None and ( - _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + # Subfolder checks only for local paths (Path); skip when self.path is UPath + if ( + self.path is not None + and isinstance(self.path, Path) + and ( + _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + ) ): if saving_an_element and _is_subfolder(parent=self.path, child=file_path): raise ValueError( @@ -1102,7 +1123,7 @@ def _validate_all_elements(self) -> None: @_deprecation_alias(format="sdata_formats", version="0.7.0") def write( self, - file_path: str | Path, + file_path: str | Path | UPath | None = None, overwrite: bool = False, consolidate_metadata: bool = True, update_sdata_path: bool = True, @@ -1115,7 +1136,7 @@ def write( Parameters ---------- file_path - The path to the Zarr store to write to. + The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). overwrite If `True`, overwrite the Zarr store if it already exists. If `False`, `write()` will fail if the Zarr store already exists. 
@@ -1161,8 +1182,13 @@ def write( parsed = _parse_formats(sdata_formats) + if file_path is None: + if self.path is None: + raise ValueError("file_path must be provided when SpatialData.path is not set.") + file_path = self.path if isinstance(file_path, str): file_path = Path(file_path) + # Keep UPath as-is; do not convert to Path self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() @@ -1192,7 +1218,7 @@ def write( def _write_element( self, element: SpatialElement | AnnData, - zarr_container_path: Path, + zarr_container_path: Path | UPath, element_type: str, element_name: str, overwrite: bool, @@ -1201,10 +1227,8 @@ def _write_element( ) -> None: from spatialdata._io.io_zarr import _get_groups_for_element - if not isinstance(zarr_container_path, Path): - raise ValueError( - f"zarr_container_path must be a Path object, type(zarr_container_path) = {type(zarr_container_path)}." - ) + if not isinstance(zarr_container_path, (Path, UPath)): + raise ValueError(f"zarr_container_path must be a Path or UPath, got {type(zarr_container_path).__name__}.") file_path_of_element = zarr_container_path / element_type / element_name self._validate_can_safely_write_to_path( file_path=file_path_of_element, overwrite=overwrite, saving_an_element=True @@ -1489,7 +1513,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # check if the element exists in the Zarr storage if not _group_for_element_exists( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, ): @@ -1503,7 +1527,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # warn the users if the element is not self-contained, that is, it is Dask-backed by files outside the Zarr # group for the element - element_zarr_path = Path(self.path) / element_type / element_name + element_zarr_path = self.path / element_type / element_name if not 
_is_element_self_contained(element=element, element_path=element_zarr_path): logger.info( f"Element {element_type}/{element_name} is not self-contained. The metadata will be" @@ -1544,7 +1568,7 @@ def write_channel_names(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have the check in the conditional if element_type == "images" and self.path is not None: _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), element_type=element_type, element_name=element_name, use_consolidated=False + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False ) from spatialdata._io._utils import overwrite_channel_names @@ -1588,7 +1612,7 @@ def write_transformations(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have a conditional assert self.path is not None _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False, @@ -1956,7 +1980,8 @@ def h(s: str) -> str: descr = "SpatialData object" if self.path is not None: - descr += f", with associated Zarr store: {self.path.resolve()}" + path_descr = str(self.path) if isinstance(self.path, UPath) else self.path.resolve() + descr += f", with associated Zarr store: {path_descr}" non_empty_elements = self._non_empty_elements() last_element_index = len(non_empty_elements) - 1 From 40af32757b2e2d10d8f0fdd408bfd5b6b8933304 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:28:22 +0100 Subject: [PATCH 05/32] io: use resolved store and remote parquet in points, raster, shapes, table, zarr - Resolve store via _resolve_zarr_store in read paths (points, shapes, raster, table) - Use _get_store_root for parquet paths; read/write parquet with storage_options for fsspec - io_shapes: upload parquet to Azure/S3/GCS via temp file when path is 
_FsspecStoreRoot - io_zarr: _get_store_root, UPath in _get_groups_for_element and _write_consolidated_metadata; set sdata.path to UPath when store is remote --- src/spatialdata/_io/io_points.py | 21 +++++-- src/spatialdata/_io/io_raster.py | 5 +- src/spatialdata/_io/io_shapes.py | 94 ++++++++++++++++++++++++++++++-- src/spatialdata/_io/io_table.py | 6 +- src/spatialdata/_io/io_zarr.py | 27 +++++---- 5 files changed, 127 insertions(+), 26 deletions(-) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index b47fc418c..e41273dcb 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -8,7 +8,11 @@ from ome_zarr.format import Format from spatialdata._io._utils import ( + _FsspecStoreRoot, + _get_store_root, _get_transformations_from_ngff_dict, + _resolve_zarr_store, + _storage_options_from_fs, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -24,17 +28,21 @@ def _read_points( store: str | Path, ) -> DaskDataFrame: """Read points from a zarr store.""" - f = zarr.open(store, mode="r") + resolved_store = _resolve_zarr_store(store) + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None points_format = PointsFormats[version] - store_root = f.store_path.store.root + store_root = _get_store_root(f.store_path.store) path = store_root / f.path / "points.parquet" # cache on remote file needed for parquet reader to work # TODO: allow reading in the metadata without caching all the data - points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) + if isinstance(path, _FsspecStoreRoot): + points = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + else: + points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) assert isinstance(points, DaskDataFrame) transformations = 
_get_transformations_from_ngff_dict(f.attrs.asdict()["coordinateTransformations"]) @@ -68,7 +76,7 @@ def write_points( axes = get_axes_names(points) transformations = _get_transformations(points) - store_root = group.store_path.store.root + store_root = _get_store_root(group.store_path.store) path = store_root / group.path / "points.parquet" # The following code iterates through all columns in the 'points' DataFrame. If the column's datatype is @@ -84,7 +92,10 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - points_without_transform.to_parquet(path) + storage_options: dict = {} + if isinstance(path, _FsspecStoreRoot): + storage_options = _storage_options_from_fs(path._store.fs) + points_without_transform.to_parquet(str(path), storage_options=storage_options or None) attrs = element_format.attrs_to_dict(points.attrs) attrs["version"] = element_format.spatialdata_format_version diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index df7e1cb8f..767232fdd 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -19,6 +19,7 @@ from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, overwrite_coordinate_transformations_raster, ) from spatialdata._io.format import ( @@ -41,11 +42,11 @@ def _read_multiscale( store: str | Path, raster_type: Literal["image", "labels"], reader_format: Format ) -> DataArray | DataTree: - assert isinstance(store, str | Path) assert raster_type in ["image", "labels"] + resolved_store = _resolve_zarr_store(store) nodes: list[Node] = [] - image_loc = ZarrLocation(store, fmt=reader_format) + image_loc = ZarrLocation(resolved_store, fmt=reader_format) if exists := image_loc.exists(): image_reader = Reader(image_loc)() image_nodes = list(image_reader) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index b07256273..adf4716f3 100644 --- 
a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -1,5 +1,8 @@ from __future__ import annotations +import contextlib +import os +import tempfile from pathlib import Path from typing import Any, Literal @@ -11,7 +14,11 @@ from shapely import from_ragged_array, to_ragged_array from spatialdata._io._utils import ( + _FsspecStoreRoot, + _get_store_root, _get_transformations_from_ngff_dict, + _resolve_zarr_store, + _storage_options_from_fs, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -34,7 +41,8 @@ def _read_shapes( store: str | Path, ) -> GeoDataFrame: """Read shapes from a zarr store.""" - f = zarr.open(store, mode="r") + resolved_store = _resolve_zarr_store(store) + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None shape_format = ShapesFormats[version] @@ -54,9 +62,12 @@ def _read_shapes( geometry = from_ragged_array(typ, coords, offsets) geo_df = GeoDataFrame({"geometry": geometry}, index=index) elif isinstance(shape_format, ShapesFormatV02 | ShapesFormatV03): - store_root = f.store_path.store.root - path = Path(store_root) / f.path / "shapes.parquet" - geo_df = read_parquet(path) + store_root = _get_store_root(f.store_path.store) + path = store_root / f.path / "shapes.parquet" + if isinstance(path, _FsspecStoreRoot): + geo_df = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + else: + geo_df = read_parquet(path) else: raise ValueError( f"Unsupported shapes format {shape_format} from version {version}. Please update the spatialdata library." 
@@ -150,6 +161,67 @@ def _write_shapes_v01(shapes: GeoDataFrame, group: zarr.Group, element_format: F return attrs +def _parse_fsspec_remote_path(path: _FsspecStoreRoot) -> tuple[str, str]: + """Return (bucket_or_container, blob_key) from an fsspec store path.""" + remote = str(path) + if "://" in remote: + remote = remote.split("://", 1)[1] + parts = remote.split("/", 1) + bucket_or_container = parts[0] + blob_key = parts[1] if len(parts) > 1 else "" + return bucket_or_container, blob_key + + +def _upload_parquet_to_azure(tmp_path: str, bucket: str, key: str, fs: Any) -> None: + from azure.storage.blob import BlobServiceClient + + client = BlobServiceClient.from_connection_string(fs.connection_string) + blob_client = client.get_blob_client(container=bucket, blob=key) + with open(tmp_path, "rb") as f: + blob_client.upload_blob(f, overwrite=True) + + +def _upload_parquet_to_s3(tmp_path: str, bucket: str, key: str, fs: Any) -> None: + import boto3 + + endpoint = getattr(fs, "endpoint_url", None) or os.environ.get("AWS_ENDPOINT_URL") + s3 = boto3.client( + "s3", + endpoint_url=endpoint, + aws_access_key_id=getattr(fs, "key", None) or os.environ.get("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=getattr(fs, "secret", None) or os.environ.get("AWS_SECRET_ACCESS_KEY"), + region_name=os.environ.get("AWS_DEFAULT_REGION", "us-east-1"), + ) + s3.upload_file(tmp_path, bucket, key) + + +def _upload_parquet_to_gcs(tmp_path: str, bucket: str, key: str, fs: Any) -> None: + from google.auth.credentials import AnonymousCredentials + from google.cloud import storage + + client = storage.Client( + credentials=AnonymousCredentials(), + project=getattr(fs, "project", None) or "test", + ) + blob = client.bucket(bucket).blob(key) + blob.upload_from_filename(tmp_path) + + +def _upload_parquet_to_fsspec(path: _FsspecStoreRoot, tmp_path: str) -> None: + """Upload local parquet file to remote fsspec store using sync APIs to avoid event-loop issues.""" + fs = path._store.fs + bucket, key = 
_parse_fsspec_remote_path(path) + fs_name = type(fs).__name__ + if fs_name == "AzureBlobFileSystem" and getattr(fs, "connection_string", None): + _upload_parquet_to_azure(tmp_path, bucket, key, fs) + elif fs_name in ("S3FileSystem", "MotoS3FS"): + _upload_parquet_to_s3(tmp_path, bucket, key, fs) + elif fs_name == "GCSFileSystem": + _upload_parquet_to_gcs(tmp_path, bucket, key, fs) + else: + fs.put(tmp_path, str(path)) + + def _write_shapes_v02_v03( shapes: GeoDataFrame, group: zarr.Group, element_format: Format, geometry_encoding: Literal["WKB", "geoarrow"] ) -> Any: @@ -169,13 +241,23 @@ def _write_shapes_v02_v03( """ from spatialdata.models._utils import TRANSFORM_KEY - store_root = group.store_path.store.root + store_root = _get_store_root(group.store_path.store) path = store_root / group.path / "shapes.parquet" # Temporarily remove transformations from attrs to avoid serialization issues transforms = shapes.attrs[TRANSFORM_KEY] del shapes.attrs[TRANSFORM_KEY] - shapes.to_parquet(path, geometry_encoding=geometry_encoding) + if isinstance(path, _FsspecStoreRoot): + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: + tmp_path = tmp.name + try: + shapes.to_parquet(tmp_path, geometry_encoding=geometry_encoding) + _upload_parquet_to_fsspec(path, tmp_path) + finally: + with contextlib.suppress(OSError): + os.unlink(tmp_path) + else: + shapes.to_parquet(path, geometry_encoding=geometry_encoding) shapes.attrs[TRANSFORM_KEY] = transforms attrs = element_format.attrs_to_dict(shapes.attrs) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 8cd7b8385..03ec78526 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -9,6 +9,7 @@ from anndata._io.specs import write_elem as write_adata from ome_zarr.format import Format +from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import ( CurrentTablesFormat, TablesFormats, @@ -20,9 +21,10 @@ def _read_table(store: 
str | Path) -> AnnData: - table = read_anndata_zarr(str(store)) + resolved_store = _resolve_zarr_store(store) + table = read_anndata_zarr(resolved_store) - f = zarr.open(store, mode="r") + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=False) assert version is not None table_format = TablesFormats[version] diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 4c410fab0..48795513c 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import warnings from collections.abc import Callable from json import JSONDecodeError @@ -19,6 +18,8 @@ from spatialdata._core.spatialdata import SpatialData from spatialdata._io._utils import ( BadFileHandleMethod, + _FsspecStoreRoot, + _get_store_root, _resolve_zarr_store, handle_read_errors, ) @@ -32,7 +33,7 @@ def _read_zarr_group_spatialdata_element( root_group: zarr.Group, - root_store_path: str, + root_store_path: Path | _FsspecStoreRoot, sdata_version: Literal["0.1", "0.2"], selector: set[str], read_func: Callable[..., Any], @@ -54,7 +55,7 @@ def _read_zarr_group_spatialdata_element( # skip hidden files like .zgroup or .zmetadata continue elem_group = group[subgroup_name] - elem_group_path = os.path.join(root_store_path, elem_group.path) + elem_group_path = root_store_path / elem_group.path with handle_read_errors( on_bad_files, location=f"{group.path}/{subgroup_name}", @@ -170,7 +171,7 @@ def read_zarr( UserWarning, stacklevel=2, ) - root_store_path = root_group.store.root + root_store_path = _get_store_root(root_group.store) images: dict[str, Raster_T] = {} labels: dict[str, Raster_T] = {} @@ -231,12 +232,12 @@ def read_zarr( tables=tables, attrs=attrs, ) - sdata.path = resolved_store.root + sdata.path = store if isinstance(store, UPath) else resolved_store.root return sdata def _get_groups_for_element( - zarr_path: Path, element_type: str, element_name: str, 
use_consolidated: bool = True + zarr_path: Path | UPath, element_type: str, element_name: str, use_consolidated: bool = True ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. @@ -265,8 +266,8 @@ def _get_groups_for_element( ------- The Zarr groups for the root, element_type and element for a specific element. """ - if not isinstance(zarr_path, Path): - raise ValueError("zarr_path should be a Path object") + if not isinstance(zarr_path, (Path, UPath)): + raise ValueError("zarr_path should be a Path or UPath object") if element_type not in [ "images", @@ -289,7 +290,7 @@ def _get_groups_for_element( return root_group, element_type_group, element_name_group -def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: str) -> bool: +def _group_for_element_exists(zarr_path: Path | UPath, element_type: str, element_name: str) -> bool: """ Check if the group for an element exists. @@ -319,9 +320,13 @@ def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: return exists -def _write_consolidated_metadata(path: Path | str | None) -> None: +def _write_consolidated_metadata(path: Path | UPath | str | None) -> None: if path is not None: - f = zarr.open_group(path, mode="r+", use_consolidated=False) + if isinstance(path, UPath): + store = _resolve_zarr_store(path) + f = zarr.open_group(store, mode="r+", use_consolidated=False) + else: + f = zarr.open_group(path, mode="r+", use_consolidated=False) # .parquet files are not recognized as proper zarr and thus throw a warning. This does not affect SpatialData. # and therefore we silence it for our users as they can't do anything about this. 
# TODO check with remote PR whether we can prevent this warning at least for points data and whether with zarrv3 From 540631c1f64c8f1925aa8aa19d92a60930816920 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:28:28 +0100 Subject: [PATCH 06/32] ci: add test deps and Dockerfile for storage emulators (S3, Azure, GCS) - pyproject.toml: adlfs, gcsfs, moto[server], pytest-timeout in test extras - Dockerfile.emulators: moto, Azurite, fake-gcs-server for tests/io/remote_storage/ --- Dockerfile.emulators | 28 ++++++++++++++++++++++++++++ pyproject.toml | 4 ++++ 2 files changed, 32 insertions(+) create mode 100644 Dockerfile.emulators diff --git a/Dockerfile.emulators b/Dockerfile.emulators new file mode 100644 index 000000000..b4846a595 --- /dev/null +++ b/Dockerfile.emulators @@ -0,0 +1,28 @@ +# Storage emulators for tests/io/remote_storage/ (S3, Azure, GCS). +# Emulator URLs: S3 127.0.0.1:5000 | Azure 127.0.0.1:10000 | GCS 127.0.0.1:4443 +# +# Build (from project root): +# docker build -f Dockerfile.emulators -t spatialdata-emulators . 
+# +# Run in background (detached): +# docker run --rm -d --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators +# +# Run in foreground (attach to terminal): +# docker run --rm --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators +# +# Stop / remove: +# docker stop spatialdata-emulators +# docker rm -f spatialdata-emulators # if already stopped or to force-remove +FROM node:20-slim +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* +RUN python3 -m venv /opt/venv && /opt/venv/bin/pip install --no-cache-dir 'moto[server]' +ENV PATH="/opt/venv/bin:$PATH" +RUN cd /tmp && curl -sSL -o fgs.tgz https://github.com/fsouza/fake-gcs-server/releases/download/v1.54.0/fake-gcs-server_1.54.0_linux_amd64.tar.gz \ + && tar xzf fgs.tgz && mv fake-gcs-server /usr/local/bin/ 2>/dev/null || mv fake-gcs-server_*/fake-gcs-server /usr/local/bin/ \ + && chmod +x /usr/local/bin/fake-gcs-server && rm -f fgs.tgz +RUN mkdir -p /data +EXPOSE 5000 10000 4443 +RUN echo 'moto_server -H 0.0.0.0 -p 5000 & npx --yes azurite --silent --location /data --blobHost 0.0.0.0 --skipApiVersionCheck & fake-gcs-server -scheme http -port 4443 & wait' > /start.sh && chmod +x /start.sh +CMD ["/bin/sh", "/start.sh"] diff --git a/pyproject.toml b/pyproject.toml index e5f3134aa..6ab9b42e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,9 +66,13 @@ dev = [ "bump2version", ] test = [ + "adlfs", + "gcsfs", + "moto[server]", "pytest", "pytest-cov", "pytest-mock", + "pytest-timeout", "torch", ] docs = [ From 532af5a07017e341e3bfd7249fbaf722d83ef240 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:28:38 +0100 Subject: [PATCH 07/32] test: move remote storage tests under tests/io/remote_storage and add emulator config - full_sdata fixture: two regions for table categorical (avoids 404 on remote read) - 
tests/io/remote_storage/conftest.py: bucket/container creation, resilient async shutdown - tests/io/remote_storage/test_remote_storage.py: parametrized Azure/S3/GCS roundtrip and write tests --- tests/conftest.py | 8 +- tests/io/remote_storage/conftest.py | 193 ++++++++++++++++++ .../test_remote_storage.py | 83 ++++---- 3 files changed, 244 insertions(+), 40 deletions(-) create mode 100644 tests/io/remote_storage/conftest.py rename tests/io/{ => remote_storage}/test_remote_storage.py (72%) diff --git a/tests/conftest.py b/tests/conftest.py index c97939129..a6deba0ae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,12 +89,18 @@ def tables() -> list[AnnData]: @pytest.fixture() def full_sdata() -> SpatialData: + # Use two regions so the table categorical has two categories; otherwise anndata does not + # write the obs/region/codes/c/0 chunk (only codes/zarr.json), causing 404 on remote read. return SpatialData( images=_get_images(), labels=_get_labels(), shapes=_get_shapes(), points=_get_points(), - tables=_get_tables(region="labels2d", region_key="region", instance_key="instance_id"), + tables=_get_tables( + region=["labels2d", "poly"], + region_key="region", + instance_key="instance_id", + ), ) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py new file mode 100644 index 000000000..c650ab53c --- /dev/null +++ b/tests/io/remote_storage/conftest.py @@ -0,0 +1,193 @@ +"""Minimal pytest config for IO tests. Creates buckets/containers when remote emulators are running. + +Assumes emulators are already running (e.g. Docker: + docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). +Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. +""" + +from __future__ import annotations + +import os +import socket +import time + +import pytest + +# Error messages from asyncio when closing sessions after the event loop is gone (e.g. 
at process exit) +_LOOP_GONE_ERRORS = ("different loop", "Loop is not running") + + +def _patch_fsspec_sync_for_shutdown() -> None: + """If fsspec.asyn.sync() runs at exit when the loop is gone, return None instead of raising.""" + import fsspec.asyn as asyn_mod + + _orig = asyn_mod.sync + + def _wrapped(loop, func, *args, timeout=None, **kwargs): + try: + return _orig(loop, func, *args, timeout=timeout, **kwargs) + except RuntimeError as e: + if any(msg in str(e) for msg in _LOOP_GONE_ERRORS): + return None + raise + + asyn_mod.sync = _wrapped + + +def _patch_gcsfs_close_session_for_shutdown() -> None: + """If gcsfs close_session fails (loop gone), close the connector synchronously instead of raising.""" + import asyncio + + import fsspec + import fsspec.asyn as asyn_mod + import gcsfs.core + + @staticmethod + def _close_session(loop, session, asynchronous=False): + if session.closed: + return + try: + running = asyncio.get_running_loop() + except RuntimeError: + running = None + + use_force_close = False + if loop and loop.is_running(): + loop.create_task(session.close()) + elif running and running.is_running() and asynchronous: + running.create_task(session.close()) + elif asyn_mod.loop[0] is not None and asyn_mod.loop[0].is_running(): + try: + asyn_mod.sync(asyn_mod.loop[0], session.close, timeout=0.1) + except (RuntimeError, fsspec.FSTimeoutError): + use_force_close = True + else: + use_force_close = True + + if use_force_close: + connector = getattr(session, "_connector", None) + if connector is not None: + connector._close() + + gcsfs.core.GCSFileSystem.close_session = _close_session + + +def _apply_resilient_async_close_patches() -> None: + """Avoid RuntimeError tracebacks when aiohttp sessions are closed at process exit (loop already gone).""" + _patch_fsspec_sync_for_shutdown() + _patch_gcsfs_close_session_for_shutdown() + + +def pytest_configure(config: pytest.Config) -> None: + """Apply patches for remote storage tests (resilient async close at 
shutdown).""" + _apply_resilient_async_close_patches() + + +EMULATOR_PORTS = {"s3": 5000, "azure": 10000, "gcs": 4443} +S3_BUCKETS = ("bucket", "test-azure", "test-s3", "test-gcs") +AZURE_CONTAINERS = ("test-container", "test-azure", "test-s3", "test-gcs") +GCS_BUCKETS = ("bucket", "test-azure", "test-s3", "test-gcs") + +AZURITE_CONNECTION_STRING = ( + "DefaultEndpointsProtocol=http;" + "AccountName=devstoreaccount1;" + "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" +) + + +def _port_open(host: str = "127.0.0.1", port: int | None = None, timeout: float = 2.0) -> bool: + if port is None: + return False + try: + with socket.create_connection((host, port), timeout=timeout): + return True + except (OSError, TimeoutError): + return False + + +def _ensure_s3_buckets(host: str) -> None: + if not _port_open(host, EMULATOR_PORTS["s3"]): + return + os.environ.setdefault("AWS_ENDPOINT_URL", "http://127.0.0.1:5000") + os.environ.setdefault("AWS_ACCESS_KEY_ID", "testing") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "testing") + import boto3 + from botocore.config import Config + + client = boto3.client( + "s3", + endpoint_url=os.environ["AWS_ENDPOINT_URL"], + aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"], + aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"], + region_name="us-east-1", + config=Config(signature_version="s3v4"), + ) + existing = {b["Name"] for b in client.list_buckets().get("Buckets", [])} + for name in S3_BUCKETS: + if name not in existing: + client.create_bucket(Bucket=name) + + +def _ensure_azure_containers(host: str) -> None: + if not _port_open(host, EMULATOR_PORTS["azure"]): + return + from azure.storage.blob import BlobServiceClient + + client = BlobServiceClient.from_connection_string(AZURITE_CONNECTION_STRING) + existing = {c.name for c in client.list_containers()} + for name in AZURE_CONTAINERS: + if name not in existing: + 
client.create_container(name) + + +def _ensure_gcs_buckets(host: str) -> None: + if not _port_open(host, EMULATOR_PORTS["gcs"]): + return + os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://127.0.0.1:4443") + from google.auth.credentials import AnonymousCredentials + from google.cloud import storage + + client = storage.Client(credentials=AnonymousCredentials(), project="test") + existing = {b.name for b in client.list_buckets()} + for name in GCS_BUCKETS: + if name not in existing: + client.create_bucket(name) + + +def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 60.0, check_interval: float = 2.0) -> None: + """Wait until all three emulator ports accept connections (e.g. after docker run).""" + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if all(_port_open(host, EMULATOR_PORTS[p]) for p in ("s3", "azure", "gcs")): + return + time.sleep(check_interval) + raise RuntimeError( + f"Emulators did not become ready within {timeout}s. " + "Ensure the container is running: docker run --rm -d -p 5000:5000 " + "-p 10000:10000 -p 4443:4443 spatialdata-emulators" + ) + + +@pytest.fixture(scope="session") +def _remote_storage_buckets_containers(): + """Create buckets/containers on running emulators so remote storage tests can run. 
+ + Run with emulators up, e.g.: + docker run --rm -d -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators + Then: pytest tests/io/remote_storage/test_remote_storage.py -v + """ + host = "127.0.0.1" + _wait_for_emulator_ports(host) + _ensure_s3_buckets(host) + _ensure_azure_containers(host) + _ensure_gcs_buckets(host) + yield + + +def pytest_collection_modifyitems(config: pytest.Config, items: list) -> None: + """Inject bucket/container creation for test_remote_storage.py.""" + for item in items: + path = getattr(item, "path", None) or getattr(item, "fspath", None) + if path and "test_remote_storage" in str(path): + item.add_marker(pytest.mark.usefixtures("_remote_storage_buckets_containers")) diff --git a/tests/io/test_remote_storage.py b/tests/io/remote_storage/test_remote_storage.py similarity index 72% rename from tests/io/test_remote_storage.py rename to tests/io/remote_storage/test_remote_storage.py index c24f1bcd1..44685061a 100644 --- a/tests/io/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -1,13 +1,25 @@ +"""Integration tests for remote storage (Azure, S3, GCS) using real emulators. + +Emulators must be running (e.g. Docker: docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). +Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. +tests/io/remote_storage/conftest.py creates the required buckets/containers when emulators are up. + +All remote paths use uuid.uuid4().hex so each test run writes to a unique location. +""" + from __future__ import annotations +import os +import uuid + import pytest from upath import UPath from spatialdata import SpatialData from spatialdata.testing import assert_spatial_data_objects_are_identical -# Azure emulator connection string (Azurite default) -# Source: https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string +# Azure emulator connection string (Azurite default). 
+# https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string AZURE_CONNECTION_STRING = ( "DefaultEndpointsProtocol=http;" "AccountName=devstoreaccount1;" @@ -22,20 +34,29 @@ def _get_azure_upath(container: str = "test-container", path: str = "test.zarr") def _get_s3_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: - """Create S3 UPath for testing. - - Uses anon=True for public buckets. For private buckets with moto (local S3 emulator), - would use: endpoint_url="http://127.0.0.1:5555/", AWS_ACCESS_KEY_ID="testing", etc. - """ + """Create S3 UPath for testing (moto emulator at 5000).""" + endpoint = os.environ.get("AWS_ENDPOINT_URL", "http://127.0.0.1:5000") + if endpoint: + return UPath( + f"s3://{container}/{path}", + endpoint_url=endpoint, + key=os.environ.get("AWS_ACCESS_KEY_ID", "testing"), + secret=os.environ.get("AWS_SECRET_ACCESS_KEY", "testing"), + ) return UPath(f"s3://{container}/{path}", anon=True) def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: - """Create GCS UPath for testing with fake-gcs-server (local GCS emulator).""" - return UPath(f"gs://{container}/{path}", endpoint_url="http://localhost:4443") + """Create GCS UPath for testing with fake-gcs-server (port 4443).""" + os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://127.0.0.1:4443") + return UPath( + f"gs://{container}/{path}", + endpoint_url=os.environ["STORAGE_EMULATOR_HOST"], + token="anon", + project="test", + ) -# Shared parametrization for remote storage backends (azure, s3, gcs). 
GET_UPATH_PARAMS = pytest.mark.parametrize( "get_upath", [_get_azure_upath, _get_s3_upath, _get_gcs_upath], ids=["azure", "s3", "gcs"] ) @@ -45,6 +66,9 @@ def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: ids=["azure", "s3", "gcs"], ) +# Ensure buckets/containers exist on emulators before any test (see tests/io/remote_storage/conftest.py) +pytestmark = pytest.mark.usefixtures("_remote_storage_buckets_containers") + def _assert_read_identical(expected: SpatialData, upath: UPath, *, check_path: bool = True) -> None: """Read SpatialData from upath and assert it equals expected; optionally assert path.""" sdata_read = SpatialData.read(upath) if check_path: assert isinstance(sdata_read.path, UPath) assert sdata_read.path == upath assert_spatial_data_objects_are_identical(expected, sdata_read) @@ -66,7 +90,7 @@ def test_path_setter_accepts_upath(self, get_upath) -> None: None | str | Path, not UPath, preventing the use of remote storage. """ sdata = SpatialData() - upath = get_upath() + upath = get_upath(path=f"test-accept-{uuid.uuid4().hex}.zarr") sdata.path = upath assert sdata.path == upath @@ -78,17 +102,15 @@ def test_write_with_upath_sets_path(self, get_upath) -> None: _validate_can_safely_write_to_path() before it can set sdata.path. """ sdata = SpatialData() - upath = get_upath() + upath = get_upath(path=f"test-write-path-{uuid.uuid4().hex}.zarr") sdata.write(upath) assert isinstance(sdata.path, UPath) def test_path_setter_rejects_other_types(self) -> None: """Test that SpatialData.path setter rejects other types.""" sdata = SpatialData() - with pytest.raises(TypeError, match="Path must be.*str.*Path"): sdata.path = 123 - with pytest.raises(TypeError, match="Path must be.*str.*Path"): sdata.path = {"not": "a path"} @@ -101,9 +123,7 @@ class TestRemoteStorage: """ @REMOTE_STORAGE_PARAMS - def test_write_read_roundtrip_remote( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: + def test_write_read_roundtrip_remote(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: """Test writing and reading SpatialData to/from remote storage. 
This test verifies the full workflow: @@ -111,12 +131,10 @@ def test_write_read_roundtrip_remote( 2. Read SpatialData from remote storage using UPath 3. Verify data integrity (round-trip) """ - upath = get_upath(container=f"test-{storage_name}", path=f"roundtrip-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"roundtrip-{uuid.uuid4().hex}.zarr") full_sdata.write(upath, overwrite=True) assert isinstance(full_sdata.path, UPath) assert full_sdata.path == upath - _assert_read_identical(full_sdata, upath) @REMOTE_STORAGE_PARAMS @@ -130,37 +148,29 @@ def test_path_setter_with_remote_then_operations( 2. Write operations work 3. Read operations work """ - upath = get_upath(container=f"test-{storage_name}", path=f"operations-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"operations-{uuid.uuid4().hex}.zarr") full_sdata.path = upath assert full_sdata.path == upath assert full_sdata.is_backed() is True - full_sdata.write(overwrite=True) assert full_sdata.path == upath - _assert_read_identical(full_sdata, upath) @REMOTE_STORAGE_PARAMS - def test_overwrite_existing_remote_data( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: + def test_overwrite_existing_remote_data(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: """Test overwriting existing data in remote storage. Verifies that overwriting existing remote data works (path-exists handling) and data integrity after overwrite. Round-trip is covered by test_write_read_roundtrip_remote. 
""" - upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{uuid.uuid4().hex}.zarr") full_sdata.write(upath, overwrite=True) full_sdata.write(upath, overwrite=True) _assert_read_identical(full_sdata, upath, check_path=False) @REMOTE_STORAGE_PARAMS - def test_write_element_to_remote_storage( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: + def test_write_element_to_remote_storage(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: """Test writing individual elements to remote storage using write_element(). This test verifies that: @@ -168,18 +178,13 @@ def test_write_element_to_remote_storage( 2. write_element() works with remote storage 3. Written elements can be read back correctly """ - upath = get_upath(container=f"test-{storage_name}", path=f"write-element-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"write-element-{uuid.uuid4().hex}.zarr") # Create empty SpatialData and write to remote storage empty_sdata = SpatialData() empty_sdata.write(upath, overwrite=True) - - # Set path and write individual elements full_sdata.path = upath assert full_sdata.path == upath - # Write each element type individually - for element_type, element_name, _ in full_sdata.gen_elements(): + for _element_type, element_name, _ in full_sdata.gen_elements(): full_sdata.write_element(element_name, overwrite=True) - _assert_read_identical(full_sdata, upath, check_path=False) From c22b8bf1004342ea0c4719e7c50a0af4b2b5bc56 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 16:20:29 +0100 Subject: [PATCH 08/32] fix: update Dask internal keys for zarr compatibility - Added "dimension_separator" to the frozenset of internal keys that should not be passed to zarr.Group.create_array(), ensuring compatibility with various zarr versions. 
- Updated test to set region labels for full_sdata table, allowing the test_set_table_annotates_spatialelement to succeed without errors. --- src/spatialdata/_io/_dask_zarr_compat.py | 3 ++- tests/io/test_multi_table.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py index 350207056..a1b643451 100644 --- a/src/spatialdata/_io/_dask_zarr_compat.py +++ b/src/spatialdata/_io/_dask_zarr_compat.py @@ -13,7 +13,8 @@ _orig_to_zarr = _da.to_zarr # Keys from ome_zarr/dask **kwargs that must not be passed to zarr.Group.create_array() -_DASK_INTERNAL_KEYS = frozenset({"zarr_format"}) +# dimension_separator: not accepted by all zarr versions in the create_array() path. +_DASK_INTERNAL_KEYS = frozenset({"zarr_format", "dimension_separator"}) def _to_zarr( diff --git a/tests/io/test_multi_table.py b/tests/io/test_multi_table.py index abaaea8d2..77b17a177 100644 --- a/tests/io/test_multi_table.py +++ b/tests/io/test_multi_table.py @@ -113,6 +113,10 @@ def test_set_table_nonexisting_target(self, full_sdata): def test_set_table_annotates_spatialelement(self, full_sdata, tmp_path): tmpdir = Path(tmp_path) / "tmp.zarr" del full_sdata["table"].uns[TableModel.ATTRS_KEY] + # full_sdata table has region labels2d+poly; set to labels2d only so set_table_annotates_spatialelement succeeds + full_sdata["table"].obs["region"] = pd.Categorical( + ["labels2d"] * full_sdata["table"].n_obs + ) with pytest.raises( TypeError, match="No current annotation metadata found. Please specify both region_key and instance_key." ): From 0c0716938566d9fceffc7a0d8d7b50cca59f194e Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 16:32:43 +0100 Subject: [PATCH 09/32] test: refine subset and table validation in spatial data tests - Updated the `test_subset` function to exclude labels and poly from the default table, ensuring accurate subset validation. 
- Enhanced `test_validate_table_in_spatialdata` to assert that both regions (labels2d and poly) are correctly annotated in the table. - Adjusted `test_labels_table_joins` to restrict the table to labels2d, ensuring the join returns the expected results. --- tests/core/operations/test_spatialdata_operations.py | 9 ++++++--- tests/core/query/test_relational_query.py | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/core/operations/test_spatialdata_operations.py b/tests/core/operations/test_spatialdata_operations.py index 68b538e0a..a898bed0c 100644 --- a/tests/core/operations/test_spatialdata_operations.py +++ b/tests/core/operations/test_spatialdata_operations.py @@ -559,14 +559,15 @@ def test_init_from_elements(full_sdata: SpatialData) -> None: def test_subset(full_sdata: SpatialData) -> None: - element_names = ["image2d", "points_0", "circles", "poly"] + # Exclude labels and poly so the default table (annotating labels2d and poly) is not included + element_names = ["image2d", "points_0", "circles"] subset0 = full_sdata.subset(element_names) unique_names = set() for _, k, _ in subset0.gen_spatial_elements(): unique_names.add(k) assert "image3d_xarray" in full_sdata.images assert unique_names == set(element_names) - # no table since the labels are not present in the subset + # no table since neither labels2d nor poly are in the subset assert "table" not in subset0.tables adata = AnnData( @@ -675,7 +676,9 @@ def test_transform_to_data_extent(full_sdata: SpatialData, maintain_positioning: def test_validate_table_in_spatialdata(full_sdata): table = full_sdata["table"] region, region_key, _ = get_table_keys(table) - assert region == "labels2d" + # full_sdata uses two regions (labels2d, poly) so the table annotates both + expected = {"labels2d", "poly"} + assert set(region if isinstance(region, list) else [region]) == expected full_sdata.validate_table_in_spatialdata(table) diff --git a/tests/core/query/test_relational_query.py 
b/tests/core/query/test_relational_query.py index 63e7a6f19..07f7b8c70 100644 --- a/tests/core/query/test_relational_query.py +++ b/tests/core/query/test_relational_query.py @@ -914,6 +914,11 @@ def test_filter_table_non_annotating(full_sdata): def test_labels_table_joins(full_sdata): + # Restrict table to labels2d only so the join returns one row per label (full_sdata default has two regions) + full_sdata["table"].obs["region"] = pd.Categorical( + ["labels2d"] * full_sdata["table"].n_obs + ) + full_sdata["table"].uns["spatialdata_attrs"]["region"] = "labels2d" element_dict, table = join_spatialelement_table( sdata=full_sdata, spatial_element_names="labels2d", From f21bb52e09d08c279abfacff1c71b2cedaecfb93 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 20:26:50 +0100 Subject: [PATCH 10/32] feat: move Dockerfile for storage emulators to facilitate testing --- .../io/remote_storage/Dockerfile.emulators | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename Dockerfile.emulators => tests/io/remote_storage/Dockerfile.emulators (90%) diff --git a/Dockerfile.emulators b/tests/io/remote_storage/Dockerfile.emulators similarity index 90% rename from Dockerfile.emulators rename to tests/io/remote_storage/Dockerfile.emulators index b4846a595..bc3bb6f53 100644 --- a/Dockerfile.emulators +++ b/tests/io/remote_storage/Dockerfile.emulators @@ -1,8 +1,8 @@ -# Storage emulators for tests/io/remote_storage/ (S3, Azure, GCS). +# Storage emulators for tests in this directory (S3, Azure, GCS). # Emulator URLs: S3 127.0.0.1:5000 | Azure 127.0.0.1:10000 | GCS 127.0.0.1:4443 # # Build (from project root): -# docker build -f Dockerfile.emulators -t spatialdata-emulators . +# docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . 
# # Run in background (detached): # docker run --rm -d --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators From 072566a8b9db41e51ff960f559d9923ab1bed07d Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 20:36:23 +0100 Subject: [PATCH 11/32] ci: enhance GitHub Actions workflow to support storage emulators on Linux - Added steps to build and run storage emulators (S3, Azure, GCS) using Docker, specifically for the Ubuntu environment. - Implemented a wait mechanism to ensure emulators are ready before running tests. - Adjusted test execution to skip remote storage tests on non-Linux platforms. --- .github/workflows/test.yaml | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1635bdd2a..cd1d60ade 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -53,13 +53,43 @@ jobs: fi fi uv sync --group=test + # Start storage emulators (S3, Azure, GCS) only on Linux; service containers are not available on Windows/macOS + - name: Build and start storage emulators + if: matrix.os == 'ubuntu-latest' + run: | + docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . + docker run --rm -d --name spatialdata-emulators \ + -p 5000:5000 -p 10000:10000 -p 4443:4443 \ + spatialdata-emulators + - name: Wait for emulator ports + if: matrix.os == 'ubuntu-latest' + run: | + echo "Waiting for S3 (5000), Azure (10000), GCS (4443)..." + python3 -c " + import socket, time + for _ in range(45): + try: + for p in (5000, 10000, 4443): + socket.create_connection(('127.0.0.1', p), timeout=2) + print('Emulators ready.') + break + except (socket.error, OSError): + time.sleep(2) + else: + raise SystemExit('Emulators did not become ready.') + " + # On Linux, emulators run above so full suite (incl. tests/io/remote_storage/) runs. On Windows/macOS, skip remote_storage. 
- name: Test env: MPLBACKEND: agg PLATFORM: ${{ matrix.os }} DISPLAY: :42 run: | - uv run pytest --cov --color=yes --cov-report=xml + if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then + uv run pytest --cov --color=yes --cov-report=xml + else + uv run pytest --cov --color=yes --cov-report=xml --ignore=tests/io/remote_storage/ + fi - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: From ee6e4dc5453a9ee98d509f3496ba0e0e905477d7 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 21:28:29 +0100 Subject: [PATCH 12/32] fix: handle RuntimeError in fsspec async session closure - Wrapped the fsspec async sync function to prevent RuntimeError "Loop is not running" during process exit when using remote storage (Azure, S3, GCS). - Ensured compatibility with async session management in the _utils module. --- src/spatialdata/_io/_utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 747d8ed7b..6424cbab7 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -15,6 +15,7 @@ from pathlib import Path from typing import Any, Literal +import fsspec.asyn as _asyn_mod import zarr from anndata import AnnData from dask._task_spec import Task @@ -670,3 +671,20 @@ def handle_read_errors( else: # on_bad_files == BadFileHandleMethod.ERROR # Let it raise exceptions yield + + +# Avoid RuntimeError "Loop is not running" when fsspec closes async sessions at process exit +# (remote storage: Azure, S3, GCS). _utils is used for all store resolution. 
+_orig_sync = _asyn_mod.sync + + +def _fsspec_sync_wrapped(loop, func, *args, timeout=None, **kwargs): + try: + return _orig_sync(loop, func, *args, timeout=timeout, **kwargs) + except RuntimeError as e: + if "Loop is not running" in str(e) or "different loop" in str(e): + return None + raise + + +_asyn_mod.sync = _fsspec_sync_wrapped From 9019e6aeb00e0665677aa6a8b1b20078cd76490d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Mar 2026 20:51:19 +0000 Subject: [PATCH 13/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/core/query/test_relational_query.py | 4 +--- tests/io/test_multi_table.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/core/query/test_relational_query.py b/tests/core/query/test_relational_query.py index 07f7b8c70..c28725681 100644 --- a/tests/core/query/test_relational_query.py +++ b/tests/core/query/test_relational_query.py @@ -915,9 +915,7 @@ def test_filter_table_non_annotating(full_sdata): def test_labels_table_joins(full_sdata): # Restrict table to labels2d only so the join returns one row per label (full_sdata default has two regions) - full_sdata["table"].obs["region"] = pd.Categorical( - ["labels2d"] * full_sdata["table"].n_obs - ) + full_sdata["table"].obs["region"] = pd.Categorical(["labels2d"] * full_sdata["table"].n_obs) full_sdata["table"].uns["spatialdata_attrs"]["region"] = "labels2d" element_dict, table = join_spatialelement_table( sdata=full_sdata, diff --git a/tests/io/test_multi_table.py b/tests/io/test_multi_table.py index 77b17a177..5c6bcf6e2 100644 --- a/tests/io/test_multi_table.py +++ b/tests/io/test_multi_table.py @@ -114,9 +114,7 @@ def test_set_table_annotates_spatialelement(self, full_sdata, tmp_path): tmpdir = Path(tmp_path) / "tmp.zarr" del full_sdata["table"].uns[TableModel.ATTRS_KEY] # full_sdata table has region labels2d+poly; set to labels2d only so 
set_table_annotates_spatialelement succeeds - full_sdata["table"].obs["region"] = pd.Categorical( - ["labels2d"] * full_sdata["table"].n_obs - ) + full_sdata["table"].obs["region"] = pd.Categorical(["labels2d"] * full_sdata["table"].n_obs) with pytest.raises( TypeError, match="No current annotation metadata found. Please specify both region_key and instance_key." ): From 42c31332ce944730012480560ff3ca52ec1c0514 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 22:14:51 +0100 Subject: [PATCH 14/32] refactor: add type hints to functions in _dask_zarr_compat, _utils, and io_points modules --- src/spatialdata/_io/_dask_zarr_compat.py | 24 +++++++++++++----------- src/spatialdata/_io/_utils.py | 2 +- src/spatialdata/_io/io_points.py | 3 ++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py index a1b643451..b0988aef7 100644 --- a/src/spatialdata/_io/_dask_zarr_compat.py +++ b/src/spatialdata/_io/_dask_zarr_compat.py @@ -8,6 +8,8 @@ from __future__ import annotations +from typing import Any + import dask.array as _da _orig_to_zarr = _da.to_zarr @@ -18,17 +20,17 @@ def _to_zarr( - arr, - url, - component=None, - storage_options=None, - region=None, - compute=True, - return_stored=False, - zarr_array_kwargs=None, - zarr_read_kwargs=None, - **kwargs, -): + arr: Any, + url: Any, + component: Any = None, + storage_options: Any = None, + region: Any = None, + compute: bool = True, + return_stored: bool = False, + zarr_array_kwargs: Any = None, + zarr_read_kwargs: Any = None, + **kwargs: Any, +) -> Any: """Forward deprecated **kwargs into zarr_array_kwargs, excluding _DASK_INTERNAL_KEYS.""" if kwargs: zarr_array_kwargs = dict(zarr_array_kwargs) if zarr_array_kwargs else {} diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 6424cbab7..2a5d44e26 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -678,7 
+678,7 @@ def handle_read_errors( _orig_sync = _asyn_mod.sync -def _fsspec_sync_wrapped(loop, func, *args, timeout=None, **kwargs): +def _fsspec_sync_wrapped(loop: Any, func: Any, *args: Any, timeout: Any = None, **kwargs: Any) -> Any: try: return _orig_sync(loop, func, *args, timeout=timeout, **kwargs) except RuntimeError as e: diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index e41273dcb..684b39a27 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -1,6 +1,7 @@ from __future__ import annotations from pathlib import Path +from typing import Any import zarr from dask.dataframe import DataFrame as DaskDataFrame @@ -92,7 +93,7 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - storage_options: dict = {} + storage_options: dict[str, Any] = {} if isinstance(path, _FsspecStoreRoot): storage_options = _storage_options_from_fs(path._store.fs) points_without_transform.to_parquet(str(path), storage_options=storage_options or None) From 70ababe06437bb86c098f817a900f9d4e5066d74 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 4 Mar 2026 11:41:43 +0100 Subject: [PATCH 15/32] chore: remove pytest-timeout from test dependencies in pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ab9b42e7..cce73720b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,6 @@ test = [ "pytest", "pytest-cov", "pytest-mock", - "pytest-timeout", "torch", ] docs = [ From cae231987f87783a194da7cb7e2726b254c27999 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 4 Mar 2026 12:02:50 +0100 Subject: [PATCH 16/32] test: add unit tests for remote storage store resolution and credential handling --- .../remote_storage/test_resolve_zarr_store.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 tests/io/remote_storage/test_resolve_zarr_store.py diff --git 
a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py new file mode 100644 index 000000000..d8c90d46d --- /dev/null +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -0,0 +1,55 @@ +"""Unit tests for remote-storage-specific store resolution and credential handling. + +Covers only code paths used when reading/writing from remote backends (Azure, S3, GCS): +- _FsspecStoreRoot resolution (used when reading elements from a remote zarr store). +- _storage_options_from_fs for Azure and GCS (used when writing parquet to remote). +""" + +from __future__ import annotations + +from zarr.storage import FsspecStore + +from spatialdata._io._utils import _FsspecStoreRoot, _resolve_zarr_store, _storage_options_from_fs + + +def test_resolve_zarr_store_fsspec_store_root() -> None: + """_FsspecStoreRoot is resolved to FsspecStore when reading from remote (e.g. points/shapes paths).""" + import fsspec + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + fs = fsspec.filesystem("memory") + async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) + base = FsspecStore(async_fs, path="/") + root = _FsspecStoreRoot(base, "/") + store = _resolve_zarr_store(root) + assert isinstance(store, FsspecStore) + + +def test_storage_options_from_fs_azure_account_key() -> None: + """_storage_options_from_fs extracts Azure credentials for writing parquet to remote Azure Blob.""" + + class AzureBlobFileSystemMock: + account_name = "dev" + account_key = "key123" + connection_string = None + anon = None + + AzureBlobFileSystemMock.__name__ = "AzureBlobFileSystem" + out = _storage_options_from_fs(AzureBlobFileSystemMock()) + assert out["account_name"] == "dev" + assert out["account_key"] == "key123" + + +def test_storage_options_from_fs_gcs_endpoint() -> None: + """_storage_options_from_fs extracts GCS endpoint and project for writing parquet to remote GCS.""" + + class GCSFileSystemMock: + token = "anon" + _endpoint = 
"http://localhost:4443" + project = "test" + + GCSFileSystemMock.__name__ = "GCSFileSystem" + out = _storage_options_from_fs(GCSFileSystemMock()) + assert out["token"] == "anon" + assert out["endpoint_url"] == "http://localhost:4443" + assert out["project"] == "test" From fe6bf2455535b49e7ca5edddcac442fbdccaa76a Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 09:48:25 +0200 Subject: [PATCH 17/32] chore(ci): fix GCS emulator tests (gcsfs, sync upload, multi-arch) --- .github/workflows/test.yaml | 2 ++ src/spatialdata/_io/io_shapes.py | 23 ++++++++------------ tests/conftest.py | 4 ++++ tests/io/remote_storage/Dockerfile.emulators | 9 ++++++-- tests/io/remote_storage/conftest.py | 14 +++++++++++- 5 files changed, 35 insertions(+), 17 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index cd1d60ade..a626165c8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -84,6 +84,8 @@ jobs: MPLBACKEND: agg PLATFORM: ${{ matrix.os }} DISPLAY: :42 + # gcsfs otherwise defaults to ExtendedGcsFileSystem (prod Storage Control gRPC; breaks fake-gcs-server). 
+ GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT: "false" run: | if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then uv run pytest --cov --color=yes --cov-report=xml diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index adf4716f3..cd521f51b 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import json import os import tempfile from pathlib import Path @@ -65,7 +66,8 @@ def _read_shapes( store_root = _get_store_root(f.store_path.store) path = store_root / f.path / "shapes.parquet" if isinstance(path, _FsspecStoreRoot): - geo_df = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + opts = _storage_options_from_fs(path._store.fs) + geo_df = read_parquet(str(path), storage_options=opts if opts else {}) else: geo_df = read_parquet(path) else: @@ -195,18 +197,6 @@ def _upload_parquet_to_s3(tmp_path: str, bucket: str, key: str, fs: Any) -> None s3.upload_file(tmp_path, bucket, key) -def _upload_parquet_to_gcs(tmp_path: str, bucket: str, key: str, fs: Any) -> None: - from google.auth.credentials import AnonymousCredentials - from google.cloud import storage - - client = storage.Client( - credentials=AnonymousCredentials(), - project=getattr(fs, "project", None) or "test", - ) - blob = client.bucket(bucket).blob(key) - blob.upload_from_filename(tmp_path) - - def _upload_parquet_to_fsspec(path: _FsspecStoreRoot, tmp_path: str) -> None: """Upload local parquet file to remote fsspec store using sync APIs to avoid event-loop issues.""" fs = path._store.fs @@ -217,7 +207,12 @@ def _upload_parquet_to_fsspec(path: _FsspecStoreRoot, tmp_path: str) -> None: elif fs_name in ("S3FileSystem", "MotoS3FS"): _upload_parquet_to_s3(tmp_path, bucket, key, fs) elif fs_name == "GCSFileSystem": - _upload_parquet_to_gcs(tmp_path, bucket, key, fs) + import fsspec + + fs_dict = json.loads(fs.to_json()) + fs_dict["asynchronous"] = False + 
sync_fs = fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) + sync_fs.put_file(tmp_path, path._path) else: fs.put(tmp_path, str(path)) diff --git a/tests/conftest.py b/tests/conftest.py index a6deba0ae..a9aa8ebaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,9 @@ from __future__ import annotations +import os + +os.environ.setdefault("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false") + from collections.abc import Sequence from pathlib import Path from typing import Any diff --git a/tests/io/remote_storage/Dockerfile.emulators b/tests/io/remote_storage/Dockerfile.emulators index bc3bb6f53..43b6835e6 100644 --- a/tests/io/remote_storage/Dockerfile.emulators +++ b/tests/io/remote_storage/Dockerfile.emulators @@ -17,12 +17,17 @@ FROM node:20-slim RUN apt-get update && apt-get install -y --no-install-recommends \ python3 python3-pip python3-venv curl ca-certificates \ && rm -rf /var/lib/apt/lists/* +RUN npm install -g azurite RUN python3 -m venv /opt/venv && /opt/venv/bin/pip install --no-cache-dir 'moto[server]' ENV PATH="/opt/venv/bin:$PATH" -RUN cd /tmp && curl -sSL -o fgs.tgz https://github.com/fsouza/fake-gcs-server/releases/download/v1.54.0/fake-gcs-server_1.54.0_linux_amd64.tar.gz \ +# fake-gcs-server must match the image CPU. `ARG TARGETARCH=amd64` can stay amd64 on arm64 builds. 
+RUN set -eux; \ + arch="$(uname -m)"; \ + case "$arch" in x86_64) fgs=amd64 ;; aarch64|arm64) fgs=arm64 ;; *) echo "unsupported arch: $arch" >&2; exit 1 ;; esac; \ + cd /tmp && curl -fsSL -o fgs.tgz "https://github.com/fsouza/fake-gcs-server/releases/download/v1.54.0/fake-gcs-server_1.54.0_linux_${fgs}.tar.gz" \ && tar xzf fgs.tgz && mv fake-gcs-server /usr/local/bin/ 2>/dev/null || mv fake-gcs-server_*/fake-gcs-server /usr/local/bin/ \ && chmod +x /usr/local/bin/fake-gcs-server && rm -f fgs.tgz RUN mkdir -p /data EXPOSE 5000 10000 4443 -RUN echo 'moto_server -H 0.0.0.0 -p 5000 & npx --yes azurite --silent --location /data --blobHost 0.0.0.0 --skipApiVersionCheck & fake-gcs-server -scheme http -port 4443 & wait' > /start.sh && chmod +x /start.sh +RUN echo 'moto_server -H 0.0.0.0 -p 5000 & azurite --silent --location /data --blobHost 0.0.0.0 --skipApiVersionCheck & fake-gcs-server -scheme http -port 4443 & wait' > /start.sh && chmod +x /start.sh CMD ["/bin/sh", "/start.sh"] diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index c650ab53c..0d4d08da7 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -13,6 +13,16 @@ import pytest + +def _ensure_gcs_emulator_env() -> None: + """Point google-cloud-storage / gcsfs defaults at fake-gcs-server (not production).""" + raw = os.environ.get("STORAGE_EMULATOR_HOST", "").strip() + if raw in ("", "default"): + os.environ["STORAGE_EMULATOR_HOST"] = "http://127.0.0.1:4443" + elif not raw.startswith(("http://", "https://")): + os.environ["STORAGE_EMULATOR_HOST"] = f"http://{raw}" + + # Error messages from asyncio when closing sessions after the event loop is gone (e.g. 
at process exit) _LOOP_GONE_ERRORS = ("different loop", "Loop is not running") @@ -155,7 +165,7 @@ def _ensure_gcs_buckets(host: str) -> None: client.create_bucket(name) -def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 60.0, check_interval: float = 2.0) -> None: +def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 10.0, check_interval: float = 2.0) -> None: """Wait until all three emulator ports accept connections (e.g. after docker run).""" deadline = time.monotonic() + timeout while time.monotonic() < deadline: @@ -187,6 +197,8 @@ def _remote_storage_buckets_containers(): def pytest_collection_modifyitems(config: pytest.Config, items: list) -> None: """Inject bucket/container creation for test_remote_storage.py.""" + if any("remote_storage" in str(getattr(item, "path", None) or getattr(item, "fspath", "")) for item in items): + _ensure_gcs_emulator_env() for item in items: path = getattr(item, "path", None) or getattr(item, "fspath", None) if path and "test_remote_storage" in str(path): From 3cb2c9366fc352b53a276e780d54a2e07676430b Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 10:12:45 +0200 Subject: [PATCH 18/32] refactor: remove deprecated dask array compatibility layer --- src/spatialdata/_io/__init__.py | 2 - src/spatialdata/_io/_dask_zarr_compat.py | 55 ------------------------ 2 files changed, 57 deletions(-) delete mode 100644 src/spatialdata/_io/_dask_zarr_compat.py diff --git a/src/spatialdata/_io/__init__.py b/src/spatialdata/_io/__init__.py index 9e4b11de1..38ff8c6bb 100644 --- a/src/spatialdata/_io/__init__.py +++ b/src/spatialdata/_io/__init__.py @@ -1,7 +1,5 @@ from __future__ import annotations -# Patch da.to_zarr so ome_zarr's **kwargs are passed as zarr_array_kwargs (avoids FutureWarning) -import spatialdata._io._dask_zarr_compat # noqa: F401 from spatialdata._io._utils import get_dask_backing_files from spatialdata._io.format import SpatialDataFormatType from 
spatialdata._io.io_points import write_points diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py deleted file mode 100644 index b0988aef7..000000000 --- a/src/spatialdata/_io/_dask_zarr_compat.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Compatibility layer for dask.array.to_zarr when callers pass array options via **kwargs. - -ome_zarr.writer calls da.to_zarr(..., **options) with array options (compressor, dimension_names, -etc.). Dask deprecated **kwargs in favor of zarr_array_kwargs. This module patches da.to_zarr to -forward such kwargs into zarr_array_kwargs (excluding dask-internal keys like zarr_format that -zarr.Group.create_array() does not accept), avoiding the FutureWarning and keeping behavior correct. -""" - -from __future__ import annotations - -from typing import Any - -import dask.array as _da - -_orig_to_zarr = _da.to_zarr - -# Keys from ome_zarr/dask **kwargs that must not be passed to zarr.Group.create_array() -# dimension_separator: not accepted by all zarr versions in the create_array() path. 
-_DASK_INTERNAL_KEYS = frozenset({"zarr_format", "dimension_separator"}) - - -def _to_zarr( - arr: Any, - url: Any, - component: Any = None, - storage_options: Any = None, - region: Any = None, - compute: bool = True, - return_stored: bool = False, - zarr_array_kwargs: Any = None, - zarr_read_kwargs: Any = None, - **kwargs: Any, -) -> Any: - """Forward deprecated **kwargs into zarr_array_kwargs, excluding _DASK_INTERNAL_KEYS.""" - if kwargs: - zarr_array_kwargs = dict(zarr_array_kwargs) if zarr_array_kwargs else {} - for k, v in kwargs.items(): - if k not in _DASK_INTERNAL_KEYS: - zarr_array_kwargs[k] = v - kwargs = {} - return _orig_to_zarr( - arr, - url, - component=component, - storage_options=storage_options, - region=region, - compute=compute, - return_stored=return_stored, - zarr_array_kwargs=zarr_array_kwargs, - zarr_read_kwargs=zarr_read_kwargs, - **kwargs, - ) - - -_da.to_zarr = _to_zarr From 6cf359a108f7cb45e35f0da9e61a90d18bc0c94b Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 10:52:45 +0200 Subject: [PATCH 19/32] Improve path handling in FsspecStore and update read_parquet options --- src/spatialdata/_io/_utils.py | 16 +++++++++++++--- src/spatialdata/_io/io_points.py | 3 ++- tests/io/remote_storage/test_remote_storage.py | 4 ++-- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 2a5d44e26..4553ab664 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -41,6 +41,15 @@ from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes +def _join_fsspec_store_path(store_path: str, relative_path: str) -> str: + """Combine FsspecStore root with a zarr group path using POSIX ``/`` (fsspec keys; safe on Windows).""" + base = str(store_path).replace("\\", "/").rstrip("/") + rel = str(relative_path).replace("\\", "/").lstrip("/") + if not base: + return f"/{rel}" if rel else "/" + return 
f"{base}/{rel}" if rel else base + + class _FsspecStoreRoot: """Path-like root for FsspecStore (no .root attribute); supports __truediv__ and str() as full URL.""" @@ -48,10 +57,11 @@ class _FsspecStoreRoot: def __init__(self, store: FsspecStore, path: str | None = None) -> None: self._store = store - self._path = (path or store.path).rstrip("/") + raw = path or store.path + self._path = str(raw).replace("\\", "/").rstrip("/") def __truediv__(self, other: str | Path) -> _FsspecStoreRoot: - return _FsspecStoreRoot(self._store, self._path + "/" + str(other).lstrip("/")) + return _FsspecStoreRoot(self._store, _join_fsspec_store_path(self._path, str(other))) def __str__(self) -> str: protocol = getattr(self._store.fs, "protocol", None) @@ -597,8 +607,8 @@ def _resolve_zarr_store( # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group return FsspecStore( - path.store.path + "/" + path.path, fs=_ensure_async_fs(path.store.fs), + path=_join_fsspec_store_path(path.store.path, path.path), **kwargs, ) if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 684b39a27..be2e30796 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -41,7 +41,8 @@ def _read_points( # cache on remote file needed for parquet reader to work # TODO: allow reading in the metadata without caching all the data if isinstance(path, _FsspecStoreRoot): - points = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + opts = _storage_options_from_fs(path._store.fs) + points = read_parquet(str(path), storage_options=opts if opts else {}) else: points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) assert isinstance(points, DaskDataFrame) diff --git a/tests/io/remote_storage/test_remote_storage.py 
b/tests/io/remote_storage/test_remote_storage.py index 44685061a..fa72ff914 100644 --- a/tests/io/remote_storage/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -2,7 +2,7 @@ Emulators must be running (e.g. Docker: docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. -tests/io/conftest.py creates the required buckets/containers when emulators are up. +tests/io/remote_storage/conftest.py creates buckets/containers when emulators are up. All remote paths use uuid.uuid4().hex so each test run writes to a unique location. """ @@ -66,7 +66,7 @@ def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: ids=["azure", "s3", "gcs"], ) -# Ensure buckets/containers exist on emulators before any test (see tests/io/conftest.py) +# Ensure buckets/containers exist on emulators before any test (see tests/io/remote_storage/conftest.py). pytestmark = pytest.mark.usefixtures("_remote_storage_buckets_containers") From df7be9a0341a05b671118683e9d78b4d20f509cc Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 12:01:52 +0200 Subject: [PATCH 20/32] Add fsspec integration by adding support for cloud object store protocols and improving storage options handling for parquet files. 
--- src/spatialdata/_io/_utils.py | 104 +++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 27 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 4553ab664..adc25a0ce 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -75,38 +75,83 @@ def __fspath__(self) -> str: return str(self) +_PARQUET_FSSPEC_NAMES: frozenset[str] = frozenset( + {"AzureBlobFileSystem", "ExtendedGcsFileSystem", "GCSFileSystem", "MotoS3FS", "S3FileSystem"} +) +_CLOUD_OBJECT_STORE_PROTOCOLS: frozenset[str] = frozenset({"abfs", "adl", "az", "gcs", "gs", "s3", "s3a"}) + + +def _unwrap_fsspec_sync_fs(fs: Any) -> Any: + inner = getattr(fs, "sync_fs", None) + if inner is not None and inner is not fs: + return _unwrap_fsspec_sync_fs(inner) + return fs + + +def _fsspec_protocols(core: Any) -> set[str]: + raw = getattr(core, "protocol", None) + if isinstance(raw, str): + return {raw} + if isinstance(raw, (list, tuple)): + return set(raw) + return set() + + +def _require_known_parquet_fsspec(core: Any) -> None: + if type(core).__name__ in _PARQUET_FSSPEC_NAMES: + return + supported = ", ".join(sorted(_PARQUET_FSSPEC_NAMES)) + label = f"{type(core).__module__}.{type(core).__qualname__}" + raise ValueError( + f"Cannot derive parquet storage_options from filesystem {label!r}. Supported filesystem classes: {supported}." + ) + + +def _check_fsspec_at_remote_store_open(fs: Any) -> None: + """If ``fs`` looks like S3/GCS/Azure, ensure we can build parquet ``storage_options`` for it.""" + core = _unwrap_fsspec_sync_fs(fs) + if not (_fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS): + return + _require_known_parquet_fsspec(core) + + def _storage_options_from_fs(fs: Any) -> dict[str, Any]: - """Build storage_options dict from an fsspec filesystem for use with to_parquet/write_parquet. + """Build storage_options dict from an fsspec filesystem for use with to_parquet/read_parquet. 
- Ensures parquet writes to remote stores (Azure, S3, GCS) use the same credentials as the - zarr store. + Unwraps ``sync_fs`` chains (e.g. async wrappers). Raises if the implementation is not one we + support for adlfs / s3fs / gcsfs-style credentials. """ + core = _unwrap_fsspec_sync_fs(fs) + _require_known_parquet_fsspec(core) out: dict[str, Any] = {} - name = type(fs).__name__ + name = type(core).__name__ if name == "AzureBlobFileSystem": - if getattr(fs, "connection_string", None): - out["connection_string"] = fs.connection_string - elif getattr(fs, "account_name", None) and getattr(fs, "account_key", None): - out["account_name"] = fs.account_name - out["account_key"] = fs.account_key - if getattr(fs, "anon", None) is not None: - out["anon"] = fs.anon + if getattr(core, "connection_string", None): + out["connection_string"] = core.connection_string + elif getattr(core, "account_name", None) and getattr(core, "account_key", None): + out["account_name"] = core.account_name + out["account_key"] = core.account_key + if getattr(core, "anon", None) is not None: + out["anon"] = core.anon elif name in ("S3FileSystem", "MotoS3FS"): - if getattr(fs, "endpoint_url", None): - out["endpoint_url"] = fs.endpoint_url - if getattr(fs, "key", None): - out["key"] = fs.key - if getattr(fs, "secret", None): - out["secret"] = fs.secret - if getattr(fs, "anon", None) is not None: - out["anon"] = fs.anon - elif name == "GCSFileSystem": - if getattr(fs, "token", None) is not None: - out["token"] = fs.token - if getattr(fs, "_endpoint", None): - out["endpoint_url"] = fs._endpoint - if getattr(fs, "project", None): - out["project"] = fs.project + if getattr(core, "endpoint_url", None): + out["endpoint_url"] = core.endpoint_url + if getattr(core, "key", None): + out["key"] = core.key + if getattr(core, "secret", None): + out["secret"] = core.secret + if getattr(core, "anon", None) is not None: + out["anon"] = core.anon + elif name in ("GCSFileSystem", "ExtendedGcsFileSystem"): + if 
getattr(core, "token", None) is not None: + out["token"] = core.token + if getattr(core, "_endpoint", None): + out["endpoint_url"] = core._endpoint + if getattr(core, "project", None): + out["project"] = core.project + else: + raise AssertionError(f"Unhandled fsspec class {name!r} (out of sync with _PARQUET_FSSPEC_NAMES)") + return out @@ -587,7 +632,9 @@ def _resolve_zarr_store( TypeError If the input type is unsupported. ValueError - If a `zarr.Group` has an unsupported store type. + If a `zarr.Group` has an unsupported store type, or if the fsspec filesystem uses a cloud + object-store protocol (S3, GCS, Azure, …) but is not a supported implementation for parquet + ``storage_options`` (see :func:`_check_fsspec_at_remote_store_open`). """ # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): @@ -606,6 +653,7 @@ def _resolve_zarr_store( if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group + _check_fsspec_at_remote_store_open(path.store.fs) return FsspecStore( fs=_ensure_async_fs(path.store.fs), path=_join_fsspec_store_path(path.store.path, path.path), @@ -617,9 +665,11 @@ def _resolve_zarr_store( raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, _FsspecStoreRoot): # path-like from read_zarr that carries the same fs (preserves Azure/GCS credentials) + _check_fsspec_at_remote_store_open(path._store.fs) return FsspecStore(_ensure_async_fs(path._store.fs), path=path._path, **kwargs) if isinstance(path, UPath): # if input is a remote UPath, map it to an FSStore (check before StoreLike to avoid UnionType isinstance) + _check_fsspec_at_remote_store_open(path.fs) return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore From 
a0bcc65a36c516d84a41b41e08fa47622351ad79 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 13:54:51 +0200 Subject: [PATCH 21/32] Enhance path handling for hierarchical URIs in SpatialData and related utilities. --- src/spatialdata/_core/spatialdata.py | 13 ++++++++++--- src/spatialdata/_io/_utils.py | 1 - src/spatialdata/_io/io_zarr.py | 19 +++++++++++++++++-- .../io/remote_storage/test_remote_storage.py | 7 +++++++ 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 810713d45..3c06f8f44 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1043,7 +1043,11 @@ def _validate_can_safely_write_to_path( _resolve_zarr_store, ) - if isinstance(file_path, str): + # Hierarchical URIs (``scheme://…``) must become UPath: plain ``Path(str)`` breaks cloud URLs + # (S3-compatible stores, Azure ``abfs://`` / ``az://``, GCS ``gs://``, ``https://``, fsspec chains, etc.). + if isinstance(file_path, str) and "://" in file_path: + file_path = UPath(file_path) + elif isinstance(file_path, str): file_path = Path(file_path) if not isinstance(file_path, (Path, UPath)): @@ -1186,9 +1190,12 @@ def write( if self.path is None: raise ValueError("file_path must be provided when SpatialData.path is not set.") file_path = self.path - if isinstance(file_path, str): + # Hierarchical URIs (``scheme://…``) must become UPath: plain ``Path(str)`` breaks cloud URLs + # (S3-compatible stores, Azure ``abfs://`` / ``az://``, GCS ``gs://``, ``https://``, fsspec chains, etc.). 
+ if isinstance(file_path, str) and "://" in file_path: + file_path = UPath(file_path) + elif isinstance(file_path, str): file_path = Path(file_path) - # Keep UPath as-is; do not convert to Path self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index adc25a0ce..9050348c4 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -638,7 +638,6 @@ def _resolve_zarr_store( """ # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): - # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 48795513c..f3506beed 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Literal, cast -import zarr.storage +import zarr from anndata import AnnData from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -14,6 +14,7 @@ from pyarrow import ArrowInvalid from upath import UPath from zarr.errors import ArrayNotFoundError +from zarr.storage import FsspecStore, LocalStore from spatialdata._core.spatialdata import SpatialData from spatialdata._io._utils import ( @@ -232,7 +233,21 @@ def read_zarr( tables=tables, attrs=attrs, ) - sdata.path = store if isinstance(store, UPath) else resolved_store.root + if isinstance(store, UPath): + sdata.path = store + elif isinstance(store, str): + sdata.path = UPath(store) if "://" in store else Path(store) + elif isinstance(store, Path): + sdata.path = store + elif isinstance(store, zarr.Group): + if isinstance(resolved_store, LocalStore): + sdata.path = Path(resolved_store.root) + elif isinstance(resolved_store, FsspecStore): + sdata.path = 
UPath(str(_FsspecStoreRoot(resolved_store))) + else: + sdata.path = None + else: + sdata.path = None return sdata diff --git a/tests/io/remote_storage/test_remote_storage.py b/tests/io/remote_storage/test_remote_storage.py index fa72ff914..065211910 100644 --- a/tests/io/remote_storage/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -136,6 +136,13 @@ def test_write_read_roundtrip_remote(self, full_sdata: SpatialData, get_upath, s assert isinstance(full_sdata.path, UPath) assert full_sdata.path == upath _assert_read_identical(full_sdata, upath) + # ``str(upath)`` drops storage options on the UPath; S3 against moto still works via + # ``AWS_*`` / ``AWS_ENDPOINT_URL`` from conftest. Azure/GCS strings would omit credentials + # or emulator endpoints, so we only assert the string-URL read path for S3 here. + if storage_name == "s3": + sdata_str_url = SpatialData.read(str(upath)) + assert isinstance(sdata_str_url.path, UPath) + assert_spatial_data_objects_are_identical(full_sdata, sdata_str_url) @REMOTE_STORAGE_PARAMS def test_path_setter_with_remote_then_operations( From f1cc6516897683aea3e2ca9a5e9c69d19cb9496b Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 13:57:51 +0200 Subject: [PATCH 22/32] Ensure existing Zarr stores are returned unchanged in _resolve_zarr_store --- src/spatialdata/_io/_utils.py | 5 +++-- tests/io/remote_storage/test_resolve_zarr_store.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 9050348c4..d776d35f9 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -671,8 +671,9 @@ def _resolve_zarr_store( _check_fsspec_at_remote_store_open(path.fs) return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) if isinstance(path, zarr.storage.StoreLike): - # if the input already a store, wrap it in an FSStore - return FsspecStore(path, **kwargs) + # Already a 
concrete store (LocalStore, FsspecStore, MemoryStore, …). Do not pass it as ``fs=`` to + # FsspecStore — that only accepts an async fsspec filesystem and raises on stores (e.g. ``async_impl``). + return path raise TypeError(f"Unsupported type: {type(path)}") diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py index d8c90d46d..c34f26eee 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -7,11 +7,21 @@ from __future__ import annotations -from zarr.storage import FsspecStore +import tempfile + +from zarr.storage import FsspecStore, LocalStore, MemoryStore from spatialdata._io._utils import _FsspecStoreRoot, _resolve_zarr_store, _storage_options_from_fs +def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: + """StoreLike inputs must not be wrapped as FsspecStore(fs=store) — that is only for async filesystems.""" + mem = MemoryStore() + assert _resolve_zarr_store(mem) is mem + loc = LocalStore(tempfile.mkdtemp()) + assert _resolve_zarr_store(loc) is loc + + def test_resolve_zarr_store_fsspec_store_root() -> None: """_FsspecStoreRoot is resolved to FsspecStore when reading from remote (e.g. 
points/shapes paths).""" import fsspec From 55ba3d08f7e2e7e3fbd1924c5000533fb1a2c7ca Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:05:43 +0200 Subject: [PATCH 23/32] remove unused fsspec async handling code and update related test documentation --- src/spatialdata/_io/_utils.py | 18 ------------------ tests/io/remote_storage/conftest.py | 6 +++++- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index d776d35f9..81623cc6b 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -15,7 +15,6 @@ from pathlib import Path from typing import Any, Literal -import fsspec.asyn as _asyn_mod import zarr from anndata import AnnData from dask._task_spec import Task @@ -731,20 +730,3 @@ def handle_read_errors( else: # on_bad_files == BadFileHandleMethod.ERROR # Let it raise exceptions yield - - -# Avoid RuntimeError "Loop is not running" when fsspec closes async sessions at process exit -# (remote storage: Azure, S3, GCS). _utils is used for all store resolution. -_orig_sync = _asyn_mod.sync - - -def _fsspec_sync_wrapped(loop: Any, func: Any, *args: Any, timeout: Any = None, **kwargs: Any) -> Any: - try: - return _orig_sync(loop, func, *args, timeout=timeout, **kwargs) - except RuntimeError as e: - if "Loop is not running" in str(e) or "different loop" in str(e): - return None - raise - - -_asyn_mod.sync = _fsspec_sync_wrapped diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 0d4d08da7..9bcc5af5e 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -28,7 +28,11 @@ def _ensure_gcs_emulator_env() -> None: def _patch_fsspec_sync_for_shutdown() -> None: - """If fsspec.asyn.sync() runs at exit when the loop is gone, return None instead of raising.""" + """If fsspec.asyn.sync() runs at exit when the loop is gone, return None instead of raising. 
+ + SpatialData does not patch ``fsspec.asyn.sync`` at import time (too broad for a library); this + hook runs only for pytest sessions that load this conftest (remote emulator tests). + """ import fsspec.asyn as asyn_mod _orig = asyn_mod.sync From 0e2e424800279639fe0641fad132cf9c7b217e71 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:17:14 +0200 Subject: [PATCH 24/32] Updating the path setter to accept strings and normalize them to Path or UPath, and add tests to verify correct coercion of string paths to appropriate types. --- src/spatialdata/_core/spatialdata.py | 14 +++++++++++--- tests/io/test_readwrite.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 3c06f8f44..f3ce37226 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -549,13 +549,21 @@ def is_backed(self) -> bool: @property def path(self) -> Path | UPath | None: - """Path to the Zarr storage.""" + """Path to the Zarr storage (always :class:`pathlib.Path` or :class:`upath.UPath` when set).""" return self._path @path.setter - def path(self, value: Path | UPath | None) -> None: - if value is None or isinstance(value, (str, Path, UPath)): + def path(self, value: str | Path | UPath | None) -> None: + if value is None: + self._path = None + elif isinstance(value, (Path, UPath)): self._path = value + elif isinstance(value, str): + # Match ``write()`` / ``_validate_can_safely_write_to_path``: keep ``self._path`` as Path | UPath only. 
+ if "://" in value: + self._path = UPath(value) + else: + self._path = Path(value) else: raise TypeError("Path must be `None`, a `str`, a `Path` or a `UPath` object.") diff --git a/tests/io/test_readwrite.py b/tests/io/test_readwrite.py index 209a43046..bc220c073 100644 --- a/tests/io/test_readwrite.py +++ b/tests/io/test_readwrite.py @@ -1190,6 +1190,17 @@ def test_read_sdata(tmp_path: Path, points: SpatialData) -> None: assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_zarr_group) +def test_path_setter_coerces_str_to_path_or_upath(tmp_path: Path) -> None: + """``SpatialData.path`` is stored as Path | UPath | None; strings are normalized like ``write()``.""" + sdata = SpatialData() + p = tmp_path / "store.zarr" + sdata.path = str(p) + assert isinstance(sdata.path, Path) + assert sdata.path == p + sdata.path = "s3://bucket/key.zarr" + assert isinstance(sdata.path, UPath) + + def test_sdata_with_nan_in_obs(tmp_path: Path) -> None: """Test writing SpatialData with mixed string/NaN values in obs works correctly. From ce20830e1e87f24a00178cb43b86f3ed90f56074 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:19:20 +0200 Subject: [PATCH 25/32] write method safeguards for local and remote paths in SpatialData. --- src/spatialdata/_core/spatialdata.py | 11 ++++++++++- src/spatialdata/_io/_utils.py | 11 +++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index f3ce37226..184421ed2 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1044,6 +1044,13 @@ def _validate_can_safely_write_to_path( overwrite: bool = False, saving_an_element: bool = False, ) -> None: + """ + Guard against unsafe writes for **local** paths (zarr check, Dask backing, subfolders). + + For :class:`upath.UPath`, only "store exists vs ``overwrite``" is checked. 
Local Dask-backing + and subfolder checks are omitted because backing paths are filesystem-local and are not + compared to object-store keys; ``overwrite=True`` on remote URLs must be chosen carefully. + """ from spatialdata._io._utils import ( _backed_elements_contained_in_path, _is_subfolder, @@ -1151,7 +1158,9 @@ def write( The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). overwrite If `True`, overwrite the Zarr store if it already exists. If `False`, `write()` will fail if the Zarr store - already exists. + already exists. For remote paths (:class:`upath.UPath`), the extra safeguards used for local paths (that + Dask-backed files are not inside the write target) are not applied; use ``overwrite=True`` only when you + are sure the destination store may be replaced. consolidate_metadata If `True`, triggers :func:`zarr.convenience.consolidate_metadata`, which writes all the metadata in a single file at the root directory of the store. This makes the data cloud accessible, which is required for certain diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 81623cc6b..ec71e2031 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -516,9 +516,14 @@ def _backed_elements_contained_in_path( ----- If an object does not have a Dask computational graph, it will return an empty list. It is possible for a single SpatialElement to contain multiple files in their Dask computational graph. + + For a remote ``path`` (:class:`upath.UPath`), this always returns an empty list: Dask backing paths + are resolved as local filesystem paths, so they cannot be compared to object-store locations. + :meth:`spatialdata.SpatialData.write` therefore skips the local "backing files in target" guard + for remote targets; ``overwrite=True`` on a remote URL must be used only when overwriting is safe. 
""" if isinstance(path, UPath): - return [] # no local backing files are "contained" in a remote path + return [] if not isinstance(path, Path): raise TypeError(f"Expected a Path or UPath object, got {type(path)}") return [_is_subfolder(parent=path, child=Path(fp)) for fp in get_dask_backing_files(object)] @@ -552,8 +557,10 @@ def _is_element_self_contained( element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, element_path: Path | UPath, ) -> bool: + """Whether element Dask graphs only reference files under ``element_path`` (local) or N/A (remote).""" if isinstance(element_path, UPath): - return True # treat remote-backed as self-contained for this check + # Backing-file paths are local; cannot relate them to remote keys—assume OK for this heuristic. + return True if isinstance(element, DaskDataFrame): pass # TODO when running test_save_transformations it seems that for the same element this is called multiple times From fbc3040eb04358319c0defc2a93bde4082117e7e Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:33:56 +0200 Subject: [PATCH 26/32] Support for UPath in data reading functions and improve error handling for unsupported protocols in storage options, and add test cases to validate new functionality and ensure compatibility with cloud object store protocols. 
--- .github/workflows/test.yaml | 4 +- src/spatialdata/_io/_utils.py | 76 +++++++++++++------ src/spatialdata/_io/io_points.py | 5 +- src/spatialdata/_io/io_shapes.py | 5 +- src/spatialdata/_io/io_table.py | 3 +- tests/io/remote_storage/conftest.py | 9 ++- .../remote_storage/test_resolve_zarr_store.py | 32 +++++++- 7 files changed, 97 insertions(+), 37 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a626165c8..df6637ea9 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -53,7 +53,9 @@ jobs: fi fi uv sync --group=test - # Start storage emulators (S3, Azure, GCS) only on Linux; service containers are not available on Windows/macOS + # Start storage emulators (S3, Azure, GCS) only on Linux; Docker service containers are not available on + # Windows/macOS runners, so tests/io/remote_storage/ is skipped there (see Test step). Remote I/O is still + # exercised on every PR via the Ubuntu matrix jobs. - name: Build and start storage emulators if: matrix.os == 'ubuntu-latest' run: | diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index ec71e2031..9fc247c69 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -74,9 +74,6 @@ def __fspath__(self) -> str: return str(self) -_PARQUET_FSSPEC_NAMES: frozenset[str] = frozenset( - {"AzureBlobFileSystem", "ExtendedGcsFileSystem", "GCSFileSystem", "MotoS3FS", "S3FileSystem"} -) _CLOUD_OBJECT_STORE_PROTOCOLS: frozenset[str] = frozenset({"abfs", "adl", "az", "gcs", "gs", "s3", "s3a"}) @@ -96,35 +93,54 @@ def _fsspec_protocols(core: Any) -> set[str]: return set() -def _require_known_parquet_fsspec(core: Any) -> None: - if type(core).__name__ in _PARQUET_FSSPEC_NAMES: - return - supported = ", ".join(sorted(_PARQUET_FSSPEC_NAMES)) - label = f"{type(core).__module__}.{type(core).__qualname__}" - raise ValueError( - f"Cannot derive parquet storage_options from filesystem {label!r}. 
Supported filesystem classes: {supported}." - ) +def _cloud_parquet_protocol_family(core: Any) -> Literal["azure", "gcs", "s3"] | None: + """Map fsspec filesystem protocol(s) to how we extract parquet ``storage_options`` (not by class name).""" + protos = _fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS + if not protos: + return None + if protos & {"s3", "s3a"}: + return "s3" + if protos & {"abfs", "adl", "az"}: + return "azure" + if protos & {"gcs", "gs"}: + return "gcs" + return None def _check_fsspec_at_remote_store_open(fs: Any) -> None: """If ``fs`` looks like S3/GCS/Azure, ensure we can build parquet ``storage_options`` for it.""" core = _unwrap_fsspec_sync_fs(fs) - if not (_fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS): + protos = _fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS + if not protos: return - _require_known_parquet_fsspec(core) + if _cloud_parquet_protocol_family(core) is None: + label = f"{type(core).__module__}.{type(core).__qualname__}" + raise ValueError( + f"Cannot derive parquet storage_options for filesystem {label!r} with protocol(s) {protos!r}. " + "Supported protocol families: S3 (s3, s3a), Azure (abfs, adl, az), GCS (gcs, gs). " + "Custom implementations should expose a matching ``protocol`` (see fsspec)." + ) def _storage_options_from_fs(fs: Any) -> dict[str, Any]: """Build storage_options dict from an fsspec filesystem for use with to_parquet/read_parquet. - Unwraps ``sync_fs`` chains (e.g. async wrappers). Raises if the implementation is not one we - support for adlfs / s3fs / gcsfs-style credentials. + Unwraps ``sync_fs`` chains (e.g. async wrappers). Dispatches by **reported fsspec protocol** (``fs.protocol``), + not by concrete class name, so subclasses and thin wrappers that speak ``s3``/``gs``/``az`` still work as long as + they expose the credential attributes we copy (same shape as s3fs, gcsfs, adlfs). 
""" core = _unwrap_fsspec_sync_fs(fs) - _require_known_parquet_fsspec(core) + family = _cloud_parquet_protocol_family(core) + if family is None: + label = f"{type(core).__module__}.{type(core).__qualname__}" + protos = _fsspec_protocols(core) + raise ValueError( + f"Cannot derive parquet storage_options from filesystem {label!r} (protocols {protos!r}). " + "Expected an object-store protocol among " + f"{sorted(_CLOUD_OBJECT_STORE_PROTOCOLS)}." + ) out: dict[str, Any] = {} - name = type(core).__name__ - if name == "AzureBlobFileSystem": + if family == "azure": if getattr(core, "connection_string", None): out["connection_string"] = core.connection_string elif getattr(core, "account_name", None) and getattr(core, "account_key", None): @@ -132,7 +148,7 @@ def _storage_options_from_fs(fs: Any) -> dict[str, Any]: out["account_key"] = core.account_key if getattr(core, "anon", None) is not None: out["anon"] = core.anon - elif name in ("S3FileSystem", "MotoS3FS"): + elif family == "s3": if getattr(core, "endpoint_url", None): out["endpoint_url"] = core.endpoint_url if getattr(core, "key", None): @@ -141,7 +157,7 @@ def _storage_options_from_fs(fs: Any) -> dict[str, Any]: out["secret"] = core.secret if getattr(core, "anon", None) is not None: out["anon"] = core.anon - elif name in ("GCSFileSystem", "ExtendedGcsFileSystem"): + elif family == "gcs": if getattr(core, "token", None) is not None: out["token"] = core.token if getattr(core, "_endpoint", None): @@ -149,7 +165,7 @@ def _storage_options_from_fs(fs: Any) -> dict[str, Any]: if getattr(core, "project", None): out["project"] = core.project else: - raise AssertionError(f"Unhandled fsspec class {name!r} (out of sync with _PARQUET_FSSPEC_NAMES)") + raise AssertionError(f"Unhandled protocol family {family!r}") return out @@ -651,6 +667,7 @@ def _resolve_zarr_store( return LocalStore(path.path) if isinstance(path, zarr.Group): + _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) # if the input is a zarr.Group, 
wrap it with a store if isinstance(path.store, LocalStore): store_path = UPath(path.store.root) / path.path @@ -664,9 +681,20 @@ def _resolve_zarr_store( path=_join_fsspec_store_path(path.store.path, path.path), **kwargs, ) - if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): - # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store - return path.store.store + if _cms is not None and isinstance(path.store, _cms): + # Unwrap and apply the same async-fs + parquet guards as a direct FsspecStore on the group. + inner = path.store.store + if isinstance(inner, FsspecStore): + _check_fsspec_at_remote_store_open(inner.fs) + return FsspecStore( + fs=_ensure_async_fs(inner.fs), + path=_join_fsspec_store_path(inner.path, path.path), + **kwargs, + ) + if isinstance(inner, LocalStore): + store_path = UPath(inner.root) / path.path + return LocalStore(store_path.path) + return inner raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, _FsspecStoreRoot): # path-like from read_zarr that carries the same fs (preserves Azure/GCS credentials) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index be2e30796..90d784742 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -7,6 +7,7 @@ from dask.dataframe import DataFrame as DaskDataFrame from dask.dataframe import read_parquet from ome_zarr.format import Format +from upath import UPath from spatialdata._io._utils import ( _FsspecStoreRoot, @@ -26,9 +27,9 @@ def _read_points( - store: str | Path, + store: str | Path | UPath, ) -> DaskDataFrame: - """Read points from a zarr store.""" + """Read points from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" resolved_store = _resolve_zarr_store(store) f = zarr.open(resolved_store, mode="r") diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index cd521f51b..ccba50dae 100644 --- 
a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -13,6 +13,7 @@ from natsort import natsorted from ome_zarr.format import Format from shapely import from_ragged_array, to_ragged_array +from upath import UPath from spatialdata._io._utils import ( _FsspecStoreRoot, @@ -39,9 +40,9 @@ def _read_shapes( - store: str | Path, + store: str | Path | UPath, ) -> GeoDataFrame: - """Read shapes from a zarr store.""" + """Read shapes from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" resolved_store = _resolve_zarr_store(store) f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 03ec78526..11414fd66 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -8,6 +8,7 @@ from anndata import read_zarr as read_anndata_zarr from anndata._io.specs import write_elem as write_adata from ome_zarr.format import Format +from upath import UPath from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import ( @@ -20,7 +21,7 @@ from spatialdata.models import TableModel, get_table_keys -def _read_table(store: str | Path) -> AnnData: +def _read_table(store: str | Path | UPath) -> AnnData: resolved_store = _resolve_zarr_store(store) table = read_anndata_zarr(resolved_store) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 9bcc5af5e..62f87b6c2 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -1,8 +1,11 @@ -"""Minimal pytest config for IO tests. Creates buckets/containers when remote emulators are running. +"""Pytest hooks for ``tests/io/remote_storage/`` only (not loaded from repo-root ``tests/conftest.py``). -Assumes emulators are already running (e.g. Docker: - docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). 
+Creates buckets/containers when remote emulators are running. Assumes emulators are already up +(e.g. Docker: ``docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators``). Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. + +``pytest_configure`` here patches ``fsspec.asyn.sync`` and ``gcsfs`` session teardown for this subtree +only; the library package itself does not apply those patches globally. """ from __future__ import annotations diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py index c34f26eee..57a0a2257 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -9,6 +9,7 @@ import tempfile +import pytest from zarr.storage import FsspecStore, LocalStore, MemoryStore from spatialdata._io._utils import _FsspecStoreRoot, _resolve_zarr_store, _storage_options_from_fs @@ -39,12 +40,11 @@ def test_storage_options_from_fs_azure_account_key() -> None: """_storage_options_from_fs extracts Azure credentials for writing parquet to remote Azure Blob.""" class AzureBlobFileSystemMock: + protocol = "abfs" account_name = "dev" account_key = "key123" connection_string = None anon = None - - AzureBlobFileSystemMock.__name__ = "AzureBlobFileSystem" out = _storage_options_from_fs(AzureBlobFileSystemMock()) assert out["account_name"] == "dev" assert out["account_key"] == "key123" @@ -54,12 +54,36 @@ def test_storage_options_from_fs_gcs_endpoint() -> None: """_storage_options_from_fs extracts GCS endpoint and project for writing parquet to remote GCS.""" class GCSFileSystemMock: + protocol = "gs" token = "anon" _endpoint = "http://localhost:4443" project = "test" - - GCSFileSystemMock.__name__ = "GCSFileSystem" out = _storage_options_from_fs(GCSFileSystemMock()) assert out["token"] == "anon" assert out["endpoint_url"] == "http://localhost:4443" assert out["project"] == "test" + + +def 
test_storage_options_from_fs_s3_by_protocol_not_class_name() -> None: + """Subclasses / wrappers are accepted when ``protocol`` is s3 and attrs match s3fs-style credentials.""" + + class CustomS3Wrapper: + protocol = "s3" + endpoint_url = "http://127.0.0.1:9000" + key = "access" + secret = "secret" + anon = False + + out = _storage_options_from_fs(CustomS3Wrapper()) + assert out["endpoint_url"] == "http://127.0.0.1:9000" + assert out["key"] == "access" + assert out["secret"] == "secret" + assert out["anon"] is False + + +def test_storage_options_from_fs_requires_object_store_protocol() -> None: + class NoCloud: + protocol = "file" + + with pytest.raises(ValueError, match="Cannot derive parquet storage_options"): + _storage_options_from_fs(NoCloud()) From 175fbea8a7ef8937a86523dcf18fdbb67888c4d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 12:34:12 +0000 Subject: [PATCH 27/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/remote_storage/test_resolve_zarr_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py index 57a0a2257..d37e0aa35 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -45,6 +45,7 @@ class AzureBlobFileSystemMock: account_key = "key123" connection_string = None anon = None + out = _storage_options_from_fs(AzureBlobFileSystemMock()) assert out["account_name"] == "dev" assert out["account_key"] == "key123" @@ -58,6 +59,7 @@ class GCSFileSystemMock: token = "anon" _endpoint = "http://localhost:4443" project = "test" + out = _storage_options_from_fs(GCSFileSystemMock()) assert out["token"] == "anon" assert out["endpoint_url"] == "http://localhost:4443" From 3beed0e00aeaa6c90f14821508fdb55d7c122891 Mon Sep 17 00:00:00 2001 
From: SamirMoustafa Date: Wed, 15 Apr 2026 17:50:49 +0200 Subject: [PATCH 28/32] Refactor full_sdata fixture for consistency in remote I/O tests. --- tests/conftest.py | 12 +----------- tests/io/remote_storage/conftest.py | 4 ++++ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a9aa8ebaa..c97939129 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,5 @@ from __future__ import annotations -import os - -os.environ.setdefault("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false") - from collections.abc import Sequence from pathlib import Path from typing import Any @@ -93,18 +89,12 @@ def tables() -> list[AnnData]: @pytest.fixture() def full_sdata() -> SpatialData: - # Use two regions so the table categorical has two categories; otherwise anndata does not - # write the obs/region/codes/c/0 chunk (only codes/zarr.json), causing 404 on remote read. return SpatialData( images=_get_images(), labels=_get_labels(), shapes=_get_shapes(), points=_get_points(), - tables=_get_tables( - region=["labels2d", "poly"], - region_key="region", - instance_key="instance_id", - ), + tables=_get_tables(region="labels2d", region_key="region", instance_key="instance_id"), ) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 62f87b6c2..2ed93e9b6 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -11,12 +11,16 @@ from __future__ import annotations import os + +os.environ.setdefault("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false") + import socket import time import pytest + def _ensure_gcs_emulator_env() -> None: """Point google-cloud-storage / gcsfs defaults at fake-gcs-server (not production).""" raw = os.environ.get("STORAGE_EMULATOR_HOST", "").strip() From a7c51c23d84b4b009261c0aa4bb66a01cc3e7efd Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 17:58:18 +0200 Subject: [PATCH 29/32] rollback the unneeded changes for test cases 
within the core --- tests/core/operations/test_spatialdata_operations.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/core/operations/test_spatialdata_operations.py b/tests/core/operations/test_spatialdata_operations.py index a898bed0c..68b538e0a 100644 --- a/tests/core/operations/test_spatialdata_operations.py +++ b/tests/core/operations/test_spatialdata_operations.py @@ -559,15 +559,14 @@ def test_init_from_elements(full_sdata: SpatialData) -> None: def test_subset(full_sdata: SpatialData) -> None: - # Exclude labels and poly so the default table (annotating labels2d and poly) is not included - element_names = ["image2d", "points_0", "circles"] + element_names = ["image2d", "points_0", "circles", "poly"] subset0 = full_sdata.subset(element_names) unique_names = set() for _, k, _ in subset0.gen_spatial_elements(): unique_names.add(k) assert "image3d_xarray" in full_sdata.images assert unique_names == set(element_names) - # no table since neither labels2d nor poly are in the subset + # no table since the labels are not present in the subset assert "table" not in subset0.tables adata = AnnData( @@ -676,9 +675,7 @@ def test_transform_to_data_extent(full_sdata: SpatialData, maintain_positioning: def test_validate_table_in_spatialdata(full_sdata): table = full_sdata["table"] region, region_key, _ = get_table_keys(table) - # full_sdata uses two regions (labels2d, poly) so the table annotates both - expected = {"labels2d", "poly"} - assert set(region if isinstance(region, list) else [region]) == expected + assert region == "labels2d" full_sdata.validate_table_in_spatialdata(table) From 6443422cd91d8d332628f26640e325953e17e113 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 18:01:27 +0200 Subject: [PATCH 30/32] rollback the unneeded changes for test cases within the query --- tests/core/query/test_relational_query.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/core/query/test_relational_query.py 
b/tests/core/query/test_relational_query.py index c28725681..63e7a6f19 100644 --- a/tests/core/query/test_relational_query.py +++ b/tests/core/query/test_relational_query.py @@ -914,9 +914,6 @@ def test_filter_table_non_annotating(full_sdata): def test_labels_table_joins(full_sdata): - # Restrict table to labels2d only so the join returns one row per label (full_sdata default has two regions) - full_sdata["table"].obs["region"] = pd.Categorical(["labels2d"] * full_sdata["table"].n_obs) - full_sdata["table"].uns["spatialdata_attrs"]["region"] = "labels2d" element_dict, table = join_spatialelement_table( sdata=full_sdata, spatial_element_names="labels2d", From be230218371b870f341b4807aee8f4d4288989a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:01:44 +0000 Subject: [PATCH 31/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/remote_storage/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 2ed93e9b6..0a0b608b9 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -20,7 +20,6 @@ import pytest - def _ensure_gcs_emulator_env() -> None: """Point google-cloud-storage / gcsfs defaults at fake-gcs-server (not production).""" raw = os.environ.get("STORAGE_EMULATOR_HOST", "").strip() From 53c45eefae006f255737d568962e81ab6bc2ae4d Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 16 Apr 2026 10:56:55 +0200 Subject: [PATCH 32/32] Add a dedicated job for remote storage tests, update coverage upload configurations, and refine test execution conditions for different operating systems. 
--- .github/workflows/test.yaml | 61 +++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index df6637ea9..5849c4dfa 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -53,18 +53,52 @@ jobs: fi fi uv sync --group=test - # Start storage emulators (S3, Azure, GCS) only on Linux; Docker service containers are not available on - # Windows/macOS runners, so tests/io/remote_storage/ is skipped there (see Test step). Remote I/O is still - # exercised on every PR via the Ubuntu matrix jobs. + - name: Test + env: + MPLBACKEND: agg + PLATFORM: ${{ matrix.os }} + DISPLAY: :42 + run: | + uv run pytest --cov --color=yes --cov-report=xml --ignore=tests/io/remote_storage/ + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5 + with: + name: coverage + verbose: true + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + + test-remote-storage: + runs-on: ubuntu-latest + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + python: ["3.11", "3.13"] + env: + MPLBACKEND: agg + PLATFORM: ubuntu-latest + DISPLAY: :42 + GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT: "false" + steps: + - uses: actions/checkout@v6 + - uses: astral-sh/setup-uv@v7 + with: + version: "latest" + python-version: ${{ matrix.python }} + - name: Install dependencies + run: | + uv add dask + uv sync --group=test - name: Build and start storage emulators - if: matrix.os == 'ubuntu-latest' run: | docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . docker run --rm -d --name spatialdata-emulators \ -p 5000:5000 -p 10000:10000 -p 4443:4443 \ spatialdata-emulators - name: Wait for emulator ports - if: matrix.os == 'ubuntu-latest' run: | echo "Waiting for S3 (5000), Azure (10000), GCS (4443)..." 
python3 -c " @@ -80,24 +114,13 @@ jobs: else: raise SystemExit('Emulators did not become ready.') " - # On Linux, emulators run above so full suite (incl. tests/io/remote_storage/) runs. On Windows/macOS, skip remote_storage. - - name: Test - env: - MPLBACKEND: agg - PLATFORM: ${{ matrix.os }} - DISPLAY: :42 - # gcsfs otherwise defaults to ExtendedGcsFileSystem (prod Storage Control gRPC; breaks fake-gcs-server). - GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT: "false" + - name: Test remote storage run: | - if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then - uv run pytest --cov --color=yes --cov-report=xml - else - uv run pytest --cov --color=yes --cov-report=xml --ignore=tests/io/remote_storage/ - fi + uv run pytest tests/io/remote_storage/ --cov --color=yes --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: - name: coverage + name: coverage-remote-storage-${{ matrix.python }} verbose: true env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}