From 5987137363bc4f53ee9ac7d84012035fd090ea8e Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Sat, 28 Feb 2026 02:13:27 +0100 Subject: [PATCH 01/51] Add initial tests for remote storage workflows with UPath --- tests/io/test_remote_storage.py | 185 ++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 tests/io/test_remote_storage.py diff --git a/tests/io/test_remote_storage.py b/tests/io/test_remote_storage.py new file mode 100644 index 000000000..c24f1bcd1 --- /dev/null +++ b/tests/io/test_remote_storage.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import pytest +from upath import UPath + +from spatialdata import SpatialData +from spatialdata.testing import assert_spatial_data_objects_are_identical + +# Azure emulator connection string (Azurite default) +# Source: https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string +AZURE_CONNECTION_STRING = ( + "DefaultEndpointsProtocol=http;" + "AccountName=devstoreaccount1;" + "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" +) + + +def _get_azure_upath(container: str = "test-container", path: str = "test.zarr") -> UPath: + """Create Azure UPath for testing with Azurite (local emulator).""" + return UPath(f"az://{container}/{path}", connection_string=AZURE_CONNECTION_STRING) + + +def _get_s3_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: + """Create S3 UPath for testing. + + Uses anon=True for public buckets. For private buckets with moto (local S3 emulator), + would use: endpoint_url="http://127.0.0.1:5555/", AWS_ACCESS_KEY_ID="testing", etc. 
+ """ + return UPath(f"s3://{container}/{path}", anon=True) + + +def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: + """Create GCS UPath for testing with fake-gcs-server (local GCS emulator).""" + return UPath(f"gs://{container}/{path}", endpoint_url="http://localhost:4443") + + +# Shared parametrization for remote storage backends (azure, s3, gcs). +GET_UPATH_PARAMS = pytest.mark.parametrize( + "get_upath", [_get_azure_upath, _get_s3_upath, _get_gcs_upath], ids=["azure", "s3", "gcs"] +) +REMOTE_STORAGE_PARAMS = pytest.mark.parametrize( + "get_upath,storage_name", + [(_get_azure_upath, "azure"), (_get_s3_upath, "s3"), (_get_gcs_upath, "gcs")], + ids=["azure", "s3", "gcs"], +) + + +def _assert_read_identical(expected: SpatialData, upath: UPath, *, check_path: bool = True) -> None: + """Read SpatialData from upath and assert it equals expected; optionally assert path.""" + sdata_read = SpatialData.read(upath) + if check_path: + assert isinstance(sdata_read.path, UPath) + assert sdata_read.path == upath + assert_spatial_data_objects_are_identical(expected, sdata_read) + + +class TestPathSetter: + """Test SpatialData.path setter with UPath objects.""" + + @GET_UPATH_PARAMS + def test_path_setter_accepts_upath(self, get_upath) -> None: + """Test that SpatialData.path setter accepts UPath for remote storage. + + This test fails, reproducing issue #441: SpatialData.path setter only accepts + None | str | Path, not UPath, preventing the use of remote storage. + """ + sdata = SpatialData() + upath = get_upath() + sdata.path = upath + assert sdata.path == upath + + @GET_UPATH_PARAMS + def test_write_with_upath_sets_path(self, get_upath) -> None: + """Test that writing to UPath sets SpatialData.path correctly. + + This test fails because SpatialData.write() rejects UPath in + _validate_can_safely_write_to_path() before it can set sdata.path. 
+ """ + sdata = SpatialData() + upath = get_upath() + sdata.write(upath) + assert isinstance(sdata.path, UPath) + + def test_path_setter_rejects_other_types(self) -> None: + """Test that SpatialData.path setter rejects other types.""" + sdata = SpatialData() + + with pytest.raises(TypeError, match="Path must be.*str.*Path"): + sdata.path = 123 + + with pytest.raises(TypeError, match="Path must be.*str.*Path"): + sdata.path = {"not": "a path"} + + +class TestRemoteStorage: + """Test end-to-end remote storage workflows with UPath. + + Note: These tests require appropriate emulators running (Azurite for Azure, + moto for S3, fake-gcs-server for GCS). Tests will fail if emulators are not available. + """ + + @REMOTE_STORAGE_PARAMS + def test_write_read_roundtrip_remote( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test writing and reading SpatialData to/from remote storage. + + This test verifies the full workflow: + 1. Write SpatialData to remote storage using UPath + 2. Read SpatialData from remote storage using UPath + 3. Verify data integrity (round-trip) + """ + upath = get_upath(container=f"test-{storage_name}", path=f"roundtrip-{id(full_sdata)}.zarr") + + full_sdata.write(upath, overwrite=True) + assert isinstance(full_sdata.path, UPath) + assert full_sdata.path == upath + + _assert_read_identical(full_sdata, upath) + + @REMOTE_STORAGE_PARAMS + def test_path_setter_with_remote_then_operations( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test setting remote path, then performing operations. + + This test verifies that after setting a remote path: + 1. Path is correctly stored + 2. Write operations work + 3. 
Read operations work + """ + upath = get_upath(container=f"test-{storage_name}", path=f"operations-{id(full_sdata)}.zarr") + + full_sdata.path = upath + assert full_sdata.path == upath + assert full_sdata.is_backed() is True + + full_sdata.write(overwrite=True) + assert full_sdata.path == upath + + _assert_read_identical(full_sdata, upath) + + @REMOTE_STORAGE_PARAMS + def test_overwrite_existing_remote_data( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test overwriting existing data in remote storage. + + Verifies that overwriting existing remote data works (path-exists handling) + and data integrity after overwrite. Round-trip is covered by + test_write_read_roundtrip_remote. + """ + upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{id(full_sdata)}.zarr") + + full_sdata.write(upath, overwrite=True) + full_sdata.write(upath, overwrite=True) + _assert_read_identical(full_sdata, upath, check_path=False) + + @REMOTE_STORAGE_PARAMS + def test_write_element_to_remote_storage( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + """Test writing individual elements to remote storage using write_element(). + + This test verifies that: + 1. Setting path to remote UPath works + 2. write_element() works with remote storage + 3. 
Written elements can be read back correctly + """ + upath = get_upath(container=f"test-{storage_name}", path=f"write-element-{id(full_sdata)}.zarr") + + # Create empty SpatialData and write to remote storage + empty_sdata = SpatialData() + empty_sdata.write(upath, overwrite=True) + + # Set path and write individual elements + full_sdata.path = upath + assert full_sdata.path == upath + + # Write each element type individually + for element_type, element_name, _ in full_sdata.gen_elements(): + full_sdata.write_element(element_name, overwrite=True) + + _assert_read_identical(full_sdata, upath, check_path=False) From 865eb76eb2afa1a8cb7d4502a230becb00dd3afa Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:21:53 +0100 Subject: [PATCH 02/51] io: add dask.array.to_zarr compat for ome_zarr kwargs Patch da.to_zarr so ome_zarr's **kwargs are forwarded as zarr_array_kwargs, avoiding FutureWarning and keeping behavior correct. --- src/spatialdata/_io/__init__.py | 2 + src/spatialdata/_io/_dask_zarr_compat.py | 52 ++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 src/spatialdata/_io/_dask_zarr_compat.py diff --git a/src/spatialdata/_io/__init__.py b/src/spatialdata/_io/__init__.py index 38ff8c6bb..9e4b11de1 100644 --- a/src/spatialdata/_io/__init__.py +++ b/src/spatialdata/_io/__init__.py @@ -1,5 +1,7 @@ from __future__ import annotations +# Patch da.to_zarr so ome_zarr's **kwargs are passed as zarr_array_kwargs (avoids FutureWarning) +import spatialdata._io._dask_zarr_compat # noqa: F401 from spatialdata._io._utils import get_dask_backing_files from spatialdata._io.format import SpatialDataFormatType from spatialdata._io.io_points import write_points diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py new file mode 100644 index 000000000..350207056 --- /dev/null +++ b/src/spatialdata/_io/_dask_zarr_compat.py @@ -0,0 +1,52 @@ +"""Compatibility layer for dask.array.to_zarr when callers 
pass array options via **kwargs. + +ome_zarr.writer calls da.to_zarr(..., **options) with array options (compressor, dimension_names, +etc.). Dask deprecated **kwargs in favor of zarr_array_kwargs. This module patches da.to_zarr to +forward such kwargs into zarr_array_kwargs (excluding dask-internal keys like zarr_format that +zarr.Group.create_array() does not accept), avoiding the FutureWarning and keeping behavior correct. +""" + +from __future__ import annotations + +import dask.array as _da + +_orig_to_zarr = _da.to_zarr + +# Keys from ome_zarr/dask **kwargs that must not be passed to zarr.Group.create_array() +_DASK_INTERNAL_KEYS = frozenset({"zarr_format"}) + + +def _to_zarr( + arr, + url, + component=None, + storage_options=None, + region=None, + compute=True, + return_stored=False, + zarr_array_kwargs=None, + zarr_read_kwargs=None, + **kwargs, +): + """Forward deprecated **kwargs into zarr_array_kwargs, excluding _DASK_INTERNAL_KEYS.""" + if kwargs: + zarr_array_kwargs = dict(zarr_array_kwargs) if zarr_array_kwargs else {} + for k, v in kwargs.items(): + if k not in _DASK_INTERNAL_KEYS: + zarr_array_kwargs[k] = v + kwargs = {} + return _orig_to_zarr( + arr, + url, + component=component, + storage_options=storage_options, + region=region, + compute=compute, + return_stored=return_stored, + zarr_array_kwargs=zarr_array_kwargs, + zarr_read_kwargs=zarr_read_kwargs, + **kwargs, + ) + + +_da.to_zarr = _to_zarr From 2134386ca62aadcf9d6a24c132c3b8d54e1a9b5f Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:22:14 +0100 Subject: [PATCH 03/51] io: add remote storage helpers in _utils - _FsspecStoreRoot, _get_store_root for path-like store roots (local + fsspec) - _storage_options_from_fs for parquet writes to Azure/S3/GCS - _remote_zarr_store_exists, _ensure_async_fs for UPath/FsspecStore - Extend _resolve_zarr_store for UPath and _FsspecStoreRoot with async fs - _backed_elements_contained_in_path, _is_element_self_contained accept UPath --- 
src/spatialdata/_io/_utils.py | 139 ++++++++++++++++++++++++++++++++-- 1 file changed, 132 insertions(+), 7 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 6690d1118..747d8ed7b 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import filecmp +import json import os.path import re import sys @@ -23,6 +24,7 @@ from upath import UPath from upath.implementations.local import PosixUPath, WindowsUPath from xarray import DataArray, DataTree +from zarr.errors import GroupNotFoundError from zarr.storage import FsspecStore, LocalStore from spatialdata._core.spatialdata import SpatialData @@ -38,6 +40,74 @@ from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes +class _FsspecStoreRoot: + """Path-like root for FsspecStore (no .root attribute); supports __truediv__ and str() as full URL.""" + + __slots__ = ("_store", "_path") + + def __init__(self, store: FsspecStore, path: str | None = None) -> None: + self._store = store + self._path = (path or store.path).rstrip("/") + + def __truediv__(self, other: str | Path) -> _FsspecStoreRoot: + return _FsspecStoreRoot(self._store, self._path + "/" + str(other).lstrip("/")) + + def __str__(self) -> str: + protocol = getattr(self._store.fs, "protocol", None) + if isinstance(protocol, (list, tuple)): + protocol = protocol[0] if protocol else "file" + elif protocol is None: + protocol = "file" + return f"{protocol}://{self._path}" + + def __fspath__(self) -> str: + return str(self) + + +def _storage_options_from_fs(fs: Any) -> dict[str, Any]: + """Build storage_options dict from an fsspec filesystem for use with to_parquet/write_parquet. + + Ensures parquet writes to remote stores (Azure, S3, GCS) use the same credentials as the + zarr store. 
+ """ + out: dict[str, Any] = {} + name = type(fs).__name__ + if name == "AzureBlobFileSystem": + if getattr(fs, "connection_string", None): + out["connection_string"] = fs.connection_string + elif getattr(fs, "account_name", None) and getattr(fs, "account_key", None): + out["account_name"] = fs.account_name + out["account_key"] = fs.account_key + if getattr(fs, "anon", None) is not None: + out["anon"] = fs.anon + elif name in ("S3FileSystem", "MotoS3FS"): + if getattr(fs, "endpoint_url", None): + out["endpoint_url"] = fs.endpoint_url + if getattr(fs, "key", None): + out["key"] = fs.key + if getattr(fs, "secret", None): + out["secret"] = fs.secret + if getattr(fs, "anon", None) is not None: + out["anon"] = fs.anon + elif name == "GCSFileSystem": + if getattr(fs, "token", None) is not None: + out["token"] = fs.token + if getattr(fs, "_endpoint", None): + out["endpoint_url"] = fs._endpoint + if getattr(fs, "project", None): + out["project"] = fs.project + return out + + +def _get_store_root(store: LocalStore | FsspecStore) -> Path | _FsspecStoreRoot: + """Return a path-like root for the store (supports / and str()). 
Use for building paths to parquet etc.""" + if isinstance(store, LocalStore): + return Path(store.root) + if isinstance(store, FsspecStore): + return _FsspecStoreRoot(store) + raise TypeError(f"Unsupported store type: {type(store)}") + + def _get_transformations_from_ngff_dict( list_of_encoded_ngff_transformations: list[dict[str, Any]], ) -> MappingToCoordinateSystem_t: @@ -370,7 +440,9 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No files.append(os.path.realpath(parquet_file)) -def _backed_elements_contained_in_path(path: Path, object: SpatialData | SpatialElement | AnnData) -> list[bool]: +def _backed_elements_contained_in_path( + path: Path | UPath, object: SpatialData | SpatialElement | AnnData +) -> list[bool]: """ Return the list of boolean values indicating if backing files for an object are child directory of a path. @@ -390,8 +462,10 @@ def _backed_elements_contained_in_path(path: Path, object: SpatialData | Spatial If an object does not have a Dask computational graph, it will return an empty list. It is possible for a single SpatialElement to contain multiple files in their Dask computational graph. 
""" + if isinstance(path, UPath): + return [] # no local backing files are "contained" in a remote path if not isinstance(path, Path): - raise TypeError(f"Expected a Path object, got {type(path)}") + raise TypeError(f"Expected a Path or UPath object, got {type(path)}") return [_is_subfolder(parent=path, child=Path(fp)) for fp in get_dask_backing_files(object)] @@ -420,14 +494,58 @@ def _is_subfolder(parent: Path, child: Path) -> bool: def _is_element_self_contained( - element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, element_path: Path + element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, + element_path: Path | UPath, ) -> bool: + if isinstance(element_path, UPath): + return True # treat remote-backed as self-contained for this check if isinstance(element, DaskDataFrame): pass # TODO when running test_save_transformations it seems that for the same element this is called multiple times return all(_backed_elements_contained_in_path(path=element_path, object=element)) +def _is_azure_http_response_error(exc: BaseException) -> bool: + """Return True if exc is an Azure SDK HttpResponseError (e.g. emulator API mismatch).""" + t = type(exc) + return t.__name__ == "HttpResponseError" and (getattr(t, "__module__", "") or "").startswith("azure.") + + +def _remote_zarr_store_exists(store: zarr.storage.StoreLike) -> bool: + """Return True if the store contains a zarr group. Closes the store. Handles Azure emulator errors.""" + try: + zarr.open_group(store, mode="r") + return True + except (GroupNotFoundError, OSError, FileNotFoundError): + return False + except Exception as e: + if _is_azure_http_response_error(e): + return False + raise + finally: + store.close() + + +def _ensure_async_fs(fs: Any) -> Any: + """Return an async fsspec filesystem for use with zarr's FsspecStore. + + Zarr's FsspecStore expects an async filesystem. 
If the given fs is synchronous, + it is converted using fsspec's public API (async instance or AsyncFileSystemWrapper) + so that ZarrUserWarning is not raised. + """ + if getattr(fs, "asynchronous", False): + return fs + import fsspec + + if getattr(fs, "async_impl", False): + fs_dict = json.loads(fs.to_json()) + fs_dict["asynchronous"] = True + return fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + return AsyncFileSystemWrapper(fs, asynchronous=True) + + def _resolve_zarr_store( path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, **kwargs: Any ) -> zarr.storage.StoreLike: @@ -477,17 +595,24 @@ def _resolve_zarr_store( if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group - return FsspecStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) + return FsspecStore( + path.store.path + "/" + path.path, + fs=_ensure_async_fs(path.store.fs), + **kwargs, + ) if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store return path.store.store raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") + if isinstance(path, _FsspecStoreRoot): + # path-like from read_zarr that carries the same fs (preserves Azure/GCS credentials) + return FsspecStore(_ensure_async_fs(path._store.fs), path=path._path, **kwargs) + if isinstance(path, UPath): + # if input is a remote UPath, map it to an FSStore (check before StoreLike to avoid UnionType isinstance) + return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore return FsspecStore(path, **kwargs) - if isinstance(path, UPath): - # if input is a remote UPath, map it to an FSStore - return 
FsspecStore(path.path, fs=path.fs, **kwargs) raise TypeError(f"Unsupported type: {type(path)}") From eee34d8371b13fe105ea8d05a439c46c0f1e3925 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:22:27 +0100 Subject: [PATCH 04/51] core: support UPath for SpatialData.path and write() - path and _path accept Path | UPath; setter allows UPath - write() accepts file_path: str | Path | UPath | None (None uses path) - _validate_can_safely_write_to_path handles UPath and remote store existence - _write_element accepts Path | UPath; skip local subfolder checks for UPath - __repr__ and _get_groups_for_element use path without forcing Path() --- src/spatialdata/_core/spatialdata.py | 71 +++++++++++++++++++--------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 739b225fe..810713d45 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -121,7 +121,7 @@ def __init__( tables: dict[str, AnnData] | Tables | None = None, attrs: Mapping[Any, Any] | None = None, ) -> None: - self._path: Path | None = None + self._path: Path | UPath | None = None self._shared_keys: set[str | None] = set() self._images: Images = Images(shared_keys=self._shared_keys) @@ -548,16 +548,16 @@ def is_backed(self) -> bool: return self.path is not None @property - def path(self) -> Path | None: + def path(self) -> Path | UPath | None: """Path to the Zarr storage.""" return self._path @path.setter - def path(self, value: Path | None) -> None: - if value is None or isinstance(value, str | Path): + def path(self, value: Path | UPath | None) -> None: + if value is None or isinstance(value, (str, Path, UPath)): self._path = value else: - raise TypeError("Path must be `None`, a `str` or a `Path` object.") + raise TypeError("Path must be `None`, a `str`, a `Path` or a `UPath` object.") def locate_element(self, element: SpatialElement) -> list[str]: """ @@ 
-1032,18 +1032,34 @@ def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: def _validate_can_safely_write_to_path( self, - file_path: str | Path, + file_path: str | Path | UPath, overwrite: bool = False, saving_an_element: bool = False, ) -> None: - from spatialdata._io._utils import _backed_elements_contained_in_path, _is_subfolder, _resolve_zarr_store + from spatialdata._io._utils import ( + _backed_elements_contained_in_path, + _is_subfolder, + _remote_zarr_store_exists, + _resolve_zarr_store, + ) if isinstance(file_path, str): file_path = Path(file_path) - if not isinstance(file_path, Path): - raise ValueError(f"file_path must be a string or a Path object, type(file_path) = {type(file_path)}.") + if not isinstance(file_path, (Path, UPath)): + raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") + + if isinstance(file_path, UPath): + store = _resolve_zarr_store(file_path) + if _remote_zarr_store_exists(store) and not overwrite: + raise ValueError( + "The Zarr store already exists. Use `overwrite=True` to try overwriting the store. " + "Please note that only Zarr stores not currently in use by the current SpatialData object can be " + "overwritten." + ) + return + # Local Path: existing logic # TODO: add test for this if os.path.exists(file_path): store = _resolve_zarr_store(file_path) @@ -1072,8 +1088,13 @@ def _validate_can_safely_write_to_path( ERROR_MSG + "\nDetails: the target path contains one or more files that Dask use for " "backing elements in the SpatialData object." 
+ WORKAROUND ) - if self.path is not None and ( - _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + # Subfolder checks only for local paths (Path); skip when self.path is UPath + if ( + self.path is not None + and isinstance(self.path, Path) + and ( + _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + ) ): if saving_an_element and _is_subfolder(parent=self.path, child=file_path): raise ValueError( @@ -1102,7 +1123,7 @@ def _validate_all_elements(self) -> None: @_deprecation_alias(format="sdata_formats", version="0.7.0") def write( self, - file_path: str | Path, + file_path: str | Path | UPath | None = None, overwrite: bool = False, consolidate_metadata: bool = True, update_sdata_path: bool = True, @@ -1115,7 +1136,7 @@ def write( Parameters ---------- file_path - The path to the Zarr store to write to. + The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). overwrite If `True`, overwrite the Zarr store if it already exists. If `False`, `write()` will fail if the Zarr store already exists. 
@@ -1161,8 +1182,13 @@ def write( parsed = _parse_formats(sdata_formats) + if file_path is None: + if self.path is None: + raise ValueError("file_path must be provided when SpatialData.path is not set.") + file_path = self.path if isinstance(file_path, str): file_path = Path(file_path) + # Keep UPath as-is; do not convert to Path self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() @@ -1192,7 +1218,7 @@ def write( def _write_element( self, element: SpatialElement | AnnData, - zarr_container_path: Path, + zarr_container_path: Path | UPath, element_type: str, element_name: str, overwrite: bool, @@ -1201,10 +1227,8 @@ def _write_element( ) -> None: from spatialdata._io.io_zarr import _get_groups_for_element - if not isinstance(zarr_container_path, Path): - raise ValueError( - f"zarr_container_path must be a Path object, type(zarr_container_path) = {type(zarr_container_path)}." - ) + if not isinstance(zarr_container_path, (Path, UPath)): + raise ValueError(f"zarr_container_path must be a Path or UPath, got {type(zarr_container_path).__name__}.") file_path_of_element = zarr_container_path / element_type / element_name self._validate_can_safely_write_to_path( file_path=file_path_of_element, overwrite=overwrite, saving_an_element=True @@ -1489,7 +1513,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # check if the element exists in the Zarr storage if not _group_for_element_exists( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, ): @@ -1503,7 +1527,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # warn the users if the element is not self-contained, that is, it is Dask-backed by files outside the Zarr # group for the element - element_zarr_path = Path(self.path) / element_type / element_name + element_zarr_path = self.path / element_type / element_name if not 
_is_element_self_contained(element=element, element_path=element_zarr_path): logger.info( f"Element {element_type}/{element_name} is not self-contained. The metadata will be" @@ -1544,7 +1568,7 @@ def write_channel_names(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have the check in the conditional if element_type == "images" and self.path is not None: _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), element_type=element_type, element_name=element_name, use_consolidated=False + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False ) from spatialdata._io._utils import overwrite_channel_names @@ -1588,7 +1612,7 @@ def write_transformations(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have a conditional assert self.path is not None _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False, @@ -1956,7 +1980,8 @@ def h(s: str) -> str: descr = "SpatialData object" if self.path is not None: - descr += f", with associated Zarr store: {self.path.resolve()}" + path_descr = str(self.path) if isinstance(self.path, UPath) else self.path.resolve() + descr += f", with associated Zarr store: {path_descr}" non_empty_elements = self._non_empty_elements() last_element_index = len(non_empty_elements) - 1 From 40af32757b2e2d10d8f0fdd408bfd5b6b8933304 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:28:22 +0100 Subject: [PATCH 05/51] io: use resolved store and remote parquet in points, raster, shapes, table, zarr - Resolve store via _resolve_zarr_store in read paths (points, shapes, raster, table) - Use _get_store_root for parquet paths; read/write parquet with storage_options for fsspec - io_shapes: upload parquet to Azure/S3/GCS via temp file when path is 
_FsspecStoreRoot - io_zarr: _get_store_root, UPath in _get_groups_for_element and _write_consolidated_metadata; set sdata.path to UPath when store is remote --- src/spatialdata/_io/io_points.py | 21 +++++-- src/spatialdata/_io/io_raster.py | 5 +- src/spatialdata/_io/io_shapes.py | 94 ++++++++++++++++++++++++++++++-- src/spatialdata/_io/io_table.py | 6 +- src/spatialdata/_io/io_zarr.py | 27 +++++---- 5 files changed, 127 insertions(+), 26 deletions(-) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index b47fc418c..e41273dcb 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -8,7 +8,11 @@ from ome_zarr.format import Format from spatialdata._io._utils import ( + _FsspecStoreRoot, + _get_store_root, _get_transformations_from_ngff_dict, + _resolve_zarr_store, + _storage_options_from_fs, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -24,17 +28,21 @@ def _read_points( store: str | Path, ) -> DaskDataFrame: """Read points from a zarr store.""" - f = zarr.open(store, mode="r") + resolved_store = _resolve_zarr_store(store) + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None points_format = PointsFormats[version] - store_root = f.store_path.store.root + store_root = _get_store_root(f.store_path.store) path = store_root / f.path / "points.parquet" # cache on remote file needed for parquet reader to work # TODO: allow reading in the metadata without caching all the data - points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) + if isinstance(path, _FsspecStoreRoot): + points = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + else: + points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) assert isinstance(points, DaskDataFrame) transformations = 
_get_transformations_from_ngff_dict(f.attrs.asdict()["coordinateTransformations"]) @@ -68,7 +76,7 @@ def write_points( axes = get_axes_names(points) transformations = _get_transformations(points) - store_root = group.store_path.store.root + store_root = _get_store_root(group.store_path.store) path = store_root / group.path / "points.parquet" # The following code iterates through all columns in the 'points' DataFrame. If the column's datatype is @@ -84,7 +92,10 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - points_without_transform.to_parquet(path) + storage_options: dict = {} + if isinstance(path, _FsspecStoreRoot): + storage_options = _storage_options_from_fs(path._store.fs) + points_without_transform.to_parquet(str(path), storage_options=storage_options or None) attrs = element_format.attrs_to_dict(points.attrs) attrs["version"] = element_format.spatialdata_format_version diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index df7e1cb8f..767232fdd 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -19,6 +19,7 @@ from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, overwrite_coordinate_transformations_raster, ) from spatialdata._io.format import ( @@ -41,11 +42,11 @@ def _read_multiscale( store: str | Path, raster_type: Literal["image", "labels"], reader_format: Format ) -> DataArray | DataTree: - assert isinstance(store, str | Path) assert raster_type in ["image", "labels"] + resolved_store = _resolve_zarr_store(store) nodes: list[Node] = [] - image_loc = ZarrLocation(store, fmt=reader_format) + image_loc = ZarrLocation(resolved_store, fmt=reader_format) if exists := image_loc.exists(): image_reader = Reader(image_loc)() image_nodes = list(image_reader) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index b07256273..adf4716f3 100644 --- 
a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -1,5 +1,8 @@ from __future__ import annotations +import contextlib +import os +import tempfile from pathlib import Path from typing import Any, Literal @@ -11,7 +14,11 @@ from shapely import from_ragged_array, to_ragged_array from spatialdata._io._utils import ( + _FsspecStoreRoot, + _get_store_root, _get_transformations_from_ngff_dict, + _resolve_zarr_store, + _storage_options_from_fs, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -34,7 +41,8 @@ def _read_shapes( store: str | Path, ) -> GeoDataFrame: """Read shapes from a zarr store.""" - f = zarr.open(store, mode="r") + resolved_store = _resolve_zarr_store(store) + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None shape_format = ShapesFormats[version] @@ -54,9 +62,12 @@ def _read_shapes( geometry = from_ragged_array(typ, coords, offsets) geo_df = GeoDataFrame({"geometry": geometry}, index=index) elif isinstance(shape_format, ShapesFormatV02 | ShapesFormatV03): - store_root = f.store_path.store.root - path = Path(store_root) / f.path / "shapes.parquet" - geo_df = read_parquet(path) + store_root = _get_store_root(f.store_path.store) + path = store_root / f.path / "shapes.parquet" + if isinstance(path, _FsspecStoreRoot): + geo_df = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + else: + geo_df = read_parquet(path) else: raise ValueError( f"Unsupported shapes format {shape_format} from version {version}. Please update the spatialdata library." 
@@ -150,6 +161,67 @@ def _write_shapes_v01(shapes: GeoDataFrame, group: zarr.Group, element_format: F return attrs +def _parse_fsspec_remote_path(path: _FsspecStoreRoot) -> tuple[str, str]: + """Return (bucket_or_container, blob_key) from an fsspec store path.""" + remote = str(path) + if "://" in remote: + remote = remote.split("://", 1)[1] + parts = remote.split("/", 1) + bucket_or_container = parts[0] + blob_key = parts[1] if len(parts) > 1 else "" + return bucket_or_container, blob_key + + +def _upload_parquet_to_azure(tmp_path: str, bucket: str, key: str, fs: Any) -> None: + from azure.storage.blob import BlobServiceClient + + client = BlobServiceClient.from_connection_string(fs.connection_string) + blob_client = client.get_blob_client(container=bucket, blob=key) + with open(tmp_path, "rb") as f: + blob_client.upload_blob(f, overwrite=True) + + +def _upload_parquet_to_s3(tmp_path: str, bucket: str, key: str, fs: Any) -> None: + import boto3 + + endpoint = getattr(fs, "endpoint_url", None) or os.environ.get("AWS_ENDPOINT_URL") + s3 = boto3.client( + "s3", + endpoint_url=endpoint, + aws_access_key_id=getattr(fs, "key", None) or os.environ.get("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=getattr(fs, "secret", None) or os.environ.get("AWS_SECRET_ACCESS_KEY"), + region_name=os.environ.get("AWS_DEFAULT_REGION", "us-east-1"), + ) + s3.upload_file(tmp_path, bucket, key) + + +def _upload_parquet_to_gcs(tmp_path: str, bucket: str, key: str, fs: Any) -> None: + from google.auth.credentials import AnonymousCredentials + from google.cloud import storage + + client = storage.Client( + credentials=AnonymousCredentials(), + project=getattr(fs, "project", None) or "test", + ) + blob = client.bucket(bucket).blob(key) + blob.upload_from_filename(tmp_path) + + +def _upload_parquet_to_fsspec(path: _FsspecStoreRoot, tmp_path: str) -> None: + """Upload local parquet file to remote fsspec store using sync APIs to avoid event-loop issues.""" + fs = path._store.fs + bucket, key = 
_parse_fsspec_remote_path(path) + fs_name = type(fs).__name__ + if fs_name == "AzureBlobFileSystem" and getattr(fs, "connection_string", None): + _upload_parquet_to_azure(tmp_path, bucket, key, fs) + elif fs_name in ("S3FileSystem", "MotoS3FS"): + _upload_parquet_to_s3(tmp_path, bucket, key, fs) + elif fs_name == "GCSFileSystem": + _upload_parquet_to_gcs(tmp_path, bucket, key, fs) + else: + fs.put(tmp_path, str(path)) + + def _write_shapes_v02_v03( shapes: GeoDataFrame, group: zarr.Group, element_format: Format, geometry_encoding: Literal["WKB", "geoarrow"] ) -> Any: @@ -169,13 +241,23 @@ def _write_shapes_v02_v03( """ from spatialdata.models._utils import TRANSFORM_KEY - store_root = group.store_path.store.root + store_root = _get_store_root(group.store_path.store) path = store_root / group.path / "shapes.parquet" # Temporarily remove transformations from attrs to avoid serialization issues transforms = shapes.attrs[TRANSFORM_KEY] del shapes.attrs[TRANSFORM_KEY] - shapes.to_parquet(path, geometry_encoding=geometry_encoding) + if isinstance(path, _FsspecStoreRoot): + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: + tmp_path = tmp.name + try: + shapes.to_parquet(tmp_path, geometry_encoding=geometry_encoding) + _upload_parquet_to_fsspec(path, tmp_path) + finally: + with contextlib.suppress(OSError): + os.unlink(tmp_path) + else: + shapes.to_parquet(path, geometry_encoding=geometry_encoding) shapes.attrs[TRANSFORM_KEY] = transforms attrs = element_format.attrs_to_dict(shapes.attrs) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 8cd7b8385..03ec78526 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -9,6 +9,7 @@ from anndata._io.specs import write_elem as write_adata from ome_zarr.format import Format +from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import ( CurrentTablesFormat, TablesFormats, @@ -20,9 +21,10 @@ def _read_table(store: 
str | Path) -> AnnData: - table = read_anndata_zarr(str(store)) + resolved_store = _resolve_zarr_store(store) + table = read_anndata_zarr(resolved_store) - f = zarr.open(store, mode="r") + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=False) assert version is not None table_format = TablesFormats[version] diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 4c410fab0..48795513c 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import warnings from collections.abc import Callable from json import JSONDecodeError @@ -19,6 +18,8 @@ from spatialdata._core.spatialdata import SpatialData from spatialdata._io._utils import ( BadFileHandleMethod, + _FsspecStoreRoot, + _get_store_root, _resolve_zarr_store, handle_read_errors, ) @@ -32,7 +33,7 @@ def _read_zarr_group_spatialdata_element( root_group: zarr.Group, - root_store_path: str, + root_store_path: Path | _FsspecStoreRoot, sdata_version: Literal["0.1", "0.2"], selector: set[str], read_func: Callable[..., Any], @@ -54,7 +55,7 @@ def _read_zarr_group_spatialdata_element( # skip hidden files like .zgroup or .zmetadata continue elem_group = group[subgroup_name] - elem_group_path = os.path.join(root_store_path, elem_group.path) + elem_group_path = root_store_path / elem_group.path with handle_read_errors( on_bad_files, location=f"{group.path}/{subgroup_name}", @@ -170,7 +171,7 @@ def read_zarr( UserWarning, stacklevel=2, ) - root_store_path = root_group.store.root + root_store_path = _get_store_root(root_group.store) images: dict[str, Raster_T] = {} labels: dict[str, Raster_T] = {} @@ -231,12 +232,12 @@ def read_zarr( tables=tables, attrs=attrs, ) - sdata.path = resolved_store.root + sdata.path = store if isinstance(store, UPath) else resolved_store.root return sdata def _get_groups_for_element( - zarr_path: Path, element_type: str, element_name: str, 
use_consolidated: bool = True + zarr_path: Path | UPath, element_type: str, element_name: str, use_consolidated: bool = True ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. @@ -265,8 +266,8 @@ def _get_groups_for_element( ------- The Zarr groups for the root, element_type and element for a specific element. """ - if not isinstance(zarr_path, Path): - raise ValueError("zarr_path should be a Path object") + if not isinstance(zarr_path, (Path, UPath)): + raise ValueError("zarr_path should be a Path or UPath object") if element_type not in [ "images", @@ -289,7 +290,7 @@ def _get_groups_for_element( return root_group, element_type_group, element_name_group -def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: str) -> bool: +def _group_for_element_exists(zarr_path: Path | UPath, element_type: str, element_name: str) -> bool: """ Check if the group for an element exists. @@ -319,9 +320,13 @@ def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: return exists -def _write_consolidated_metadata(path: Path | str | None) -> None: +def _write_consolidated_metadata(path: Path | UPath | str | None) -> None: if path is not None: - f = zarr.open_group(path, mode="r+", use_consolidated=False) + if isinstance(path, UPath): + store = _resolve_zarr_store(path) + f = zarr.open_group(store, mode="r+", use_consolidated=False) + else: + f = zarr.open_group(path, mode="r+", use_consolidated=False) # .parquet files are not recognized as proper zarr and thus throw a warning. This does not affect SpatialData. # and therefore we silence it for our users as they can't do anything about this. 
# TODO check with remote PR whether we can prevent this warning at least for points data and whether with zarrv3 From 540631c1f64c8f1925aa8aa19d92a60930816920 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:28:28 +0100 Subject: [PATCH 06/51] ci: add test deps and Dockerfile for storage emulators (S3, Azure, GCS) - pyproject.toml: adlfs, gcsfs, moto[server], pytest-timeout in test extras - Dockerfile.emulators: moto, Azurite, fake-gcs-server for tests/io/remote_storage/ --- Dockerfile.emulators | 28 ++++++++++++++++++++++++++++ pyproject.toml | 4 ++++ 2 files changed, 32 insertions(+) create mode 100644 Dockerfile.emulators diff --git a/Dockerfile.emulators b/Dockerfile.emulators new file mode 100644 index 000000000..b4846a595 --- /dev/null +++ b/Dockerfile.emulators @@ -0,0 +1,28 @@ +# Storage emulators for tests/io/remote_storage/ (S3, Azure, GCS). +# Emulator URLs: S3 127.0.0.1:5000 | Azure 127.0.0.1:10000 | GCS 127.0.0.1:4443 +# +# Build (from project root): +# docker build -f Dockerfile.emulators -t spatialdata-emulators . 
+# +# Run in background (detached): +# docker run --rm -d --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators +# +# Run in foreground (attach to terminal): +# docker run --rm --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators +# +# Stop / remove: +# docker stop spatialdata-emulators +# docker rm -f spatialdata-emulators # if already stopped or to force-remove +FROM node:20-slim +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* +RUN python3 -m venv /opt/venv && /opt/venv/bin/pip install --no-cache-dir 'moto[server]' +ENV PATH="/opt/venv/bin:$PATH" +RUN cd /tmp && curl -sSL -o fgs.tgz https://github.com/fsouza/fake-gcs-server/releases/download/v1.54.0/fake-gcs-server_1.54.0_linux_amd64.tar.gz \ + && tar xzf fgs.tgz && mv fake-gcs-server /usr/local/bin/ 2>/dev/null || mv fake-gcs-server_*/fake-gcs-server /usr/local/bin/ \ + && chmod +x /usr/local/bin/fake-gcs-server && rm -f fgs.tgz +RUN mkdir -p /data +EXPOSE 5000 10000 4443 +RUN echo 'moto_server -H 0.0.0.0 -p 5000 & npx --yes azurite --silent --location /data --blobHost 0.0.0.0 --skipApiVersionCheck & fake-gcs-server -scheme http -port 4443 & wait' > /start.sh && chmod +x /start.sh +CMD ["/bin/sh", "/start.sh"] diff --git a/pyproject.toml b/pyproject.toml index e5f3134aa..6ab9b42e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,9 +66,13 @@ dev = [ "bump2version", ] test = [ + "adlfs", + "gcsfs", + "moto[server]", "pytest", "pytest-cov", "pytest-mock", + "pytest-timeout", "torch", ] docs = [ From 532af5a07017e341e3bfd7249fbaf722d83ef240 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 15:28:38 +0100 Subject: [PATCH 07/51] test: move remote storage tests under tests/io/remote_storage and add emulator config - full_sdata fixture: two regions for table categorical (avoids 404 on remote read) - 
tests/io/remote_storage/conftest.py: bucket/container creation, resilient async shutdown - tests/io/remote_storage/test_remote_storage.py: parametrized Azure/S3/GCS roundtrip and write tests --- tests/conftest.py | 8 +- tests/io/remote_storage/conftest.py | 193 ++++++++++++++++++ .../test_remote_storage.py | 83 ++++---- 3 files changed, 244 insertions(+), 40 deletions(-) create mode 100644 tests/io/remote_storage/conftest.py rename tests/io/{ => remote_storage}/test_remote_storage.py (72%) diff --git a/tests/conftest.py b/tests/conftest.py index c97939129..a6deba0ae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,12 +89,18 @@ def tables() -> list[AnnData]: @pytest.fixture() def full_sdata() -> SpatialData: + # Use two regions so the table categorical has two categories; otherwise anndata does not + # write the obs/region/codes/c/0 chunk (only codes/zarr.json), causing 404 on remote read. return SpatialData( images=_get_images(), labels=_get_labels(), shapes=_get_shapes(), points=_get_points(), - tables=_get_tables(region="labels2d", region_key="region", instance_key="instance_id"), + tables=_get_tables( + region=["labels2d", "poly"], + region_key="region", + instance_key="instance_id", + ), ) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py new file mode 100644 index 000000000..c650ab53c --- /dev/null +++ b/tests/io/remote_storage/conftest.py @@ -0,0 +1,193 @@ +"""Minimal pytest config for IO tests. Creates buckets/containers when remote emulators are running. + +Assumes emulators are already running (e.g. Docker: + docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). +Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. +""" + +from __future__ import annotations + +import os +import socket +import time + +import pytest + +# Error messages from asyncio when closing sessions after the event loop is gone (e.g. 
at process exit) +_LOOP_GONE_ERRORS = ("different loop", "Loop is not running") + + +def _patch_fsspec_sync_for_shutdown() -> None: + """If fsspec.asyn.sync() runs at exit when the loop is gone, return None instead of raising.""" + import fsspec.asyn as asyn_mod + + _orig = asyn_mod.sync + + def _wrapped(loop, func, *args, timeout=None, **kwargs): + try: + return _orig(loop, func, *args, timeout=timeout, **kwargs) + except RuntimeError as e: + if any(msg in str(e) for msg in _LOOP_GONE_ERRORS): + return None + raise + + asyn_mod.sync = _wrapped + + +def _patch_gcsfs_close_session_for_shutdown() -> None: + """If gcsfs close_session fails (loop gone), close the connector synchronously instead of raising.""" + import asyncio + + import fsspec + import fsspec.asyn as asyn_mod + import gcsfs.core + + @staticmethod + def _close_session(loop, session, asynchronous=False): + if session.closed: + return + try: + running = asyncio.get_running_loop() + except RuntimeError: + running = None + + use_force_close = False + if loop and loop.is_running(): + loop.create_task(session.close()) + elif running and running.is_running() and asynchronous: + running.create_task(session.close()) + elif asyn_mod.loop[0] is not None and asyn_mod.loop[0].is_running(): + try: + asyn_mod.sync(asyn_mod.loop[0], session.close, timeout=0.1) + except (RuntimeError, fsspec.FSTimeoutError): + use_force_close = True + else: + use_force_close = True + + if use_force_close: + connector = getattr(session, "_connector", None) + if connector is not None: + connector._close() + + gcsfs.core.GCSFileSystem.close_session = _close_session + + +def _apply_resilient_async_close_patches() -> None: + """Avoid RuntimeError tracebacks when aiohttp sessions are closed at process exit (loop already gone).""" + _patch_fsspec_sync_for_shutdown() + _patch_gcsfs_close_session_for_shutdown() + + +def pytest_configure(config: pytest.Config) -> None: + """Apply patches for remote storage tests (resilient async close at 
shutdown).""" + _apply_resilient_async_close_patches() + + +EMULATOR_PORTS = {"s3": 5000, "azure": 10000, "gcs": 4443} +S3_BUCKETS = ("bucket", "test-azure", "test-s3", "test-gcs") +AZURE_CONTAINERS = ("test-container", "test-azure", "test-s3", "test-gcs") +GCS_BUCKETS = ("bucket", "test-azure", "test-s3", "test-gcs") + +AZURITE_CONNECTION_STRING = ( + "DefaultEndpointsProtocol=http;" + "AccountName=devstoreaccount1;" + "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" +) + + +def _port_open(host: str = "127.0.0.1", port: int | None = None, timeout: float = 2.0) -> bool: + if port is None: + return False + try: + with socket.create_connection((host, port), timeout=timeout): + return True + except (OSError, TimeoutError): + return False + + +def _ensure_s3_buckets(host: str) -> None: + if not _port_open(host, EMULATOR_PORTS["s3"]): + return + os.environ.setdefault("AWS_ENDPOINT_URL", "http://127.0.0.1:5000") + os.environ.setdefault("AWS_ACCESS_KEY_ID", "testing") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "testing") + import boto3 + from botocore.config import Config + + client = boto3.client( + "s3", + endpoint_url=os.environ["AWS_ENDPOINT_URL"], + aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"], + aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"], + region_name="us-east-1", + config=Config(signature_version="s3v4"), + ) + existing = {b["Name"] for b in client.list_buckets().get("Buckets", [])} + for name in S3_BUCKETS: + if name not in existing: + client.create_bucket(Bucket=name) + + +def _ensure_azure_containers(host: str) -> None: + if not _port_open(host, EMULATOR_PORTS["azure"]): + return + from azure.storage.blob import BlobServiceClient + + client = BlobServiceClient.from_connection_string(AZURITE_CONNECTION_STRING) + existing = {c.name for c in client.list_containers()} + for name in AZURE_CONTAINERS: + if name not in existing: + 
client.create_container(name) + + +def _ensure_gcs_buckets(host: str) -> None: + if not _port_open(host, EMULATOR_PORTS["gcs"]): + return + os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://127.0.0.1:4443") + from google.auth.credentials import AnonymousCredentials + from google.cloud import storage + + client = storage.Client(credentials=AnonymousCredentials(), project="test") + existing = {b.name for b in client.list_buckets()} + for name in GCS_BUCKETS: + if name not in existing: + client.create_bucket(name) + + +def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 60.0, check_interval: float = 2.0) -> None: + """Wait until all three emulator ports accept connections (e.g. after docker run).""" + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if all(_port_open(host, EMULATOR_PORTS[p]) for p in ("s3", "azure", "gcs")): + return + time.sleep(check_interval) + raise RuntimeError( + f"Emulators did not become ready within {timeout}s. " + "Ensure the container is running: docker run --rm -d -p 5000:5000 " + "-p 10000:10000 -p 4443:4443 spatialdata-emulators" + ) + + +@pytest.fixture(scope="session") +def _remote_storage_buckets_containers(): + """Create buckets/containers on running emulators so remote storage tests can run. 
+ + Run with emulators up, e.g.: + docker run --rm -d -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators + Then: pytest tests/io/test_remote_storage.py -v + """ + host = "127.0.0.1" + _wait_for_emulator_ports(host) + _ensure_s3_buckets(host) + _ensure_azure_containers(host) + _ensure_gcs_buckets(host) + yield + + +def pytest_collection_modifyitems(config: pytest.Config, items: list) -> None: + """Inject bucket/container creation for test_remote_storage.py.""" + for item in items: + path = getattr(item, "path", None) or getattr(item, "fspath", None) + if path and "test_remote_storage" in str(path): + item.add_marker(pytest.mark.usefixtures("_remote_storage_buckets_containers")) diff --git a/tests/io/test_remote_storage.py b/tests/io/remote_storage/test_remote_storage.py similarity index 72% rename from tests/io/test_remote_storage.py rename to tests/io/remote_storage/test_remote_storage.py index c24f1bcd1..44685061a 100644 --- a/tests/io/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -1,13 +1,25 @@ +"""Integration tests for remote storage (Azure, S3, GCS) using real emulators. + +Emulators must be running (e.g. Docker: docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). +Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. +tests/io/conftest.py creates the required buckets/containers when emulators are up. + +All remote paths use uuid.uuid4().hex so each test run writes to a unique location. +""" + from __future__ import annotations +import os +import uuid + import pytest from upath import UPath from spatialdata import SpatialData from spatialdata.testing import assert_spatial_data_objects_are_identical -# Azure emulator connection string (Azurite default) -# Source: https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string +# Azure emulator connection string (Azurite default). 
+# https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string AZURE_CONNECTION_STRING = ( "DefaultEndpointsProtocol=http;" "AccountName=devstoreaccount1;" @@ -22,20 +34,29 @@ def _get_azure_upath(container: str = "test-container", path: str = "test.zarr") def _get_s3_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: - """Create S3 UPath for testing. - - Uses anon=True for public buckets. For private buckets with moto (local S3 emulator), - would use: endpoint_url="http://127.0.0.1:5555/", AWS_ACCESS_KEY_ID="testing", etc. - """ + """Create S3 UPath for testing (moto emulator at 5000).""" + endpoint = os.environ.get("AWS_ENDPOINT_URL", "http://127.0.0.1:5000") + if endpoint: + return UPath( + f"s3://{container}/{path}", + endpoint_url=endpoint, + key=os.environ.get("AWS_ACCESS_KEY_ID", "testing"), + secret=os.environ.get("AWS_SECRET_ACCESS_KEY", "testing"), + ) return UPath(f"s3://{container}/{path}", anon=True) def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: - """Create GCS UPath for testing with fake-gcs-server (local GCS emulator).""" - return UPath(f"gs://{container}/{path}", endpoint_url="http://localhost:4443") + """Create GCS UPath for testing with fake-gcs-server (port 4443).""" + os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://127.0.0.1:4443") + return UPath( + f"gs://{container}/{path}", + endpoint_url=os.environ["STORAGE_EMULATOR_HOST"], + token="anon", + project="test", + ) -# Shared parametrization for remote storage backends (azure, s3, gcs). 
GET_UPATH_PARAMS = pytest.mark.parametrize( "get_upath", [_get_azure_upath, _get_s3_upath, _get_gcs_upath], ids=["azure", "s3", "gcs"] ) @@ -45,6 +66,9 @@ def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: ids=["azure", "s3", "gcs"], ) +# Ensure buckets/containers exist on emulators before any test (see tests/io/conftest.py) +pytestmark = pytest.mark.usefixtures("_remote_storage_buckets_containers") + def _assert_read_identical(expected: SpatialData, upath: UPath, *, check_path: bool = True) -> None: """Read SpatialData from upath and assert it equals expected; optionally assert path.""" @@ -66,7 +90,7 @@ def test_path_setter_accepts_upath(self, get_upath) -> None: None | str | Path, not UPath, preventing the use of remote storage. """ sdata = SpatialData() - upath = get_upath() + upath = get_upath(path=f"test-accept-{uuid.uuid4().hex}.zarr") sdata.path = upath assert sdata.path == upath @@ -78,17 +102,15 @@ def test_write_with_upath_sets_path(self, get_upath) -> None: _validate_can_safely_write_to_path() before it can set sdata.path. """ sdata = SpatialData() - upath = get_upath() + upath = get_upath(path=f"test-write-path-{uuid.uuid4().hex}.zarr") sdata.write(upath) assert isinstance(sdata.path, UPath) def test_path_setter_rejects_other_types(self) -> None: """Test that SpatialData.path setter rejects other types.""" sdata = SpatialData() - with pytest.raises(TypeError, match="Path must be.*str.*Path"): sdata.path = 123 - with pytest.raises(TypeError, match="Path must be.*str.*Path"): sdata.path = {"not": "a path"} @@ -101,9 +123,7 @@ class TestRemoteStorage: """ @REMOTE_STORAGE_PARAMS - def test_write_read_roundtrip_remote( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: + def test_write_read_roundtrip_remote(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: """Test writing and reading SpatialData to/from remote storage. 
This test verifies the full workflow: @@ -111,12 +131,10 @@ def test_write_read_roundtrip_remote( 2. Read SpatialData from remote storage using UPath 3. Verify data integrity (round-trip) """ - upath = get_upath(container=f"test-{storage_name}", path=f"roundtrip-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"roundtrip-{uuid.uuid4().hex}.zarr") full_sdata.write(upath, overwrite=True) assert isinstance(full_sdata.path, UPath) assert full_sdata.path == upath - _assert_read_identical(full_sdata, upath) @REMOTE_STORAGE_PARAMS @@ -130,37 +148,29 @@ def test_path_setter_with_remote_then_operations( 2. Write operations work 3. Read operations work """ - upath = get_upath(container=f"test-{storage_name}", path=f"operations-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"operations-{uuid.uuid4().hex}.zarr") full_sdata.path = upath assert full_sdata.path == upath assert full_sdata.is_backed() is True - full_sdata.write(overwrite=True) assert full_sdata.path == upath - _assert_read_identical(full_sdata, upath) @REMOTE_STORAGE_PARAMS - def test_overwrite_existing_remote_data( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: + def test_overwrite_existing_remote_data(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: """Test overwriting existing data in remote storage. Verifies that overwriting existing remote data works (path-exists handling) and data integrity after overwrite. Round-trip is covered by test_write_read_roundtrip_remote. 
""" - upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{uuid.uuid4().hex}.zarr") full_sdata.write(upath, overwrite=True) full_sdata.write(upath, overwrite=True) _assert_read_identical(full_sdata, upath, check_path=False) @REMOTE_STORAGE_PARAMS - def test_write_element_to_remote_storage( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: + def test_write_element_to_remote_storage(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: """Test writing individual elements to remote storage using write_element(). This test verifies that: @@ -168,18 +178,13 @@ def test_write_element_to_remote_storage( 2. write_element() works with remote storage 3. Written elements can be read back correctly """ - upath = get_upath(container=f"test-{storage_name}", path=f"write-element-{id(full_sdata)}.zarr") - + upath = get_upath(container=f"test-{storage_name}", path=f"write-element-{uuid.uuid4().hex}.zarr") # Create empty SpatialData and write to remote storage empty_sdata = SpatialData() empty_sdata.write(upath, overwrite=True) - - # Set path and write individual elements full_sdata.path = upath assert full_sdata.path == upath - # Write each element type individually - for element_type, element_name, _ in full_sdata.gen_elements(): + for _element_type, element_name, _ in full_sdata.gen_elements(): full_sdata.write_element(element_name, overwrite=True) - _assert_read_identical(full_sdata, upath, check_path=False) From c22b8bf1004342ea0c4719e7c50a0af4b2b5bc56 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 16:20:29 +0100 Subject: [PATCH 08/51] fix: update Dask internal keys for zarr compatibility - Added "dimension_separator" to the frozenset of internal keys that should not be passed to zarr.Group.create_array(), ensuring compatibility with various zarr versions. 
- Updated test to set region labels for full_sdata table, allowing the test_set_table_annotates_spatialelement to succeed without errors. --- src/spatialdata/_io/_dask_zarr_compat.py | 3 ++- tests/io/test_multi_table.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py index 350207056..a1b643451 100644 --- a/src/spatialdata/_io/_dask_zarr_compat.py +++ b/src/spatialdata/_io/_dask_zarr_compat.py @@ -13,7 +13,8 @@ _orig_to_zarr = _da.to_zarr # Keys from ome_zarr/dask **kwargs that must not be passed to zarr.Group.create_array() -_DASK_INTERNAL_KEYS = frozenset({"zarr_format"}) +# dimension_separator: not accepted by all zarr versions in the create_array() path. +_DASK_INTERNAL_KEYS = frozenset({"zarr_format", "dimension_separator"}) def _to_zarr( diff --git a/tests/io/test_multi_table.py b/tests/io/test_multi_table.py index abaaea8d2..77b17a177 100644 --- a/tests/io/test_multi_table.py +++ b/tests/io/test_multi_table.py @@ -113,6 +113,10 @@ def test_set_table_nonexisting_target(self, full_sdata): def test_set_table_annotates_spatialelement(self, full_sdata, tmp_path): tmpdir = Path(tmp_path) / "tmp.zarr" del full_sdata["table"].uns[TableModel.ATTRS_KEY] + # full_sdata table has region labels2d+poly; set to labels2d only so set_table_annotates_spatialelement succeeds + full_sdata["table"].obs["region"] = pd.Categorical( + ["labels2d"] * full_sdata["table"].n_obs + ) with pytest.raises( TypeError, match="No current annotation metadata found. Please specify both region_key and instance_key." ): From 0c0716938566d9fceffc7a0d8d7b50cca59f194e Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 16:32:43 +0100 Subject: [PATCH 09/51] test: refine subset and table validation in spatial data tests - Updated the `test_subset` function to exclude labels and poly from the default table, ensuring accurate subset validation. 
- Enhanced `test_validate_table_in_spatialdata` to assert that both regions (labels2d and poly) are correctly annotated in the table. - Adjusted `test_labels_table_joins` to restrict the table to labels2d, ensuring the join returns the expected results. --- tests/core/operations/test_spatialdata_operations.py | 9 ++++++--- tests/core/query/test_relational_query.py | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/core/operations/test_spatialdata_operations.py b/tests/core/operations/test_spatialdata_operations.py index 68b538e0a..a898bed0c 100644 --- a/tests/core/operations/test_spatialdata_operations.py +++ b/tests/core/operations/test_spatialdata_operations.py @@ -559,14 +559,15 @@ def test_init_from_elements(full_sdata: SpatialData) -> None: def test_subset(full_sdata: SpatialData) -> None: - element_names = ["image2d", "points_0", "circles", "poly"] + # Exclude labels and poly so the default table (annotating labels2d and poly) is not included + element_names = ["image2d", "points_0", "circles"] subset0 = full_sdata.subset(element_names) unique_names = set() for _, k, _ in subset0.gen_spatial_elements(): unique_names.add(k) assert "image3d_xarray" in full_sdata.images assert unique_names == set(element_names) - # no table since the labels are not present in the subset + # no table since neither labels2d nor poly are in the subset assert "table" not in subset0.tables adata = AnnData( @@ -675,7 +676,9 @@ def test_transform_to_data_extent(full_sdata: SpatialData, maintain_positioning: def test_validate_table_in_spatialdata(full_sdata): table = full_sdata["table"] region, region_key, _ = get_table_keys(table) - assert region == "labels2d" + # full_sdata uses two regions (labels2d, poly) so the table annotates both + expected = {"labels2d", "poly"} + assert set(region if isinstance(region, list) else [region]) == expected full_sdata.validate_table_in_spatialdata(table) diff --git a/tests/core/query/test_relational_query.py 
b/tests/core/query/test_relational_query.py index 63e7a6f19..07f7b8c70 100644 --- a/tests/core/query/test_relational_query.py +++ b/tests/core/query/test_relational_query.py @@ -914,6 +914,11 @@ def test_filter_table_non_annotating(full_sdata): def test_labels_table_joins(full_sdata): + # Restrict table to labels2d only so the join returns one row per label (full_sdata default has two regions) + full_sdata["table"].obs["region"] = pd.Categorical( + ["labels2d"] * full_sdata["table"].n_obs + ) + full_sdata["table"].uns["spatialdata_attrs"]["region"] = "labels2d" element_dict, table = join_spatialelement_table( sdata=full_sdata, spatial_element_names="labels2d", From f21bb52e09d08c279abfacff1c71b2cedaecfb93 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 20:26:50 +0100 Subject: [PATCH 10/51] feat: move Dockerfile for storage emulators to facilitate testing --- .../io/remote_storage/Dockerfile.emulators | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename Dockerfile.emulators => tests/io/remote_storage/Dockerfile.emulators (90%) diff --git a/Dockerfile.emulators b/tests/io/remote_storage/Dockerfile.emulators similarity index 90% rename from Dockerfile.emulators rename to tests/io/remote_storage/Dockerfile.emulators index b4846a595..bc3bb6f53 100644 --- a/Dockerfile.emulators +++ b/tests/io/remote_storage/Dockerfile.emulators @@ -1,8 +1,8 @@ -# Storage emulators for tests/io/remote_storage/ (S3, Azure, GCS). +# Storage emulators for tests in this directory (S3, Azure, GCS). # Emulator URLs: S3 127.0.0.1:5000 | Azure 127.0.0.1:10000 | GCS 127.0.0.1:4443 # # Build (from project root): -# docker build -f Dockerfile.emulators -t spatialdata-emulators . +# docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . 
# # Run in background (detached): # docker run --rm -d --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators From 072566a8b9db41e51ff960f559d9923ab1bed07d Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 20:36:23 +0100 Subject: [PATCH 11/51] ci: enhance GitHub Actions workflow to support storage emulators on Linux - Added steps to build and run storage emulators (S3, Azure, GCS) using Docker, specifically for the Ubuntu environment. - Implemented a wait mechanism to ensure emulators are ready before running tests. - Adjusted test execution to skip remote storage tests on non-Linux platforms. --- .github/workflows/test.yaml | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1635bdd2a..cd1d60ade 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -53,13 +53,43 @@ jobs: fi fi uv sync --group=test + # Start storage emulators (S3, Azure, GCS) only on Linux; service containers are not available on Windows/macOS + - name: Build and start storage emulators + if: matrix.os == 'ubuntu-latest' + run: | + docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . + docker run --rm -d --name spatialdata-emulators \ + -p 5000:5000 -p 10000:10000 -p 4443:4443 \ + spatialdata-emulators + - name: Wait for emulator ports + if: matrix.os == 'ubuntu-latest' + run: | + echo "Waiting for S3 (5000), Azure (10000), GCS (4443)..." + python3 -c " + import socket, time + for _ in range(45): + try: + for p in (5000, 10000, 4443): + socket.create_connection(('127.0.0.1', p), timeout=2) + print('Emulators ready.') + break + except (socket.error, OSError): + time.sleep(2) + else: + raise SystemExit('Emulators did not become ready.') + " + # On Linux, emulators run above so full suite (incl. tests/io/remote_storage/) runs. On Windows/macOS, skip remote_storage. 
- name: Test env: MPLBACKEND: agg PLATFORM: ${{ matrix.os }} DISPLAY: :42 run: | - uv run pytest --cov --color=yes --cov-report=xml + if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then + uv run pytest --cov --color=yes --cov-report=xml + else + uv run pytest --cov --color=yes --cov-report=xml --ignore=tests/io/remote_storage/ + fi - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: From ee6e4dc5453a9ee98d509f3496ba0e0e905477d7 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 21:28:29 +0100 Subject: [PATCH 12/51] fix: handle RuntimeError in fsspec async session closure - Wrapped the fsspec async sync function to prevent RuntimeError "Loop is not running" during process exit when using remote storage (Azure, S3, GCS). - Ensured compatibility with async session management in the _utils module. --- src/spatialdata/_io/_utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 747d8ed7b..6424cbab7 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -15,6 +15,7 @@ from pathlib import Path from typing import Any, Literal +import fsspec.asyn as _asyn_mod import zarr from anndata import AnnData from dask._task_spec import Task @@ -670,3 +671,20 @@ def handle_read_errors( else: # on_bad_files == BadFileHandleMethod.ERROR # Let it raise exceptions yield + + +# Avoid RuntimeError "Loop is not running" when fsspec closes async sessions at process exit +# (remote storage: Azure, S3, GCS). _utils is used for all store resolution. 
+_orig_sync = _asyn_mod.sync + + +def _fsspec_sync_wrapped(loop, func, *args, timeout=None, **kwargs): + try: + return _orig_sync(loop, func, *args, timeout=timeout, **kwargs) + except RuntimeError as e: + if "Loop is not running" in str(e) or "different loop" in str(e): + return None + raise + + +_asyn_mod.sync = _fsspec_sync_wrapped From 9019e6aeb00e0665677aa6a8b1b20078cd76490d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Mar 2026 20:51:19 +0000 Subject: [PATCH 13/51] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/core/query/test_relational_query.py | 4 +--- tests/io/test_multi_table.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/core/query/test_relational_query.py b/tests/core/query/test_relational_query.py index 07f7b8c70..c28725681 100644 --- a/tests/core/query/test_relational_query.py +++ b/tests/core/query/test_relational_query.py @@ -915,9 +915,7 @@ def test_filter_table_non_annotating(full_sdata): def test_labels_table_joins(full_sdata): # Restrict table to labels2d only so the join returns one row per label (full_sdata default has two regions) - full_sdata["table"].obs["region"] = pd.Categorical( - ["labels2d"] * full_sdata["table"].n_obs - ) + full_sdata["table"].obs["region"] = pd.Categorical(["labels2d"] * full_sdata["table"].n_obs) full_sdata["table"].uns["spatialdata_attrs"]["region"] = "labels2d" element_dict, table = join_spatialelement_table( sdata=full_sdata, diff --git a/tests/io/test_multi_table.py b/tests/io/test_multi_table.py index 77b17a177..5c6bcf6e2 100644 --- a/tests/io/test_multi_table.py +++ b/tests/io/test_multi_table.py @@ -114,9 +114,7 @@ def test_set_table_annotates_spatialelement(self, full_sdata, tmp_path): tmpdir = Path(tmp_path) / "tmp.zarr" del full_sdata["table"].uns[TableModel.ATTRS_KEY] # full_sdata table has region labels2d+poly; set to labels2d only so 
set_table_annotates_spatialelement succeeds - full_sdata["table"].obs["region"] = pd.Categorical( - ["labels2d"] * full_sdata["table"].n_obs - ) + full_sdata["table"].obs["region"] = pd.Categorical(["labels2d"] * full_sdata["table"].n_obs) with pytest.raises( TypeError, match="No current annotation metadata found. Please specify both region_key and instance_key." ): From 42c31332ce944730012480560ff3ca52ec1c0514 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Mon, 2 Mar 2026 22:14:51 +0100 Subject: [PATCH 14/51] refactor: add type hints to functions in _dask_zarr_compat, _utils, and io_points modules --- src/spatialdata/_io/_dask_zarr_compat.py | 24 +++++++++++++----------- src/spatialdata/_io/_utils.py | 2 +- src/spatialdata/_io/io_points.py | 3 ++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py index a1b643451..b0988aef7 100644 --- a/src/spatialdata/_io/_dask_zarr_compat.py +++ b/src/spatialdata/_io/_dask_zarr_compat.py @@ -8,6 +8,8 @@ from __future__ import annotations +from typing import Any + import dask.array as _da _orig_to_zarr = _da.to_zarr @@ -18,17 +20,17 @@ def _to_zarr( - arr, - url, - component=None, - storage_options=None, - region=None, - compute=True, - return_stored=False, - zarr_array_kwargs=None, - zarr_read_kwargs=None, - **kwargs, -): + arr: Any, + url: Any, + component: Any = None, + storage_options: Any = None, + region: Any = None, + compute: bool = True, + return_stored: bool = False, + zarr_array_kwargs: Any = None, + zarr_read_kwargs: Any = None, + **kwargs: Any, +) -> Any: """Forward deprecated **kwargs into zarr_array_kwargs, excluding _DASK_INTERNAL_KEYS.""" if kwargs: zarr_array_kwargs = dict(zarr_array_kwargs) if zarr_array_kwargs else {} diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 6424cbab7..2a5d44e26 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -678,7 
+678,7 @@ def handle_read_errors( _orig_sync = _asyn_mod.sync -def _fsspec_sync_wrapped(loop, func, *args, timeout=None, **kwargs): +def _fsspec_sync_wrapped(loop: Any, func: Any, *args: Any, timeout: Any = None, **kwargs: Any) -> Any: try: return _orig_sync(loop, func, *args, timeout=timeout, **kwargs) except RuntimeError as e: diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index e41273dcb..684b39a27 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -1,6 +1,7 @@ from __future__ import annotations from pathlib import Path +from typing import Any import zarr from dask.dataframe import DataFrame as DaskDataFrame @@ -92,7 +93,7 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - storage_options: dict = {} + storage_options: dict[str, Any] = {} if isinstance(path, _FsspecStoreRoot): storage_options = _storage_options_from_fs(path._store.fs) points_without_transform.to_parquet(str(path), storage_options=storage_options or None) From 70ababe06437bb86c098f817a900f9d4e5066d74 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 4 Mar 2026 11:41:43 +0100 Subject: [PATCH 15/51] chore: remove pytest-timeout from test dependencies in pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ab9b42e7..cce73720b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,6 @@ test = [ "pytest", "pytest-cov", "pytest-mock", - "pytest-timeout", "torch", ] docs = [ From cae231987f87783a194da7cb7e2726b254c27999 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 4 Mar 2026 12:02:50 +0100 Subject: [PATCH 16/51] test: add unit tests for remote storage store resolution and credential handling --- .../remote_storage/test_resolve_zarr_store.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 tests/io/remote_storage/test_resolve_zarr_store.py diff --git 
a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py new file mode 100644 index 000000000..d8c90d46d --- /dev/null +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -0,0 +1,55 @@ +"""Unit tests for remote-storage-specific store resolution and credential handling. + +Covers only code paths used when reading/writing from remote backends (Azure, S3, GCS): +- _FsspecStoreRoot resolution (used when reading elements from a remote zarr store). +- _storage_options_from_fs for Azure and GCS (used when writing parquet to remote). +""" + +from __future__ import annotations + +from zarr.storage import FsspecStore + +from spatialdata._io._utils import _FsspecStoreRoot, _resolve_zarr_store, _storage_options_from_fs + + +def test_resolve_zarr_store_fsspec_store_root() -> None: + """_FsspecStoreRoot is resolved to FsspecStore when reading from remote (e.g. points/shapes paths).""" + import fsspec + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + fs = fsspec.filesystem("memory") + async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) + base = FsspecStore(async_fs, path="/") + root = _FsspecStoreRoot(base, "/") + store = _resolve_zarr_store(root) + assert isinstance(store, FsspecStore) + + +def test_storage_options_from_fs_azure_account_key() -> None: + """_storage_options_from_fs extracts Azure credentials for writing parquet to remote Azure Blob.""" + + class AzureBlobFileSystemMock: + account_name = "dev" + account_key = "key123" + connection_string = None + anon = None + + AzureBlobFileSystemMock.__name__ = "AzureBlobFileSystem" + out = _storage_options_from_fs(AzureBlobFileSystemMock()) + assert out["account_name"] == "dev" + assert out["account_key"] == "key123" + + +def test_storage_options_from_fs_gcs_endpoint() -> None: + """_storage_options_from_fs extracts GCS endpoint and project for writing parquet to remote GCS.""" + + class GCSFileSystemMock: + token = "anon" + _endpoint = 
"http://localhost:4443" + project = "test" + + GCSFileSystemMock.__name__ = "GCSFileSystem" + out = _storage_options_from_fs(GCSFileSystemMock()) + assert out["token"] == "anon" + assert out["endpoint_url"] == "http://localhost:4443" + assert out["project"] == "test" From fe6bf2455535b49e7ca5edddcac442fbdccaa76a Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 09:48:25 +0200 Subject: [PATCH 17/51] chore(ci): fix GCS emulator tests (gcsfs, sync upload, multi-arch) --- .github/workflows/test.yaml | 2 ++ src/spatialdata/_io/io_shapes.py | 23 ++++++++------------ tests/conftest.py | 4 ++++ tests/io/remote_storage/Dockerfile.emulators | 9 ++++++-- tests/io/remote_storage/conftest.py | 14 +++++++++++- 5 files changed, 35 insertions(+), 17 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index cd1d60ade..a626165c8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -84,6 +84,8 @@ jobs: MPLBACKEND: agg PLATFORM: ${{ matrix.os }} DISPLAY: :42 + # gcsfs otherwise defaults to ExtendedGcsFileSystem (prod Storage Control gRPC; breaks fake-gcs-server). 
+ GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT: "false" run: | if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then uv run pytest --cov --color=yes --cov-report=xml diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index adf4716f3..cd521f51b 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import json import os import tempfile from pathlib import Path @@ -65,7 +66,8 @@ def _read_shapes( store_root = _get_store_root(f.store_path.store) path = store_root / f.path / "shapes.parquet" if isinstance(path, _FsspecStoreRoot): - geo_df = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + opts = _storage_options_from_fs(path._store.fs) + geo_df = read_parquet(str(path), storage_options=opts if opts else {}) else: geo_df = read_parquet(path) else: @@ -195,18 +197,6 @@ def _upload_parquet_to_s3(tmp_path: str, bucket: str, key: str, fs: Any) -> None s3.upload_file(tmp_path, bucket, key) -def _upload_parquet_to_gcs(tmp_path: str, bucket: str, key: str, fs: Any) -> None: - from google.auth.credentials import AnonymousCredentials - from google.cloud import storage - - client = storage.Client( - credentials=AnonymousCredentials(), - project=getattr(fs, "project", None) or "test", - ) - blob = client.bucket(bucket).blob(key) - blob.upload_from_filename(tmp_path) - - def _upload_parquet_to_fsspec(path: _FsspecStoreRoot, tmp_path: str) -> None: """Upload local parquet file to remote fsspec store using sync APIs to avoid event-loop issues.""" fs = path._store.fs @@ -217,7 +207,12 @@ def _upload_parquet_to_fsspec(path: _FsspecStoreRoot, tmp_path: str) -> None: elif fs_name in ("S3FileSystem", "MotoS3FS"): _upload_parquet_to_s3(tmp_path, bucket, key, fs) elif fs_name == "GCSFileSystem": - _upload_parquet_to_gcs(tmp_path, bucket, key, fs) + import fsspec + + fs_dict = json.loads(fs.to_json()) + fs_dict["asynchronous"] = False + 
sync_fs = fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) + sync_fs.put_file(tmp_path, path._path) else: fs.put(tmp_path, str(path)) diff --git a/tests/conftest.py b/tests/conftest.py index a6deba0ae..a9aa8ebaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,9 @@ from __future__ import annotations +import os + +os.environ.setdefault("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false") + from collections.abc import Sequence from pathlib import Path from typing import Any diff --git a/tests/io/remote_storage/Dockerfile.emulators b/tests/io/remote_storage/Dockerfile.emulators index bc3bb6f53..43b6835e6 100644 --- a/tests/io/remote_storage/Dockerfile.emulators +++ b/tests/io/remote_storage/Dockerfile.emulators @@ -17,12 +17,17 @@ FROM node:20-slim RUN apt-get update && apt-get install -y --no-install-recommends \ python3 python3-pip python3-venv curl ca-certificates \ && rm -rf /var/lib/apt/lists/* +RUN npm install -g azurite RUN python3 -m venv /opt/venv && /opt/venv/bin/pip install --no-cache-dir 'moto[server]' ENV PATH="/opt/venv/bin:$PATH" -RUN cd /tmp && curl -sSL -o fgs.tgz https://github.com/fsouza/fake-gcs-server/releases/download/v1.54.0/fake-gcs-server_1.54.0_linux_amd64.tar.gz \ +# fake-gcs-server must match the image CPU. `ARG TARGETARCH=amd64` can stay amd64 on arm64 builds. 
+RUN set -eux; \ + arch="$(uname -m)"; \ + case "$arch" in x86_64) fgs=amd64 ;; aarch64|arm64) fgs=arm64 ;; *) echo "unsupported arch: $arch" >&2; exit 1 ;; esac; \ + cd /tmp && curl -fsSL -o fgs.tgz "https://github.com/fsouza/fake-gcs-server/releases/download/v1.54.0/fake-gcs-server_1.54.0_linux_${fgs}.tar.gz" \ && tar xzf fgs.tgz && mv fake-gcs-server /usr/local/bin/ 2>/dev/null || mv fake-gcs-server_*/fake-gcs-server /usr/local/bin/ \ && chmod +x /usr/local/bin/fake-gcs-server && rm -f fgs.tgz RUN mkdir -p /data EXPOSE 5000 10000 4443 -RUN echo 'moto_server -H 0.0.0.0 -p 5000 & npx --yes azurite --silent --location /data --blobHost 0.0.0.0 --skipApiVersionCheck & fake-gcs-server -scheme http -port 4443 & wait' > /start.sh && chmod +x /start.sh +RUN echo 'moto_server -H 0.0.0.0 -p 5000 & azurite --silent --location /data --blobHost 0.0.0.0 --skipApiVersionCheck & fake-gcs-server -scheme http -port 4443 & wait' > /start.sh && chmod +x /start.sh CMD ["/bin/sh", "/start.sh"] diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index c650ab53c..0d4d08da7 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -13,6 +13,16 @@ import pytest + +def _ensure_gcs_emulator_env() -> None: + """Point google-cloud-storage / gcsfs defaults at fake-gcs-server (not production).""" + raw = os.environ.get("STORAGE_EMULATOR_HOST", "").strip() + if raw in ("", "default"): + os.environ["STORAGE_EMULATOR_HOST"] = "http://127.0.0.1:4443" + elif not raw.startswith(("http://", "https://")): + os.environ["STORAGE_EMULATOR_HOST"] = f"http://{raw}" + + # Error messages from asyncio when closing sessions after the event loop is gone (e.g. 
at process exit) _LOOP_GONE_ERRORS = ("different loop", "Loop is not running") @@ -155,7 +165,7 @@ def _ensure_gcs_buckets(host: str) -> None: client.create_bucket(name) -def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 60.0, check_interval: float = 2.0) -> None: +def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 10.0, check_interval: float = 2.0) -> None: """Wait until all three emulator ports accept connections (e.g. after docker run).""" deadline = time.monotonic() + timeout while time.monotonic() < deadline: @@ -187,6 +197,8 @@ def _remote_storage_buckets_containers(): def pytest_collection_modifyitems(config: pytest.Config, items: list) -> None: """Inject bucket/container creation for test_remote_storage.py.""" + if any("remote_storage" in str(getattr(item, "path", None) or getattr(item, "fspath", "")) for item in items): + _ensure_gcs_emulator_env() for item in items: path = getattr(item, "path", None) or getattr(item, "fspath", None) if path and "test_remote_storage" in str(path): From 3cb2c9366fc352b53a276e780d54a2e07676430b Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 10:12:45 +0200 Subject: [PATCH 18/51] refactor: remove deprecated dask array compatibility layer --- src/spatialdata/_io/__init__.py | 2 - src/spatialdata/_io/_dask_zarr_compat.py | 55 ------------------------ 2 files changed, 57 deletions(-) delete mode 100644 src/spatialdata/_io/_dask_zarr_compat.py diff --git a/src/spatialdata/_io/__init__.py b/src/spatialdata/_io/__init__.py index 9e4b11de1..38ff8c6bb 100644 --- a/src/spatialdata/_io/__init__.py +++ b/src/spatialdata/_io/__init__.py @@ -1,7 +1,5 @@ from __future__ import annotations -# Patch da.to_zarr so ome_zarr's **kwargs are passed as zarr_array_kwargs (avoids FutureWarning) -import spatialdata._io._dask_zarr_compat # noqa: F401 from spatialdata._io._utils import get_dask_backing_files from spatialdata._io.format import SpatialDataFormatType from 
spatialdata._io.io_points import write_points diff --git a/src/spatialdata/_io/_dask_zarr_compat.py b/src/spatialdata/_io/_dask_zarr_compat.py deleted file mode 100644 index b0988aef7..000000000 --- a/src/spatialdata/_io/_dask_zarr_compat.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Compatibility layer for dask.array.to_zarr when callers pass array options via **kwargs. - -ome_zarr.writer calls da.to_zarr(..., **options) with array options (compressor, dimension_names, -etc.). Dask deprecated **kwargs in favor of zarr_array_kwargs. This module patches da.to_zarr to -forward such kwargs into zarr_array_kwargs (excluding dask-internal keys like zarr_format that -zarr.Group.create_array() does not accept), avoiding the FutureWarning and keeping behavior correct. -""" - -from __future__ import annotations - -from typing import Any - -import dask.array as _da - -_orig_to_zarr = _da.to_zarr - -# Keys from ome_zarr/dask **kwargs that must not be passed to zarr.Group.create_array() -# dimension_separator: not accepted by all zarr versions in the create_array() path. 
-_DASK_INTERNAL_KEYS = frozenset({"zarr_format", "dimension_separator"}) - - -def _to_zarr( - arr: Any, - url: Any, - component: Any = None, - storage_options: Any = None, - region: Any = None, - compute: bool = True, - return_stored: bool = False, - zarr_array_kwargs: Any = None, - zarr_read_kwargs: Any = None, - **kwargs: Any, -) -> Any: - """Forward deprecated **kwargs into zarr_array_kwargs, excluding _DASK_INTERNAL_KEYS.""" - if kwargs: - zarr_array_kwargs = dict(zarr_array_kwargs) if zarr_array_kwargs else {} - for k, v in kwargs.items(): - if k not in _DASK_INTERNAL_KEYS: - zarr_array_kwargs[k] = v - kwargs = {} - return _orig_to_zarr( - arr, - url, - component=component, - storage_options=storage_options, - region=region, - compute=compute, - return_stored=return_stored, - zarr_array_kwargs=zarr_array_kwargs, - zarr_read_kwargs=zarr_read_kwargs, - **kwargs, - ) - - -_da.to_zarr = _to_zarr From 6cf359a108f7cb45e35f0da9e61a90d18bc0c94b Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 10:52:45 +0200 Subject: [PATCH 19/51] Improve path handling in FsspecStore and update read_parquet options --- src/spatialdata/_io/_utils.py | 16 +++++++++++++--- src/spatialdata/_io/io_points.py | 3 ++- tests/io/remote_storage/test_remote_storage.py | 4 ++-- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 2a5d44e26..4553ab664 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -41,6 +41,15 @@ from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes +def _join_fsspec_store_path(store_path: str, relative_path: str) -> str: + """Combine FsspecStore root with a zarr group path using POSIX ``/`` (fsspec keys; safe on Windows).""" + base = str(store_path).replace("\\", "/").rstrip("/") + rel = str(relative_path).replace("\\", "/").lstrip("/") + if not base: + return f"/{rel}" if rel else "/" + return 
f"{base}/{rel}" if rel else base + + class _FsspecStoreRoot: """Path-like root for FsspecStore (no .root attribute); supports __truediv__ and str() as full URL.""" @@ -48,10 +57,11 @@ class _FsspecStoreRoot: def __init__(self, store: FsspecStore, path: str | None = None) -> None: self._store = store - self._path = (path or store.path).rstrip("/") + raw = path or store.path + self._path = str(raw).replace("\\", "/").rstrip("/") def __truediv__(self, other: str | Path) -> _FsspecStoreRoot: - return _FsspecStoreRoot(self._store, self._path + "/" + str(other).lstrip("/")) + return _FsspecStoreRoot(self._store, _join_fsspec_store_path(self._path, str(other))) def __str__(self) -> str: protocol = getattr(self._store.fs, "protocol", None) @@ -597,8 +607,8 @@ def _resolve_zarr_store( # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group return FsspecStore( - path.store.path + "/" + path.path, fs=_ensure_async_fs(path.store.fs), + path=_join_fsspec_store_path(path.store.path, path.path), **kwargs, ) if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 684b39a27..be2e30796 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -41,7 +41,8 @@ def _read_points( # cache on remote file needed for parquet reader to work # TODO: allow reading in the metadata without caching all the data if isinstance(path, _FsspecStoreRoot): - points = read_parquet(str(path), storage_options=_storage_options_from_fs(path._store.fs)) + opts = _storage_options_from_fs(path._store.fs) + points = read_parquet(str(path), storage_options=opts if opts else {}) else: points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) assert isinstance(points, DaskDataFrame) diff --git a/tests/io/remote_storage/test_remote_storage.py 
b/tests/io/remote_storage/test_remote_storage.py index 44685061a..fa72ff914 100644 --- a/tests/io/remote_storage/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -2,7 +2,7 @@ Emulators must be running (e.g. Docker: docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. -tests/io/conftest.py creates the required buckets/containers when emulators are up. +tests/io/remote_storage/conftest.py creates buckets/containers when emulators are up. All remote paths use uuid.uuid4().hex so each test run writes to a unique location. """ @@ -66,7 +66,7 @@ def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: ids=["azure", "s3", "gcs"], ) -# Ensure buckets/containers exist on emulators before any test (see tests/io/conftest.py) +# Ensure buckets/containers exist on emulators before any test (see tests/io/remote_storage/conftest.py). pytestmark = pytest.mark.usefixtures("_remote_storage_buckets_containers") From df7be9a0341a05b671118683e9d78b4d20f509cc Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 12:01:52 +0200 Subject: [PATCH 20/51] Add fsspec integration by adding support for cloud object store protocols and improving storage options handling for parquet files. 
--- src/spatialdata/_io/_utils.py | 104 +++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 27 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 4553ab664..adc25a0ce 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -75,38 +75,83 @@ def __fspath__(self) -> str: return str(self) +_PARQUET_FSSPEC_NAMES: frozenset[str] = frozenset( + {"AzureBlobFileSystem", "ExtendedGcsFileSystem", "GCSFileSystem", "MotoS3FS", "S3FileSystem"} +) +_CLOUD_OBJECT_STORE_PROTOCOLS: frozenset[str] = frozenset({"abfs", "adl", "az", "gcs", "gs", "s3", "s3a"}) + + +def _unwrap_fsspec_sync_fs(fs: Any) -> Any: + inner = getattr(fs, "sync_fs", None) + if inner is not None and inner is not fs: + return _unwrap_fsspec_sync_fs(inner) + return fs + + +def _fsspec_protocols(core: Any) -> set[str]: + raw = getattr(core, "protocol", None) + if isinstance(raw, str): + return {raw} + if isinstance(raw, (list, tuple)): + return set(raw) + return set() + + +def _require_known_parquet_fsspec(core: Any) -> None: + if type(core).__name__ in _PARQUET_FSSPEC_NAMES: + return + supported = ", ".join(sorted(_PARQUET_FSSPEC_NAMES)) + label = f"{type(core).__module__}.{type(core).__qualname__}" + raise ValueError( + f"Cannot derive parquet storage_options from filesystem {label!r}. Supported filesystem classes: {supported}." + ) + + +def _check_fsspec_at_remote_store_open(fs: Any) -> None: + """If ``fs`` looks like S3/GCS/Azure, ensure we can build parquet ``storage_options`` for it.""" + core = _unwrap_fsspec_sync_fs(fs) + if not (_fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS): + return + _require_known_parquet_fsspec(core) + + def _storage_options_from_fs(fs: Any) -> dict[str, Any]: - """Build storage_options dict from an fsspec filesystem for use with to_parquet/write_parquet. + """Build storage_options dict from an fsspec filesystem for use with to_parquet/read_parquet. 
- Ensures parquet writes to remote stores (Azure, S3, GCS) use the same credentials as the - zarr store. + Unwraps ``sync_fs`` chains (e.g. async wrappers). Raises if the implementation is not one we + support for adlfs / s3fs / gcsfs-style credentials. """ + core = _unwrap_fsspec_sync_fs(fs) + _require_known_parquet_fsspec(core) out: dict[str, Any] = {} - name = type(fs).__name__ + name = type(core).__name__ if name == "AzureBlobFileSystem": - if getattr(fs, "connection_string", None): - out["connection_string"] = fs.connection_string - elif getattr(fs, "account_name", None) and getattr(fs, "account_key", None): - out["account_name"] = fs.account_name - out["account_key"] = fs.account_key - if getattr(fs, "anon", None) is not None: - out["anon"] = fs.anon + if getattr(core, "connection_string", None): + out["connection_string"] = core.connection_string + elif getattr(core, "account_name", None) and getattr(core, "account_key", None): + out["account_name"] = core.account_name + out["account_key"] = core.account_key + if getattr(core, "anon", None) is not None: + out["anon"] = core.anon elif name in ("S3FileSystem", "MotoS3FS"): - if getattr(fs, "endpoint_url", None): - out["endpoint_url"] = fs.endpoint_url - if getattr(fs, "key", None): - out["key"] = fs.key - if getattr(fs, "secret", None): - out["secret"] = fs.secret - if getattr(fs, "anon", None) is not None: - out["anon"] = fs.anon - elif name == "GCSFileSystem": - if getattr(fs, "token", None) is not None: - out["token"] = fs.token - if getattr(fs, "_endpoint", None): - out["endpoint_url"] = fs._endpoint - if getattr(fs, "project", None): - out["project"] = fs.project + if getattr(core, "endpoint_url", None): + out["endpoint_url"] = core.endpoint_url + if getattr(core, "key", None): + out["key"] = core.key + if getattr(core, "secret", None): + out["secret"] = core.secret + if getattr(core, "anon", None) is not None: + out["anon"] = core.anon + elif name in ("GCSFileSystem", "ExtendedGcsFileSystem"): + if 
getattr(core, "token", None) is not None: + out["token"] = core.token + if getattr(core, "_endpoint", None): + out["endpoint_url"] = core._endpoint + if getattr(core, "project", None): + out["project"] = core.project + else: + raise AssertionError(f"Unhandled fsspec class {name!r} (out of sync with _PARQUET_FSSPEC_NAMES)") + return out @@ -587,7 +632,9 @@ def _resolve_zarr_store( TypeError If the input type is unsupported. ValueError - If a `zarr.Group` has an unsupported store type. + If a `zarr.Group` has an unsupported store type, or if the fsspec filesystem uses a cloud + object-store protocol (S3, GCS, Azure, …) but is not a supported implementation for parquet + ``storage_options`` (see :func:`_check_fsspec_at_remote_store_open`). """ # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): @@ -606,6 +653,7 @@ def _resolve_zarr_store( if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group + _check_fsspec_at_remote_store_open(path.store.fs) return FsspecStore( fs=_ensure_async_fs(path.store.fs), path=_join_fsspec_store_path(path.store.path, path.path), @@ -617,9 +665,11 @@ def _resolve_zarr_store( raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, _FsspecStoreRoot): # path-like from read_zarr that carries the same fs (preserves Azure/GCS credentials) + _check_fsspec_at_remote_store_open(path._store.fs) return FsspecStore(_ensure_async_fs(path._store.fs), path=path._path, **kwargs) if isinstance(path, UPath): # if input is a remote UPath, map it to an FSStore (check before StoreLike to avoid UnionType isinstance) + _check_fsspec_at_remote_store_open(path.fs) return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) if isinstance(path, zarr.storage.StoreLike): # if the input already a store, wrap it in an FSStore From 
a0bcc65a36c516d84a41b41e08fa47622351ad79 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 13:54:51 +0200 Subject: [PATCH 21/51] Enhance path handling for hierarchical URIs in SpatialData and related utilities. --- src/spatialdata/_core/spatialdata.py | 13 ++++++++++--- src/spatialdata/_io/_utils.py | 1 - src/spatialdata/_io/io_zarr.py | 19 +++++++++++++++++-- .../io/remote_storage/test_remote_storage.py | 7 +++++++ 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 810713d45..3c06f8f44 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1043,7 +1043,11 @@ def _validate_can_safely_write_to_path( _resolve_zarr_store, ) - if isinstance(file_path, str): + # Hierarchical URIs (``scheme://…``) must become UPath: plain ``Path(str)`` breaks cloud URLs + # (S3-compatible stores, Azure ``abfs://`` / ``az://``, GCS ``gs://``, ``https://``, fsspec chains, etc.). + if isinstance(file_path, str) and "://" in file_path: + file_path = UPath(file_path) + elif isinstance(file_path, str): file_path = Path(file_path) if not isinstance(file_path, (Path, UPath)): @@ -1186,9 +1190,12 @@ def write( if self.path is None: raise ValueError("file_path must be provided when SpatialData.path is not set.") file_path = self.path - if isinstance(file_path, str): + # Hierarchical URIs (``scheme://…``) must become UPath: plain ``Path(str)`` breaks cloud URLs + # (S3-compatible stores, Azure ``abfs://`` / ``az://``, GCS ``gs://``, ``https://``, fsspec chains, etc.). 
+ if isinstance(file_path, str) and "://" in file_path: + file_path = UPath(file_path) + elif isinstance(file_path, str): file_path = Path(file_path) - # Keep UPath as-is; do not convert to Path self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index adc25a0ce..9050348c4 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -638,7 +638,6 @@ def _resolve_zarr_store( """ # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): - # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 48795513c..f3506beed 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Literal, cast -import zarr.storage +import zarr from anndata import AnnData from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -14,6 +14,7 @@ from pyarrow import ArrowInvalid from upath import UPath from zarr.errors import ArrayNotFoundError +from zarr.storage import FsspecStore, LocalStore from spatialdata._core.spatialdata import SpatialData from spatialdata._io._utils import ( @@ -232,7 +233,21 @@ def read_zarr( tables=tables, attrs=attrs, ) - sdata.path = store if isinstance(store, UPath) else resolved_store.root + if isinstance(store, UPath): + sdata.path = store + elif isinstance(store, str): + sdata.path = UPath(store) if "://" in store else Path(store) + elif isinstance(store, Path): + sdata.path = store + elif isinstance(store, zarr.Group): + if isinstance(resolved_store, LocalStore): + sdata.path = Path(resolved_store.root) + elif isinstance(resolved_store, FsspecStore): + sdata.path = 
UPath(str(_FsspecStoreRoot(resolved_store))) + else: + sdata.path = None + else: + sdata.path = None return sdata diff --git a/tests/io/remote_storage/test_remote_storage.py b/tests/io/remote_storage/test_remote_storage.py index fa72ff914..065211910 100644 --- a/tests/io/remote_storage/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -136,6 +136,13 @@ def test_write_read_roundtrip_remote(self, full_sdata: SpatialData, get_upath, s assert isinstance(full_sdata.path, UPath) assert full_sdata.path == upath _assert_read_identical(full_sdata, upath) + # ``str(upath)`` drops storage options on the UPath; S3 against moto still works via + # ``AWS_*`` / ``AWS_ENDPOINT_URL`` from conftest. Azure/GCS strings would omit credentials + # or emulator endpoints, so we only assert the string-URL read path for S3 here. + if storage_name == "s3": + sdata_str_url = SpatialData.read(str(upath)) + assert isinstance(sdata_str_url.path, UPath) + assert_spatial_data_objects_are_identical(full_sdata, sdata_str_url) @REMOTE_STORAGE_PARAMS def test_path_setter_with_remote_then_operations( From f1cc6516897683aea3e2ca9a5e9c69d19cb9496b Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 13:57:51 +0200 Subject: [PATCH 22/51] Ensure existing Zarr stores are returned unchanged in _resolve_zarr_store --- src/spatialdata/_io/_utils.py | 5 +++-- tests/io/remote_storage/test_resolve_zarr_store.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 9050348c4..d776d35f9 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -671,8 +671,9 @@ def _resolve_zarr_store( _check_fsspec_at_remote_store_open(path.fs) return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) if isinstance(path, zarr.storage.StoreLike): - # if the input already a store, wrap it in an FSStore - return FsspecStore(path, **kwargs) + # Already a 
concrete store (LocalStore, FsspecStore, MemoryStore, …). Do not pass it as ``fs=`` to + # FsspecStore — that only accepts an async fsspec filesystem and raises on stores (e.g. ``async_impl``). + return path raise TypeError(f"Unsupported type: {type(path)}") diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py index d8c90d46d..c34f26eee 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -7,11 +7,21 @@ from __future__ import annotations -from zarr.storage import FsspecStore +import tempfile + +from zarr.storage import FsspecStore, LocalStore, MemoryStore from spatialdata._io._utils import _FsspecStoreRoot, _resolve_zarr_store, _storage_options_from_fs +def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: + """StoreLike inputs must not be wrapped as FsspecStore(fs=store) — that is only for async filesystems.""" + mem = MemoryStore() + assert _resolve_zarr_store(mem) is mem + loc = LocalStore(tempfile.mkdtemp()) + assert _resolve_zarr_store(loc) is loc + + def test_resolve_zarr_store_fsspec_store_root() -> None: """_FsspecStoreRoot is resolved to FsspecStore when reading from remote (e.g. 
points/shapes paths).""" import fsspec From 55ba3d08f7e2e7e3fbd1924c5000533fb1a2c7ca Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:05:43 +0200 Subject: [PATCH 23/51] remove unused fsspec async handling code and update related test documentation --- src/spatialdata/_io/_utils.py | 18 ------------------ tests/io/remote_storage/conftest.py | 6 +++++- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index d776d35f9..81623cc6b 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -15,7 +15,6 @@ from pathlib import Path from typing import Any, Literal -import fsspec.asyn as _asyn_mod import zarr from anndata import AnnData from dask._task_spec import Task @@ -731,20 +730,3 @@ def handle_read_errors( else: # on_bad_files == BadFileHandleMethod.ERROR # Let it raise exceptions yield - - -# Avoid RuntimeError "Loop is not running" when fsspec closes async sessions at process exit -# (remote storage: Azure, S3, GCS). _utils is used for all store resolution. -_orig_sync = _asyn_mod.sync - - -def _fsspec_sync_wrapped(loop: Any, func: Any, *args: Any, timeout: Any = None, **kwargs: Any) -> Any: - try: - return _orig_sync(loop, func, *args, timeout=timeout, **kwargs) - except RuntimeError as e: - if "Loop is not running" in str(e) or "different loop" in str(e): - return None - raise - - -_asyn_mod.sync = _fsspec_sync_wrapped diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 0d4d08da7..9bcc5af5e 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -28,7 +28,11 @@ def _ensure_gcs_emulator_env() -> None: def _patch_fsspec_sync_for_shutdown() -> None: - """If fsspec.asyn.sync() runs at exit when the loop is gone, return None instead of raising.""" + """If fsspec.asyn.sync() runs at exit when the loop is gone, return None instead of raising. 
+ + SpatialData does not patch ``fsspec.asyn.sync`` at import time (too broad for a library); this + hook runs only for pytest sessions that load this conftest (remote emulator tests). + """ import fsspec.asyn as asyn_mod _orig = asyn_mod.sync From 0e2e424800279639fe0641fad132cf9c7b217e71 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:17:14 +0200 Subject: [PATCH 24/51] Updating the path setter to accept strings and normalize them to Path or UPath, and add tests to verify correct coercion of string paths to appropriate types. --- src/spatialdata/_core/spatialdata.py | 14 +++++++++++--- tests/io/test_readwrite.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 3c06f8f44..f3ce37226 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -549,13 +549,21 @@ def is_backed(self) -> bool: @property def path(self) -> Path | UPath | None: - """Path to the Zarr storage.""" + """Path to the Zarr storage (always :class:`pathlib.Path` or :class:`upath.UPath` when set).""" return self._path @path.setter - def path(self, value: Path | UPath | None) -> None: - if value is None or isinstance(value, (str, Path, UPath)): + def path(self, value: str | Path | UPath | None) -> None: + if value is None: + self._path = None + elif isinstance(value, (Path, UPath)): self._path = value + elif isinstance(value, str): + # Match ``write()`` / ``_validate_can_safely_write_to_path``: keep ``self._path`` as Path | UPath only. 
+ if "://" in value: + self._path = UPath(value) + else: + self._path = Path(value) else: raise TypeError("Path must be `None`, a `str`, a `Path` or a `UPath` object.") diff --git a/tests/io/test_readwrite.py b/tests/io/test_readwrite.py index 209a43046..bc220c073 100644 --- a/tests/io/test_readwrite.py +++ b/tests/io/test_readwrite.py @@ -1190,6 +1190,17 @@ def test_read_sdata(tmp_path: Path, points: SpatialData) -> None: assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_zarr_group) +def test_path_setter_coerces_str_to_path_or_upath(tmp_path: Path) -> None: + """``SpatialData.path`` is stored as Path | UPath | None; strings are normalized like ``write()``.""" + sdata = SpatialData() + p = tmp_path / "store.zarr" + sdata.path = str(p) + assert isinstance(sdata.path, Path) + assert sdata.path == p + sdata.path = "s3://bucket/key.zarr" + assert isinstance(sdata.path, UPath) + + def test_sdata_with_nan_in_obs(tmp_path: Path) -> None: """Test writing SpatialData with mixed string/NaN values in obs works correctly. From ce20830e1e87f24a00178cb43b86f3ed90f56074 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:19:20 +0200 Subject: [PATCH 25/51] write method safeguards for local and remote paths in SpatialData. --- src/spatialdata/_core/spatialdata.py | 11 ++++++++++- src/spatialdata/_io/_utils.py | 11 +++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index f3ce37226..184421ed2 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1044,6 +1044,13 @@ def _validate_can_safely_write_to_path( overwrite: bool = False, saving_an_element: bool = False, ) -> None: + """ + Guard against unsafe writes for **local** paths (zarr check, Dask backing, subfolders). + + For :class:`upath.UPath`, only "store exists vs ``overwrite``" is checked. 
Local Dask-backing + and subfolder checks are omitted because backing paths are filesystem-local and are not + compared to object-store keys; ``overwrite=True`` on remote URLs must be chosen carefully. + """ from spatialdata._io._utils import ( _backed_elements_contained_in_path, _is_subfolder, @@ -1151,7 +1158,9 @@ def write( The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). overwrite If `True`, overwrite the Zarr store if it already exists. If `False`, `write()` will fail if the Zarr store - already exists. + already exists. For remote paths (:class:`upath.UPath`), the extra safeguards used for local paths (that + Dask-backed files are not inside the write target) are not applied; use ``overwrite=True`` only when you + are sure the destination store may be replaced. consolidate_metadata If `True`, triggers :func:`zarr.convenience.consolidate_metadata`, which writes all the metadata in a single file at the root directory of the store. This makes the data cloud accessible, which is required for certain diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 81623cc6b..ec71e2031 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -516,9 +516,14 @@ def _backed_elements_contained_in_path( ----- If an object does not have a Dask computational graph, it will return an empty list. It is possible for a single SpatialElement to contain multiple files in their Dask computational graph. + + For a remote ``path`` (:class:`upath.UPath`), this always returns an empty list: Dask backing paths + are resolved as local filesystem paths, so they cannot be compared to object-store locations. + :meth:`spatialdata.SpatialData.write` therefore skips the local "backing files in target" guard + for remote targets; ``overwrite=True`` on a remote URL must be used only when overwriting is safe. 
""" if isinstance(path, UPath): - return [] # no local backing files are "contained" in a remote path + return [] if not isinstance(path, Path): raise TypeError(f"Expected a Path or UPath object, got {type(path)}") return [_is_subfolder(parent=path, child=Path(fp)) for fp in get_dask_backing_files(object)] @@ -552,8 +557,10 @@ def _is_element_self_contained( element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, element_path: Path | UPath, ) -> bool: + """Whether element Dask graphs only reference files under ``element_path`` (local) or N/A (remote).""" if isinstance(element_path, UPath): - return True # treat remote-backed as self-contained for this check + # Backing-file paths are local; cannot relate them to remote keys—assume OK for this heuristic. + return True if isinstance(element, DaskDataFrame): pass # TODO when running test_save_transformations it seems that for the same element this is called multiple times From fbc3040eb04358319c0defc2a93bde4082117e7e Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 14:33:56 +0200 Subject: [PATCH 26/51] Support for UPath in data reading functions and improve error handling for unsupported protocols in storage options, and add test cases to validate new functionality and ensure compatibility with cloud object store protocols. 
--- .github/workflows/test.yaml | 4 +- src/spatialdata/_io/_utils.py | 76 +++++++++++++------ src/spatialdata/_io/io_points.py | 5 +- src/spatialdata/_io/io_shapes.py | 5 +- src/spatialdata/_io/io_table.py | 3 +- tests/io/remote_storage/conftest.py | 9 ++- .../remote_storage/test_resolve_zarr_store.py | 32 +++++++- 7 files changed, 97 insertions(+), 37 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a626165c8..df6637ea9 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -53,7 +53,9 @@ jobs: fi fi uv sync --group=test - # Start storage emulators (S3, Azure, GCS) only on Linux; service containers are not available on Windows/macOS + # Start storage emulators (S3, Azure, GCS) only on Linux; Docker service containers are not available on + # Windows/macOS runners, so tests/io/remote_storage/ is skipped there (see Test step). Remote I/O is still + # exercised on every PR via the Ubuntu matrix jobs. - name: Build and start storage emulators if: matrix.os == 'ubuntu-latest' run: | diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index ec71e2031..9fc247c69 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -74,9 +74,6 @@ def __fspath__(self) -> str: return str(self) -_PARQUET_FSSPEC_NAMES: frozenset[str] = frozenset( - {"AzureBlobFileSystem", "ExtendedGcsFileSystem", "GCSFileSystem", "MotoS3FS", "S3FileSystem"} -) _CLOUD_OBJECT_STORE_PROTOCOLS: frozenset[str] = frozenset({"abfs", "adl", "az", "gcs", "gs", "s3", "s3a"}) @@ -96,35 +93,54 @@ def _fsspec_protocols(core: Any) -> set[str]: return set() -def _require_known_parquet_fsspec(core: Any) -> None: - if type(core).__name__ in _PARQUET_FSSPEC_NAMES: - return - supported = ", ".join(sorted(_PARQUET_FSSPEC_NAMES)) - label = f"{type(core).__module__}.{type(core).__qualname__}" - raise ValueError( - f"Cannot derive parquet storage_options from filesystem {label!r}. 
Supported filesystem classes: {supported}." - ) +def _cloud_parquet_protocol_family(core: Any) -> Literal["azure", "gcs", "s3"] | None: + """Map fsspec filesystem protocol(s) to how we extract parquet ``storage_options`` (not by class name).""" + protos = _fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS + if not protos: + return None + if protos & {"s3", "s3a"}: + return "s3" + if protos & {"abfs", "adl", "az"}: + return "azure" + if protos & {"gcs", "gs"}: + return "gcs" + return None def _check_fsspec_at_remote_store_open(fs: Any) -> None: """If ``fs`` looks like S3/GCS/Azure, ensure we can build parquet ``storage_options`` for it.""" core = _unwrap_fsspec_sync_fs(fs) - if not (_fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS): + protos = _fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS + if not protos: return - _require_known_parquet_fsspec(core) + if _cloud_parquet_protocol_family(core) is None: + label = f"{type(core).__module__}.{type(core).__qualname__}" + raise ValueError( + f"Cannot derive parquet storage_options for filesystem {label!r} with protocol(s) {protos!r}. " + "Supported protocol families: S3 (s3, s3a), Azure (abfs, adl, az), GCS (gcs, gs). " + "Custom implementations should expose a matching ``protocol`` (see fsspec)." + ) def _storage_options_from_fs(fs: Any) -> dict[str, Any]: """Build storage_options dict from an fsspec filesystem for use with to_parquet/read_parquet. - Unwraps ``sync_fs`` chains (e.g. async wrappers). Raises if the implementation is not one we - support for adlfs / s3fs / gcsfs-style credentials. + Unwraps ``sync_fs`` chains (e.g. async wrappers). Dispatches by **reported fsspec protocol** (``fs.protocol``), + not by concrete class name, so subclasses and thin wrappers that speak ``s3``/``gs``/``az`` still work as long as + they expose the credential attributes we copy (same shape as s3fs, gcsfs, adlfs). 
""" core = _unwrap_fsspec_sync_fs(fs) - _require_known_parquet_fsspec(core) + family = _cloud_parquet_protocol_family(core) + if family is None: + label = f"{type(core).__module__}.{type(core).__qualname__}" + protos = _fsspec_protocols(core) + raise ValueError( + f"Cannot derive parquet storage_options from filesystem {label!r} (protocols {protos!r}). " + "Expected an object-store protocol among " + f"{sorted(_CLOUD_OBJECT_STORE_PROTOCOLS)}." + ) out: dict[str, Any] = {} - name = type(core).__name__ - if name == "AzureBlobFileSystem": + if family == "azure": if getattr(core, "connection_string", None): out["connection_string"] = core.connection_string elif getattr(core, "account_name", None) and getattr(core, "account_key", None): @@ -132,7 +148,7 @@ def _storage_options_from_fs(fs: Any) -> dict[str, Any]: out["account_key"] = core.account_key if getattr(core, "anon", None) is not None: out["anon"] = core.anon - elif name in ("S3FileSystem", "MotoS3FS"): + elif family == "s3": if getattr(core, "endpoint_url", None): out["endpoint_url"] = core.endpoint_url if getattr(core, "key", None): @@ -141,7 +157,7 @@ def _storage_options_from_fs(fs: Any) -> dict[str, Any]: out["secret"] = core.secret if getattr(core, "anon", None) is not None: out["anon"] = core.anon - elif name in ("GCSFileSystem", "ExtendedGcsFileSystem"): + elif family == "gcs": if getattr(core, "token", None) is not None: out["token"] = core.token if getattr(core, "_endpoint", None): @@ -149,7 +165,7 @@ def _storage_options_from_fs(fs: Any) -> dict[str, Any]: if getattr(core, "project", None): out["project"] = core.project else: - raise AssertionError(f"Unhandled fsspec class {name!r} (out of sync with _PARQUET_FSSPEC_NAMES)") + raise AssertionError(f"Unhandled protocol family {family!r}") return out @@ -651,6 +667,7 @@ def _resolve_zarr_store( return LocalStore(path.path) if isinstance(path, zarr.Group): + _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) # if the input is a zarr.Group, 
wrap it with a store if isinstance(path.store, LocalStore): store_path = UPath(path.store.root) / path.path @@ -664,9 +681,20 @@ def _resolve_zarr_store( path=_join_fsspec_store_path(path.store.path, path.path), **kwargs, ) - if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): - # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store - return path.store.store + if _cms is not None and isinstance(path.store, _cms): + # Unwrap and apply the same async-fs + parquet guards as a direct FsspecStore on the group. + inner = path.store.store + if isinstance(inner, FsspecStore): + _check_fsspec_at_remote_store_open(inner.fs) + return FsspecStore( + fs=_ensure_async_fs(inner.fs), + path=_join_fsspec_store_path(inner.path, path.path), + **kwargs, + ) + if isinstance(inner, LocalStore): + store_path = UPath(inner.root) / path.path + return LocalStore(store_path.path) + return inner raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, _FsspecStoreRoot): # path-like from read_zarr that carries the same fs (preserves Azure/GCS credentials) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index be2e30796..90d784742 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -7,6 +7,7 @@ from dask.dataframe import DataFrame as DaskDataFrame from dask.dataframe import read_parquet from ome_zarr.format import Format +from upath import UPath from spatialdata._io._utils import ( _FsspecStoreRoot, @@ -26,9 +27,9 @@ def _read_points( - store: str | Path, + store: str | Path | UPath, ) -> DaskDataFrame: - """Read points from a zarr store.""" + """Read points from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" resolved_store = _resolve_zarr_store(store) f = zarr.open(resolved_store, mode="r") diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index cd521f51b..ccba50dae 100644 --- 
a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -13,6 +13,7 @@ from natsort import natsorted from ome_zarr.format import Format from shapely import from_ragged_array, to_ragged_array +from upath import UPath from spatialdata._io._utils import ( _FsspecStoreRoot, @@ -39,9 +40,9 @@ def _read_shapes( - store: str | Path, + store: str | Path | UPath, ) -> GeoDataFrame: - """Read shapes from a zarr store.""" + """Read shapes from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" resolved_store = _resolve_zarr_store(store) f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 03ec78526..11414fd66 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -8,6 +8,7 @@ from anndata import read_zarr as read_anndata_zarr from anndata._io.specs import write_elem as write_adata from ome_zarr.format import Format +from upath import UPath from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import ( @@ -20,7 +21,7 @@ from spatialdata.models import TableModel, get_table_keys -def _read_table(store: str | Path) -> AnnData: +def _read_table(store: str | Path | UPath) -> AnnData: resolved_store = _resolve_zarr_store(store) table = read_anndata_zarr(resolved_store) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 9bcc5af5e..62f87b6c2 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -1,8 +1,11 @@ -"""Minimal pytest config for IO tests. Creates buckets/containers when remote emulators are running. +"""Pytest hooks for ``tests/io/remote_storage/`` only (not loaded from repo-root ``tests/conftest.py``). -Assumes emulators are already running (e.g. Docker: - docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). 
+Creates buckets/containers when remote emulators are running. Assumes emulators are already up +(e.g. Docker: ``docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators``). Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. + +``pytest_configure`` here patches ``fsspec.asyn.sync`` and ``gcsfs`` session teardown for this subtree +only; the library package itself does not apply those patches globally. """ from __future__ import annotations diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py index c34f26eee..57a0a2257 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -9,6 +9,7 @@ import tempfile +import pytest from zarr.storage import FsspecStore, LocalStore, MemoryStore from spatialdata._io._utils import _FsspecStoreRoot, _resolve_zarr_store, _storage_options_from_fs @@ -39,12 +40,11 @@ def test_storage_options_from_fs_azure_account_key() -> None: """_storage_options_from_fs extracts Azure credentials for writing parquet to remote Azure Blob.""" class AzureBlobFileSystemMock: + protocol = "abfs" account_name = "dev" account_key = "key123" connection_string = None anon = None - - AzureBlobFileSystemMock.__name__ = "AzureBlobFileSystem" out = _storage_options_from_fs(AzureBlobFileSystemMock()) assert out["account_name"] == "dev" assert out["account_key"] == "key123" @@ -54,12 +54,36 @@ def test_storage_options_from_fs_gcs_endpoint() -> None: """_storage_options_from_fs extracts GCS endpoint and project for writing parquet to remote GCS.""" class GCSFileSystemMock: + protocol = "gs" token = "anon" _endpoint = "http://localhost:4443" project = "test" - - GCSFileSystemMock.__name__ = "GCSFileSystem" out = _storage_options_from_fs(GCSFileSystemMock()) assert out["token"] == "anon" assert out["endpoint_url"] == "http://localhost:4443" assert out["project"] == "test" + + +def 
test_storage_options_from_fs_s3_by_protocol_not_class_name() -> None: + """Subclasses / wrappers are accepted when ``protocol`` is s3 and attrs match s3fs-style credentials.""" + + class CustomS3Wrapper: + protocol = "s3" + endpoint_url = "http://127.0.0.1:9000" + key = "access" + secret = "secret" + anon = False + + out = _storage_options_from_fs(CustomS3Wrapper()) + assert out["endpoint_url"] == "http://127.0.0.1:9000" + assert out["key"] == "access" + assert out["secret"] == "secret" + assert out["anon"] is False + + +def test_storage_options_from_fs_requires_object_store_protocol() -> None: + class NoCloud: + protocol = "file" + + with pytest.raises(ValueError, match="Cannot derive parquet storage_options"): + _storage_options_from_fs(NoCloud()) From 175fbea8a7ef8937a86523dcf18fdbb67888c4d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 12:34:12 +0000 Subject: [PATCH 27/51] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/remote_storage/test_resolve_zarr_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py index 57a0a2257..d37e0aa35 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -45,6 +45,7 @@ class AzureBlobFileSystemMock: account_key = "key123" connection_string = None anon = None + out = _storage_options_from_fs(AzureBlobFileSystemMock()) assert out["account_name"] == "dev" assert out["account_key"] == "key123" @@ -58,6 +59,7 @@ class GCSFileSystemMock: token = "anon" _endpoint = "http://localhost:4443" project = "test" + out = _storage_options_from_fs(GCSFileSystemMock()) assert out["token"] == "anon" assert out["endpoint_url"] == "http://localhost:4443" From 3beed0e00aeaa6c90f14821508fdb55d7c122891 Mon Sep 17 00:00:00 2001 
From: SamirMoustafa Date: Wed, 15 Apr 2026 17:50:49 +0200 Subject: [PATCH 28/51] Refactor full_sdata fixture for consistency in remote I/O tests. --- tests/conftest.py | 12 +----------- tests/io/remote_storage/conftest.py | 4 ++++ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a9aa8ebaa..c97939129 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,5 @@ from __future__ import annotations -import os - -os.environ.setdefault("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false") - from collections.abc import Sequence from pathlib import Path from typing import Any @@ -93,18 +89,12 @@ def tables() -> list[AnnData]: @pytest.fixture() def full_sdata() -> SpatialData: - # Use two regions so the table categorical has two categories; otherwise anndata does not - # write the obs/region/codes/c/0 chunk (only codes/zarr.json), causing 404 on remote read. return SpatialData( images=_get_images(), labels=_get_labels(), shapes=_get_shapes(), points=_get_points(), - tables=_get_tables( - region=["labels2d", "poly"], - region_key="region", - instance_key="instance_id", - ), + tables=_get_tables(region="labels2d", region_key="region", instance_key="instance_id"), ) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 62f87b6c2..2ed93e9b6 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -11,12 +11,16 @@ from __future__ import annotations import os + +os.environ.setdefault("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false") + import socket import time import pytest + def _ensure_gcs_emulator_env() -> None: """Point google-cloud-storage / gcsfs defaults at fake-gcs-server (not production).""" raw = os.environ.get("STORAGE_EMULATOR_HOST", "").strip() From a7c51c23d84b4b009261c0aa4bb66a01cc3e7efd Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 17:58:18 +0200 Subject: [PATCH 29/51] rollback the unneeded changes for test cases 
within the core --- tests/core/operations/test_spatialdata_operations.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/core/operations/test_spatialdata_operations.py b/tests/core/operations/test_spatialdata_operations.py index a898bed0c..68b538e0a 100644 --- a/tests/core/operations/test_spatialdata_operations.py +++ b/tests/core/operations/test_spatialdata_operations.py @@ -559,15 +559,14 @@ def test_init_from_elements(full_sdata: SpatialData) -> None: def test_subset(full_sdata: SpatialData) -> None: - # Exclude labels and poly so the default table (annotating labels2d and poly) is not included - element_names = ["image2d", "points_0", "circles"] + element_names = ["image2d", "points_0", "circles", "poly"] subset0 = full_sdata.subset(element_names) unique_names = set() for _, k, _ in subset0.gen_spatial_elements(): unique_names.add(k) assert "image3d_xarray" in full_sdata.images assert unique_names == set(element_names) - # no table since neither labels2d nor poly are in the subset + # no table since the labels are not present in the subset assert "table" not in subset0.tables adata = AnnData( @@ -676,9 +675,7 @@ def test_transform_to_data_extent(full_sdata: SpatialData, maintain_positioning: def test_validate_table_in_spatialdata(full_sdata): table = full_sdata["table"] region, region_key, _ = get_table_keys(table) - # full_sdata uses two regions (labels2d, poly) so the table annotates both - expected = {"labels2d", "poly"} - assert set(region if isinstance(region, list) else [region]) == expected + assert region == "labels2d" full_sdata.validate_table_in_spatialdata(table) From 6443422cd91d8d332628f26640e325953e17e113 Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Wed, 15 Apr 2026 18:01:27 +0200 Subject: [PATCH 30/51] rollback the unneeded changes for test cases within the query --- tests/core/query/test_relational_query.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/core/query/test_relational_query.py 
b/tests/core/query/test_relational_query.py index c28725681..63e7a6f19 100644 --- a/tests/core/query/test_relational_query.py +++ b/tests/core/query/test_relational_query.py @@ -914,9 +914,6 @@ def test_filter_table_non_annotating(full_sdata): def test_labels_table_joins(full_sdata): - # Restrict table to labels2d only so the join returns one row per label (full_sdata default has two regions) - full_sdata["table"].obs["region"] = pd.Categorical(["labels2d"] * full_sdata["table"].n_obs) - full_sdata["table"].uns["spatialdata_attrs"]["region"] = "labels2d" element_dict, table = join_spatialelement_table( sdata=full_sdata, spatial_element_names="labels2d", From be230218371b870f341b4807aee8f4d4288989a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:01:44 +0000 Subject: [PATCH 31/51] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/remote_storage/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 2ed93e9b6..0a0b608b9 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -20,7 +20,6 @@ import pytest - def _ensure_gcs_emulator_env() -> None: """Point google-cloud-storage / gcsfs defaults at fake-gcs-server (not production).""" raw = os.environ.get("STORAGE_EMULATOR_HOST", "").strip() From 738e61111dc30011b8bc2d334b9a63593cbd0184 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Wed, 15 Apr 2026 18:24:41 +0200 Subject: [PATCH 32/51] init --- docs/tutorials/notebooks | 2 +- src/spatialdata/_core/spatialdata.py | 105 +++++++++++++-------------- src/spatialdata/_store.py | 73 +++++++++++++++++++ tests/io/test_store.py | 34 +++++++++ 4 files changed, 157 insertions(+), 57 deletions(-) create mode 100644 src/spatialdata/_store.py create mode 100644 tests/io/test_store.py diff --git 
a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 8774b0d92..9cf35b236 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 8774b0d927e1d5ad38aec8f545c7bf0591c77fe7 +Subproject commit 9cf35b236c4fdbce01a7c9e83f20256738b9a8fd diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 184421ed2..9b2c997dd 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -30,6 +30,7 @@ validate_table_attr_keys, ) from spatialdata._logging import logger +from spatialdata._store import ZarrStore, make_zarr_store, open_read_store, open_write_store from spatialdata._types import ArrayLike, Raster_T from spatialdata._utils import _deprecation_alias from spatialdata.models import ( @@ -122,6 +123,7 @@ def __init__( attrs: Mapping[Any, Any] | None = None, ) -> None: self._path: Path | UPath | None = None + self._zarr_store: ZarrStore | None = None self._shared_keys: set[str | None] = set() self._images: Images = Images(shared_keys=self._shared_keys) @@ -555,17 +557,27 @@ def path(self) -> Path | UPath | None: @path.setter def path(self, value: str | Path | UPath | None) -> None: if value is None: - self._path = None - elif isinstance(value, (Path, UPath)): - self._path = value - elif isinstance(value, str): - # Match ``write()`` / ``_validate_can_safely_write_to_path``: keep ``self._path`` as Path | UPath only. 
- if "://" in value: - self._path = UPath(value) - else: - self._path = Path(value) + self._set_zarr_store(None) else: - raise TypeError("Path must be `None`, a `str`, a `Path` or a `UPath` object.") + self._set_zarr_store(make_zarr_store(value)) + + def _set_zarr_store(self, zarr_store: ZarrStore | None) -> None: + self._zarr_store = zarr_store + self._path = None if zarr_store is None else zarr_store.path + + def _get_zarr_store(self) -> ZarrStore | None: + if self._zarr_store is not None: + return self._zarr_store + if self.path is None: + return None + self._zarr_store = make_zarr_store(self.path) + return self._zarr_store + + def _require_zarr_store(self) -> ZarrStore: + zarr_store = self._get_zarr_store() + if zarr_store is None: + raise ValueError("The SpatialData object is not backed by a Zarr store.") + return zarr_store def locate_element(self, element: SpatialElement) -> list[str]: """ @@ -990,13 +1002,7 @@ def elements_paths_on_disk(self) -> list[str]: ------- A list of paths of the elements saved in the Zarr store. 
""" - from spatialdata._io._utils import _resolve_zarr_store - - if self.path is None: - raise ValueError("The SpatialData object is not backed by a Zarr store.") - - store = _resolve_zarr_store(self.path) - root = zarr.open_group(store=store, mode="r") + zarr_store = self._require_zarr_store() elements_in_zarr = [] def find_groups(obj: zarr.Group, path: str) -> None: @@ -1005,13 +1011,14 @@ def find_groups(obj: zarr.Group, path: str) -> None: if isinstance(obj, zarr.Group) and path.count("/") == 1: elements_in_zarr.append(path) - for element_type in root: - if element_type in ["images", "labels", "points", "shapes", "tables"]: - for element_name in root[element_type]: - path = f"{element_type}/{element_name}" - elements_in_zarr.append(path) + with open_read_store(zarr_store) as store: + root = zarr.open_group(store=store, mode="r") + for element_type in root: + if element_type in ["images", "labels", "points", "shapes", "tables"]: + for element_name in root[element_type]: + path = f"{element_type}/{element_name}" + elements_in_zarr.append(path) # root.visit(lambda path: find_groups(root[path], path)) - store.close() return elements_in_zarr def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: @@ -1198,7 +1205,6 @@ def write( Whether to use the WKB or geoarrow encoding for GeoParquet. See :meth:`geopandas.GeoDataFrame.to_parquet` for details. If None, uses the value from :attr:`spatialdata.settings.shapes_geometry_encoding`. """ - from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import _parse_formats parsed = _parse_formats(sdata_formats) @@ -1207,20 +1213,15 @@ def write( if self.path is None: raise ValueError("file_path must be provided when SpatialData.path is not set.") file_path = self.path - # Hierarchical URIs (``scheme://…``) must become UPath: plain ``Path(str)`` breaks cloud URLs - # (S3-compatible stores, Azure ``abfs://`` / ``az://``, GCS ``gs://``, ``https://``, fsspec chains, etc.). 
- if isinstance(file_path, str) and "://" in file_path: - file_path = UPath(file_path) - elif isinstance(file_path, str): - file_path = Path(file_path) + zarr_store = make_zarr_store(file_path) + file_path = zarr_store.path self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() - store = _resolve_zarr_store(file_path) - zarr_format = parsed["SpatialData"].zarr_format - zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) - self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) - store.close() + with open_write_store(zarr_store) as store: + zarr_format = parsed["SpatialData"].zarr_format + zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) + self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) for element_type, element_name, element in self.gen_elements(): self._write_element( @@ -1234,7 +1235,7 @@ def write( ) if self.path != file_path and update_sdata_path: - self.path = file_path + self._set_zarr_store(zarr_store) if consolidate_metadata: self.write_consolidated_metadata() @@ -1471,13 +1472,12 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: "more elements in the SpatialData object. Deleting the data would corrupt the SpatialData object." 
) - from spatialdata._io._utils import _resolve_zarr_store + zarr_store = self._require_zarr_store() # delete the element - store = _resolve_zarr_store(self.path) - root = zarr.open_group(store=store, mode="r+", use_consolidated=False) - del root[element_type][element_name] - store.close() + with open_write_store(zarr_store) as store: + root = zarr.open_group(store=store, mode="r+", use_consolidated=False) + del root[element_type][element_name] if self.has_consolidated_metadata(): self.write_consolidated_metadata() @@ -1500,14 +1500,11 @@ def write_consolidated_metadata(self) -> None: _write_consolidated_metadata(self.path) def has_consolidated_metadata(self) -> bool: - from spatialdata._io._utils import _resolve_zarr_store - return_value = False - store = _resolve_zarr_store(self.path) - group = zarr.open_group(store, mode="r") - if getattr(group.metadata, "consolidated_metadata", None): - return_value = True - store.close() + with open_read_store(self._require_zarr_store()) as store: + group = zarr.open_group(store, mode="r") + if getattr(group.metadata, "consolidated_metadata", None): + return_value = True return return_value def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[str, SpatialElement | AnnData] | None: @@ -1693,18 +1690,17 @@ def write_attrs( sdata_format: SpatialDataContainerFormatType | None = None, zarr_group: zarr.Group | None = None, ) -> None: - from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import CurrentSpatialDataContainerFormat, SpatialDataContainerFormatType sdata_format = sdata_format if sdata_format is not None else CurrentSpatialDataContainerFormat() assert isinstance(sdata_format, SpatialDataContainerFormatType) - store = None - if zarr_group is None: assert self.is_backed(), "The SpatialData object must be backed by a Zarr store to write attrs." 
- store = _resolve_zarr_store(self.path) - zarr_group = zarr.open_group(store=store, mode="r+") + with open_write_store(self._require_zarr_store()) as store: + zarr_group = zarr.open_group(store=store, mode="r+") + self.write_attrs(sdata_format=sdata_format, zarr_group=zarr_group) + return version = sdata_format.spatialdata_format_version version_specific_attrs = sdata_format.attrs_to_dict() @@ -1715,9 +1711,6 @@ def write_attrs( except TypeError as e: raise TypeError("Invalid attribute in SpatialData.attrs") from e - if store is not None: - store.close() - def write_metadata( self, element_name: str | None = None, diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py new file mode 100644 index 000000000..d3c932123 --- /dev/null +++ b/src/spatialdata/_store.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from contextlib import contextmanager +from dataclasses import dataclass, field, replace +from pathlib import Path +from typing import Any + +import zarr +from upath import UPath + +PathLike = Path | UPath + + +def _normalize_path(path: str | PathLike) -> PathLike: + if isinstance(path, str): + return UPath(path) if "://" in path else Path(path) + if isinstance(path, (Path, UPath)): + return path + raise TypeError("Path must be `None`, a `str`, a `Path` or a `UPath` object.") + + +@dataclass(frozen=True) +class ZarrStore: + path: PathLike + storage_options: dict[str, Any] = field(default_factory=dict) + + def with_path(self, path: str | PathLike) -> ZarrStore: + return replace(self, path=_normalize_path(path)) + + +def make_zarr_store( + path: str | PathLike, + *, + storage_options: dict[str, Any] | None = None, +) -> ZarrStore: + return ZarrStore( + path=_normalize_path(path), + storage_options={} if storage_options is None else dict(storage_options), + ) + + +@contextmanager +def open_read_store(store: ZarrStore) -> Any: + from spatialdata._io._utils import _resolve_zarr_store + + resolved_store = _resolve_zarr_store(store.path, 
**store.storage_options) + try: + yield resolved_store + finally: + resolved_store.close() + + +@contextmanager +def open_write_store(store: ZarrStore) -> Any: + from spatialdata._io._utils import _resolve_zarr_store + + resolved_store = _resolve_zarr_store(store.path, **store.storage_options) + try: + yield resolved_store + finally: + resolved_store.close() + + +def open_group_from_store( + store: zarr.storage.StoreLike, + *, + mode: str, + use_consolidated: bool | None = None, +) -> zarr.Group: + kwargs: dict[str, Any] = {"store": store, "mode": mode} + if use_consolidated is not None: + kwargs["use_consolidated"] = use_consolidated + return zarr.open_group(**kwargs) diff --git a/tests/io/test_store.py b/tests/io/test_store.py new file mode 100644 index 000000000..bd44c77f8 --- /dev/null +++ b/tests/io/test_store.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from pathlib import Path + +import zarr +from upath import UPath + +from spatialdata._store import ( + make_zarr_store, + open_read_store, + open_write_store, +) + + +def test_make_zarr_store_normalizes_local_and_remote_paths( + tmp_path: Path, +) -> None: + local_store = make_zarr_store(str(tmp_path / "store.zarr")) + assert isinstance(local_store.path, Path) + + remote_store = make_zarr_store("s3://bucket/store.zarr") + assert isinstance(remote_store.path, UPath) + + +def test_open_read_and_write_store_roundtrip(tmp_path: Path) -> None: + zarr_store = make_zarr_store(tmp_path / "store.zarr") + + with open_write_store(zarr_store) as store: + group = zarr.create_group(store=store, overwrite=True) + group.attrs["answer"] = 42 + + with open_read_store(zarr_store) as store: + group = zarr.open_group(store=store, mode="r") + assert group.attrs["answer"] == 42 From 53c45eefae006f255737d568962e81ab6bc2ae4d Mon Sep 17 00:00:00 2001 From: SamirMoustafa Date: Thu, 16 Apr 2026 10:56:55 +0200 Subject: [PATCH 33/51] Adding a dedicated job for remote storage tests, updating coverage upload configurations, and 
refining test execution conditions for different operating systems. --- .github/workflows/test.yaml | 61 +++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index df6637ea9..5849c4dfa 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -53,18 +53,52 @@ jobs: fi fi uv sync --group=test - # Start storage emulators (S3, Azure, GCS) only on Linux; Docker service containers are not available on - # Windows/macOS runners, so tests/io/remote_storage/ is skipped there (see Test step). Remote I/O is still - # exercised on every PR via the Ubuntu matrix jobs. + - name: Test + env: + MPLBACKEND: agg + PLATFORM: ${{ matrix.os }} + DISPLAY: :42 + run: | + uv run pytest --cov --color=yes --cov-report=xml --ignore=tests/io/remote_storage/ + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5 + with: + name: coverage + verbose: true + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + + test-remote-storage: + runs-on: ubuntu-latest + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + python: ["3.11", "3.13"] + env: + MPLBACKEND: agg + PLATFORM: ubuntu-latest + DISPLAY: :42 + GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT: "false" + steps: + - uses: actions/checkout@v6 + - uses: astral-sh/setup-uv@v7 + with: + version: "latest" + python-version: ${{ matrix.python }} + - name: Install dependencies + run: | + uv add dask + uv sync --group=test - name: Build and start storage emulators - if: matrix.os == 'ubuntu-latest' run: | docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . docker run --rm -d --name spatialdata-emulators \ -p 5000:5000 -p 10000:10000 -p 4443:4443 \ spatialdata-emulators - name: Wait for emulator ports - if: matrix.os == 'ubuntu-latest' run: | echo "Waiting for S3 (5000), Azure (10000), GCS (4443)..." 
python3 -c " @@ -80,24 +114,13 @@ jobs: else: raise SystemExit('Emulators did not become ready.') " - # On Linux, emulators run above so full suite (incl. tests/io/remote_storage/) runs. On Windows/macOS, skip remote_storage. - - name: Test - env: - MPLBACKEND: agg - PLATFORM: ${{ matrix.os }} - DISPLAY: :42 - # gcsfs otherwise defaults to ExtendedGcsFileSystem (prod Storage Control gRPC; breaks fake-gcs-server). - GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT: "false" + - name: Test remote storage run: | - if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then - uv run pytest --cov --color=yes --cov-report=xml - else - uv run pytest --cov --color=yes --cov-report=xml --ignore=tests/io/remote_storage/ - fi + uv run pytest tests/io/remote_storage/ --cov --color=yes --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: - name: coverage + name: coverage-remote-storage-${{ matrix.python }} verbose: true env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} From 341b8faeb49f4314ccfc00e25a302bf5f36689e4 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Thu, 16 Apr 2026 12:12:04 +0200 Subject: [PATCH 34/51] add arrow filesystem --- src/spatialdata/_io/_utils.py | 38 ----- src/spatialdata/_io/io_points.py | 35 ++-- src/spatialdata/_io/io_raster.py | 7 +- src/spatialdata/_io/io_shapes.py | 94 ++--------- src/spatialdata/_io/io_table.py | 6 +- src/spatialdata/_io/io_zarr.py | 151 ++++++++---------- src/spatialdata/_store.py | 70 +++++--- .../io/remote_storage/test_remote_storage.py | 17 ++ .../remote_storage/test_resolve_zarr_store.py | 20 ++- tests/io/test_store.py | 18 +++ 10 files changed, 199 insertions(+), 257 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 9fc247c69..c022e2701 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -49,31 +49,6 @@ def _join_fsspec_store_path(store_path: str, relative_path: str) -> str: return f"{base}/{rel}" if rel else base -class 
_FsspecStoreRoot: - """Path-like root for FsspecStore (no .root attribute); supports __truediv__ and str() as full URL.""" - - __slots__ = ("_store", "_path") - - def __init__(self, store: FsspecStore, path: str | None = None) -> None: - self._store = store - raw = path or store.path - self._path = str(raw).replace("\\", "/").rstrip("/") - - def __truediv__(self, other: str | Path) -> _FsspecStoreRoot: - return _FsspecStoreRoot(self._store, _join_fsspec_store_path(self._path, str(other))) - - def __str__(self) -> str: - protocol = getattr(self._store.fs, "protocol", None) - if isinstance(protocol, (list, tuple)): - protocol = protocol[0] if protocol else "file" - elif protocol is None: - protocol = "file" - return f"{protocol}://{self._path}" - - def __fspath__(self) -> str: - return str(self) - - _CLOUD_OBJECT_STORE_PROTOCOLS: frozenset[str] = frozenset({"abfs", "adl", "az", "gcs", "gs", "s3", "s3a"}) @@ -170,15 +145,6 @@ def _storage_options_from_fs(fs: Any) -> dict[str, Any]: return out -def _get_store_root(store: LocalStore | FsspecStore) -> Path | _FsspecStoreRoot: - """Return a path-like root for the store (supports / and str()). 
Use for building paths to parquet etc.""" - if isinstance(store, LocalStore): - return Path(store.root) - if isinstance(store, FsspecStore): - return _FsspecStoreRoot(store) - raise TypeError(f"Unsupported store type: {type(store)}") - - def _get_transformations_from_ngff_dict( list_of_encoded_ngff_transformations: list[dict[str, Any]], ) -> MappingToCoordinateSystem_t: @@ -696,10 +662,6 @@ def _resolve_zarr_store( return LocalStore(store_path.path) return inner raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") - if isinstance(path, _FsspecStoreRoot): - # path-like from read_zarr that carries the same fs (preserves Azure/GCS credentials) - _check_fsspec_at_remote_store_open(path._store.fs) - return FsspecStore(_ensure_async_fs(path._store.fs), path=path._path, **kwargs) if isinstance(path, UPath): # if input is a remote UPath, map it to an FSStore (check before StoreLike to avoid UnionType isinstance) _check_fsspec_at_remote_store_open(path.fs) diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 90d784742..a774d8be4 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -1,7 +1,6 @@ from __future__ import annotations from pathlib import Path -from typing import Any import zarr from dask.dataframe import DataFrame as DaskDataFrame @@ -10,15 +9,13 @@ from upath import UPath from spatialdata._io._utils import ( - _FsspecStoreRoot, - _get_store_root, _get_transformations_from_ngff_dict, _resolve_zarr_store, - _storage_options_from_fs, _write_metadata, overwrite_coordinate_transformations_non_raster, ) from spatialdata._io.format import CurrentPointsFormat, PointsFormats, _parse_version +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group from spatialdata.models import get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -27,26 +24,27 @@ def _read_points( - store: str | Path | UPath, + store: str | Path | 
UPath | ZarrStore, ) -> DaskDataFrame: """Read points from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" - resolved_store = _resolve_zarr_store(store) + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None points_format = PointsFormats[version] - store_root = _get_store_root(f.store_path.store) - path = store_root / f.path / "points.parquet" + parquet_store = zarr_store.child("points.parquet") # cache on remote file needed for parquet reader to work # TODO: allow reading in the metadata without caching all the data - if isinstance(path, _FsspecStoreRoot): - opts = _storage_options_from_fs(path._store.fs) - points = read_parquet(str(path), storage_options=opts if opts else {}) - else: - points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) + points = read_parquet( + parquet_store.arrow_path(), + filesystem=parquet_store.arrow_filesystem(), + ) assert isinstance(points, DaskDataFrame) + if points.index.name == "__null_dask_index__": + points = points.rename_axis(None) transformations = _get_transformations_from_ngff_dict(f.attrs.asdict()["coordinateTransformations"]) _set_transformations(points, transformations) @@ -79,8 +77,7 @@ def write_points( axes = get_axes_names(points) transformations = _get_transformations(points) - store_root = _get_store_root(group.store_path.store) - path = store_root / group.path / "points.parquet" + parquet_store = make_zarr_store_from_group(group).child("points.parquet") # The following code iterates through all columns in the 'points' DataFrame. If the column's datatype is # 'category', it checks whether the categories of this column are known. 
If not, it explicitly converts the @@ -95,10 +92,10 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - storage_options: dict[str, Any] = {} - if isinstance(path, _FsspecStoreRoot): - storage_options = _storage_options_from_fs(path._store.fs) - points_without_transform.to_parquet(str(path), storage_options=storage_options or None) + points_without_transform.to_parquet( + parquet_store.arrow_path(), + filesystem=parquet_store.arrow_filesystem(), + ) attrs = element_format.attrs_to_dict(points.attrs) attrs["version"] = element_format.spatialdata_format_version diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index e6a188dda..7eaf04d57 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -16,6 +16,7 @@ from ome_zarr.writer import write_labels as write_labels_ngff from ome_zarr.writer import write_multiscale as write_multiscale_ngff from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff +from upath import UPath from xarray import DataArray, DataTree from spatialdata._io._utils import ( @@ -28,6 +29,7 @@ RasterFormatType, get_ome_zarr_format, ) +from spatialdata._store import ZarrStore, make_zarr_store from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names from spatialdata.models.models import ATTRS_KEY @@ -161,10 +163,11 @@ def _prepare_storage_options( def _read_multiscale( - store: str | Path, raster_type: Literal["image", "labels"], reader_format: Format + store: str | Path | UPath | ZarrStore, raster_type: Literal["image", "labels"], reader_format: Format ) -> DataArray | DataTree: assert raster_type in ["image", "labels"] - resolved_store = _resolve_zarr_store(store) + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) nodes: list[Node] = [] image_loc = 
ZarrLocation(resolved_store, fmt=reader_format) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index ccba50dae..7344cd90e 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -1,9 +1,5 @@ from __future__ import annotations -import contextlib -import json -import os -import tempfile from pathlib import Path from typing import Any, Literal @@ -16,11 +12,8 @@ from upath import UPath from spatialdata._io._utils import ( - _FsspecStoreRoot, - _get_store_root, _get_transformations_from_ngff_dict, _resolve_zarr_store, - _storage_options_from_fs, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -32,6 +25,7 @@ ShapesFormatV03, _parse_version, ) +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group from spatialdata.models import ShapesModel, get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -40,10 +34,11 @@ def _read_shapes( - store: str | Path | UPath, + store: str | Path | UPath | ZarrStore, ) -> GeoDataFrame: """Read shapes from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" - resolved_store = _resolve_zarr_store(store) + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None @@ -64,13 +59,9 @@ def _read_shapes( geometry = from_ragged_array(typ, coords, offsets) geo_df = GeoDataFrame({"geometry": geometry}, index=index) elif isinstance(shape_format, ShapesFormatV02 | ShapesFormatV03): - store_root = _get_store_root(f.store_path.store) - path = store_root / f.path / "shapes.parquet" - if isinstance(path, _FsspecStoreRoot): - opts = _storage_options_from_fs(path._store.fs) - geo_df = read_parquet(str(path), storage_options=opts if opts else {}) - else: - geo_df = read_parquet(path) + parquet_store 
= zarr_store.child("shapes.parquet") + with parquet_store.arrow_filesystem().open_input_file(parquet_store.arrow_path()) as src: + geo_df = read_parquet(src) else: raise ValueError( f"Unsupported shapes format {shape_format} from version {version}. Please update the spatialdata library." @@ -163,61 +154,6 @@ def _write_shapes_v01(shapes: GeoDataFrame, group: zarr.Group, element_format: F attrs["version"] = element_format.spatialdata_format_version return attrs - -def _parse_fsspec_remote_path(path: _FsspecStoreRoot) -> tuple[str, str]: - """Return (bucket_or_container, blob_key) from an fsspec store path.""" - remote = str(path) - if "://" in remote: - remote = remote.split("://", 1)[1] - parts = remote.split("/", 1) - bucket_or_container = parts[0] - blob_key = parts[1] if len(parts) > 1 else "" - return bucket_or_container, blob_key - - -def _upload_parquet_to_azure(tmp_path: str, bucket: str, key: str, fs: Any) -> None: - from azure.storage.blob import BlobServiceClient - - client = BlobServiceClient.from_connection_string(fs.connection_string) - blob_client = client.get_blob_client(container=bucket, blob=key) - with open(tmp_path, "rb") as f: - blob_client.upload_blob(f, overwrite=True) - - -def _upload_parquet_to_s3(tmp_path: str, bucket: str, key: str, fs: Any) -> None: - import boto3 - - endpoint = getattr(fs, "endpoint_url", None) or os.environ.get("AWS_ENDPOINT_URL") - s3 = boto3.client( - "s3", - endpoint_url=endpoint, - aws_access_key_id=getattr(fs, "key", None) or os.environ.get("AWS_ACCESS_KEY_ID"), - aws_secret_access_key=getattr(fs, "secret", None) or os.environ.get("AWS_SECRET_ACCESS_KEY"), - region_name=os.environ.get("AWS_DEFAULT_REGION", "us-east-1"), - ) - s3.upload_file(tmp_path, bucket, key) - - -def _upload_parquet_to_fsspec(path: _FsspecStoreRoot, tmp_path: str) -> None: - """Upload local parquet file to remote fsspec store using sync APIs to avoid event-loop issues.""" - fs = path._store.fs - bucket, key = _parse_fsspec_remote_path(path) - 
fs_name = type(fs).__name__ - if fs_name == "AzureBlobFileSystem" and getattr(fs, "connection_string", None): - _upload_parquet_to_azure(tmp_path, bucket, key, fs) - elif fs_name in ("S3FileSystem", "MotoS3FS"): - _upload_parquet_to_s3(tmp_path, bucket, key, fs) - elif fs_name == "GCSFileSystem": - import fsspec - - fs_dict = json.loads(fs.to_json()) - fs_dict["asynchronous"] = False - sync_fs = fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) - sync_fs.put_file(tmp_path, path._path) - else: - fs.put(tmp_path, str(path)) - - def _write_shapes_v02_v03( shapes: GeoDataFrame, group: zarr.Group, element_format: Format, geometry_encoding: Literal["WKB", "geoarrow"] ) -> Any: @@ -237,23 +173,13 @@ def _write_shapes_v02_v03( """ from spatialdata.models._utils import TRANSFORM_KEY - store_root = _get_store_root(group.store_path.store) - path = store_root / group.path / "shapes.parquet" + parquet_store = make_zarr_store_from_group(group).child("shapes.parquet") # Temporarily remove transformations from attrs to avoid serialization issues transforms = shapes.attrs[TRANSFORM_KEY] del shapes.attrs[TRANSFORM_KEY] - if isinstance(path, _FsspecStoreRoot): - with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: - tmp_path = tmp.name - try: - shapes.to_parquet(tmp_path, geometry_encoding=geometry_encoding) - _upload_parquet_to_fsspec(path, tmp_path) - finally: - with contextlib.suppress(OSError): - os.unlink(tmp_path) - else: - shapes.to_parquet(path, geometry_encoding=geometry_encoding) + with parquet_store.arrow_filesystem().open_output_stream(parquet_store.arrow_path()) as sink: + shapes.to_parquet(sink, geometry_encoding=geometry_encoding) shapes.attrs[TRANSFORM_KEY] = transforms attrs = element_format.attrs_to_dict(shapes.attrs) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 11414fd66..a37e62a4e 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -18,11 +18,13 @@ 
TablesFormatV02, _parse_version, ) +from spatialdata._store import ZarrStore, make_zarr_store from spatialdata.models import TableModel, get_table_keys -def _read_table(store: str | Path | UPath) -> AnnData: - resolved_store = _resolve_zarr_store(store) +def _read_table(store: str | Path | UPath | ZarrStore) -> AnnData: + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) table = read_anndata_zarr(resolved_store) f = zarr.open(resolved_store, mode="r") diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index f3506beed..456374804 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -14,13 +14,10 @@ from pyarrow import ArrowInvalid from upath import UPath from zarr.errors import ArrayNotFoundError -from zarr.storage import FsspecStore, LocalStore from spatialdata._core.spatialdata import SpatialData from spatialdata._io._utils import ( BadFileHandleMethod, - _FsspecStoreRoot, - _get_store_root, _resolve_zarr_store, handle_read_errors, ) @@ -29,12 +26,13 @@ from spatialdata._io.io_shapes import _read_shapes from spatialdata._io.io_table import _read_table from spatialdata._logging import logger +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_read_store from spatialdata._types import Raster_T def _read_zarr_group_spatialdata_element( root_group: zarr.Group, - root_store_path: Path | _FsspecStoreRoot, + root_store: ZarrStore, sdata_version: Literal["0.1", "0.2"], selector: set[str], read_func: Callable[..., Any], @@ -56,7 +54,7 @@ def _read_zarr_group_spatialdata_element( # skip hidden files like .zgroup or .zmetadata continue elem_group = group[subgroup_name] - elem_group_path = root_store_path / elem_group.path + elem_store = root_store.child(elem_group.path) with handle_read_errors( on_bad_files, location=f"{group.path}/{subgroup_name}", @@ -72,12 +70,12 @@ def 
_read_zarr_group_spatialdata_element( if element_type in ["image", "labels"]: reader_format = get_raster_format_for_read(elem_group, sdata_version) element = read_func( - elem_group_path, + elem_store, cast(Literal["image", "labels"], element_type), reader_format, ) elif element_type in ["shapes", "points", "tables"]: - element = read_func(elem_group_path) + element = read_func(elem_store) else: raise ValueError(f"Unknown element type {element_type}") element_container[subgroup_name] = element @@ -155,24 +153,7 @@ def read_zarr( ------- A SpatialData object. """ - from spatialdata._io._utils import _resolve_zarr_store - - resolved_store = _resolve_zarr_store(store) - root_group = zarr.open_group(resolved_store, mode="r") - # the following is the SpatialDataContainerFormat version - if "spatialdata_attrs" not in root_group.metadata.attributes: - # backward compatibility for pre-versioned SpatialData zarr stores - sdata_version: Literal["0.1", "0.2"] = "0.1" - else: - sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] - if sdata_version == "0.1": - warnings.warn( - "SpatialData is not stored in the most current format. If you want to use Zarr v3" - ", please write the store to a new location using `sdata.write()`.", - UserWarning, - stacklevel=2, - ) - root_store_path = _get_store_root(root_group.store) + zarr_store = make_zarr_store_from_group(store) if isinstance(store, zarr.Group) else make_zarr_store(store) images: dict[str, Raster_T] = {} labels: dict[str, Raster_T] = {} @@ -180,50 +161,66 @@ def read_zarr( shapes: dict[str, GeoDataFrame] = {} tables: dict[str, AnnData] = {} - selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or []) - logger.debug(f"Reading selection {selector}") - - # we could make this more readable. 
One can get lost when looking at this dict and iteration over the items - group_readers: dict[ - Literal["images", "labels", "shapes", "points", "tables"], - tuple[ - Callable[..., Any], - Literal["image", "labels", "shapes", "points", "tables"], - dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData], - ], - ] = { - # ome-zarr-py needs a kwargs that has "image" has key. So here we have "image" and not "images" - "images": (_read_multiscale, "image", images), - "labels": (_read_multiscale, "labels", labels), - "points": (_read_points, "points", points), - "shapes": (_read_shapes, "shapes", shapes), - "tables": (_read_table, "tables", tables), - } - for group_name, ( - read_func, - element_type, - element_container, - ) in group_readers.items(): - _read_zarr_group_spatialdata_element( - root_group=root_group, - root_store_path=root_store_path, - sdata_version=sdata_version, - selector=selector, - read_func=read_func, - group_name=group_name, - element_type=element_type, - element_container=element_container, - on_bad_files=on_bad_files, - ) - - # read attrs metadata - attrs = root_group.attrs.asdict() - if "spatialdata_attrs" in attrs: - # when refactoring the read_zarr function into reading componenets separately (and according to the version), - # we can move the code below (.pop()) into attrs_from_dict() - attrs.pop("spatialdata_attrs") - else: - attrs = None + with open_read_store(zarr_store) as resolved_store: + root_group = zarr.open_group(resolved_store, mode="r") + # the following is the SpatialDataContainerFormat version + if "spatialdata_attrs" not in root_group.metadata.attributes: + # backward compatibility for pre-versioned SpatialData zarr stores + sdata_version: Literal["0.1", "0.2"] = "0.1" + else: + sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] + if sdata_version == "0.1": + warnings.warn( + "SpatialData is not stored in the most current format. 
If you want to use Zarr v3" + ", please write the store to a new location using `sdata.write()`.", + UserWarning, + stacklevel=2, + ) + + selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or []) + logger.debug(f"Reading selection {selector}") + + # we could make this more readable. One can get lost when looking at this dict and iteration over the items + group_readers: dict[ + Literal["images", "labels", "shapes", "points", "tables"], + tuple[ + Callable[..., Any], + Literal["image", "labels", "shapes", "points", "tables"], + dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData], + ], + ] = { + # ome-zarr-py needs a kwargs that has "image" has key. So here we have "image" and not "images" + "images": (_read_multiscale, "image", images), + "labels": (_read_multiscale, "labels", labels), + "points": (_read_points, "points", points), + "shapes": (_read_shapes, "shapes", shapes), + "tables": (_read_table, "tables", tables), + } + for group_name, ( + read_func, + element_type, + element_container, + ) in group_readers.items(): + _read_zarr_group_spatialdata_element( + root_group=root_group, + root_store=zarr_store, + sdata_version=sdata_version, + selector=selector, + read_func=read_func, + group_name=group_name, + element_type=element_type, + element_container=element_container, + on_bad_files=on_bad_files, + ) + + # read attrs metadata + attrs = root_group.attrs.asdict() + if "spatialdata_attrs" in attrs: + # when refactoring the read_zarr function into reading componenets separately (and according to the version), + # we can move the code below (.pop()) into attrs_from_dict() + attrs.pop("spatialdata_attrs") + else: + attrs = None sdata = SpatialData( images=images, @@ -233,21 +230,7 @@ def read_zarr( tables=tables, attrs=attrs, ) - if isinstance(store, UPath): - sdata.path = store - elif isinstance(store, str): - sdata.path = UPath(store) if "://" in store else Path(store) - elif 
isinstance(store, Path): - sdata.path = store - elif isinstance(store, zarr.Group): - if isinstance(resolved_store, LocalStore): - sdata.path = Path(resolved_store.root) - elif isinstance(resolved_store, FsspecStore): - sdata.path = UPath(str(_FsspecStoreRoot(resolved_store))) - else: - sdata.path = None - else: - sdata.path = None + sdata._set_zarr_store(zarr_store) return sdata diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index d3c932123..d7bd2544f 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -1,19 +1,21 @@ from __future__ import annotations from contextlib import contextmanager -from dataclasses import dataclass, field, replace +from dataclasses import dataclass, replace from pathlib import Path from typing import Any +import pyarrow.fs as pafs import zarr from upath import UPath +from zarr.storage import FsspecStore, LocalStore PathLike = Path | UPath -def _normalize_path(path: str | PathLike) -> PathLike: +def _normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: if isinstance(path, str): - return UPath(path) if "://" in path else Path(path) + return UPath(path, **(storage_options or {})) if "://" in path else Path(path) if isinstance(path, (Path, UPath)): return path raise TypeError("Path must be `None`, a `str`, a `Path` or a `UPath` object.") @@ -22,28 +24,66 @@ def _normalize_path(path: str | PathLike) -> PathLike: @dataclass(frozen=True) class ZarrStore: path: PathLike - storage_options: dict[str, Any] = field(default_factory=dict) def with_path(self, path: str | PathLike) -> ZarrStore: return replace(self, path=_normalize_path(path)) + def child(self, path: str | PathLike) -> ZarrStore: + return self.with_path(self.path / path) + + def arrow_path(self) -> str: + return self.path.path if isinstance(self.path, UPath) else str(self.path) + + def arrow_filesystem(self) -> pafs.FileSystem: + if isinstance(self.path, UPath): + return 
pafs.PyFileSystem(pafs.FSSpecHandler(self.path.fs)) + return pafs.LocalFileSystem() + def make_zarr_store( path: str | PathLike, *, storage_options: dict[str, Any] | None = None, ) -> ZarrStore: - return ZarrStore( - path=_normalize_path(path), - storage_options={} if storage_options is None else dict(storage_options), + return ZarrStore(path=_normalize_path(path, storage_options)) + + +def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: + from spatialdata._io._utils import ( + _check_fsspec_at_remote_store_open, + _join_fsspec_store_path, + _storage_options_from_fs, ) + store = group.store + _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) + if _cms is not None and isinstance(store, _cms): + store = store.store + + if isinstance(store, LocalStore): + return make_zarr_store(Path(store.root) / group.path) + if isinstance(store, FsspecStore): + _check_fsspec_at_remote_store_open(store.fs) + protocol = getattr(store.fs, "protocol", None) + if isinstance(protocol, (list, tuple)): + protocol = protocol[0] if protocol else "file" + elif protocol is None: + protocol = "file" + storage_options: dict[str, Any] + try: + storage_options = _storage_options_from_fs(store.fs) + except ValueError: + storage_options = {} + path = _join_fsspec_store_path(store.path, group.path) + return make_zarr_store(f"{protocol}://{path}", storage_options=storage_options) + raise ValueError(f"Unsupported store type or zarr.Group: {type(group.store)}") + @contextmanager def open_read_store(store: ZarrStore) -> Any: from spatialdata._io._utils import _resolve_zarr_store - resolved_store = _resolve_zarr_store(store.path, **store.storage_options) + resolved_store = _resolve_zarr_store(store.path) try: yield resolved_store finally: @@ -54,20 +94,8 @@ def open_read_store(store: ZarrStore) -> Any: def open_write_store(store: ZarrStore) -> Any: from spatialdata._io._utils import _resolve_zarr_store - resolved_store = _resolve_zarr_store(store.path, **store.storage_options) + 
resolved_store = _resolve_zarr_store(store.path) try: yield resolved_store finally: resolved_store.close() - - -def open_group_from_store( - store: zarr.storage.StoreLike, - *, - mode: str, - use_consolidated: bool | None = None, -) -> zarr.Group: - kwargs: dict[str, Any] = {"store": store, "mode": mode} - if use_consolidated is not None: - kwargs["use_consolidated"] = use_consolidated - return zarr.open_group(**kwargs) diff --git a/tests/io/remote_storage/test_remote_storage.py b/tests/io/remote_storage/test_remote_storage.py index 065211910..23906b9ec 100644 --- a/tests/io/remote_storage/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -13,9 +13,11 @@ import uuid import pytest +import zarr from upath import UPath from spatialdata import SpatialData +from spatialdata._store import make_zarr_store, open_read_store from spatialdata.testing import assert_spatial_data_objects_are_identical # Azure emulator connection string (Azurite default). @@ -195,3 +197,18 @@ def test_write_element_to_remote_storage(self, full_sdata: SpatialData, get_upat for _element_type, element_name, _ in full_sdata.gen_elements(): full_sdata.write_element(element_name, overwrite=True) _assert_read_identical(full_sdata, upath, check_path=False) + + @REMOTE_STORAGE_PARAMS + def test_read_from_remote_zarr_group_keeps_backing_for_followup_write( + self, full_sdata: SpatialData, get_upath, storage_name: str + ) -> None: + upath = get_upath(container=f"test-{storage_name}", path=f"read-group-{uuid.uuid4().hex}.zarr") + full_sdata.write(upath, overwrite=True) + + with open_read_store(make_zarr_store(upath)) as store: + group = zarr.open_group(store=store, mode="r") + sdata_from_group = SpatialData.read(group) + + assert isinstance(sdata_from_group.path, UPath) + sdata_from_group.write(overwrite=True) + _assert_read_identical(full_sdata, upath, check_path=False) diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py 
b/tests/io/remote_storage/test_resolve_zarr_store.py index d37e0aa35..d3c03d789 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -1,7 +1,7 @@ """Unit tests for remote-storage-specific store resolution and credential handling. Covers only code paths used when reading/writing from remote backends (Azure, S3, GCS): -- _FsspecStoreRoot resolution (used when reading elements from a remote zarr store). +- zarr.Group to ZarrStore normalization for remote-backed groups. - _storage_options_from_fs for Azure and GCS (used when writing parquet to remote). """ @@ -10,9 +10,11 @@ import tempfile import pytest +import zarr from zarr.storage import FsspecStore, LocalStore, MemoryStore -from spatialdata._io._utils import _FsspecStoreRoot, _resolve_zarr_store, _storage_options_from_fs +from spatialdata._io._utils import _resolve_zarr_store, _storage_options_from_fs +from spatialdata._store import make_zarr_store_from_group, open_read_store def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: @@ -23,17 +25,21 @@ def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: assert _resolve_zarr_store(loc) is loc -def test_resolve_zarr_store_fsspec_store_root() -> None: - """_FsspecStoreRoot is resolved to FsspecStore when reading from remote (e.g. 
points/shapes paths).""" +def test_make_zarr_store_from_remote_group() -> None: + """Remote zarr.Group inputs keep a usable UPath plus storage options for follow-up reads.""" import fsspec from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper fs = fsspec.filesystem("memory") async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) base = FsspecStore(async_fs, path="/") - root = _FsspecStoreRoot(base, "/") - store = _resolve_zarr_store(root) - assert isinstance(store, FsspecStore) + root = zarr.open_group(store=base, mode="a") + group = root.require_group("points").require_group("points") + + zarr_store = make_zarr_store_from_group(group) + + with open_read_store(zarr_store) as store: + assert isinstance(store, FsspecStore) def test_storage_options_from_fs_azure_account_key() -> None: diff --git a/tests/io/test_store.py b/tests/io/test_store.py index bd44c77f8..000721d5b 100644 --- a/tests/io/test_store.py +++ b/tests/io/test_store.py @@ -7,6 +7,7 @@ from spatialdata._store import ( make_zarr_store, + make_zarr_store_from_group, open_read_store, open_write_store, ) @@ -22,6 +23,12 @@ def test_make_zarr_store_normalizes_local_and_remote_paths( assert isinstance(remote_store.path, UPath) +def test_make_zarr_store_applies_storage_options_to_remote_strings() -> None: + zarr_store = make_zarr_store("s3://bucket/store.zarr", storage_options={"anon": True}) + assert isinstance(zarr_store.path, UPath) + assert getattr(zarr_store.path.fs, "anon", None) is True + + def test_open_read_and_write_store_roundtrip(tmp_path: Path) -> None: zarr_store = make_zarr_store(tmp_path / "store.zarr") @@ -32,3 +39,14 @@ def test_open_read_and_write_store_roundtrip(tmp_path: Path) -> None: with open_read_store(zarr_store) as store: group = zarr.open_group(store=store, mode="r") assert group.attrs["answer"] == 42 + + +def test_make_zarr_store_from_local_group(tmp_path: Path) -> None: + zarr_store = make_zarr_store(tmp_path / "store.zarr") + + with open_write_store(zarr_store) 
as store: + root = zarr.create_group(store=store, overwrite=True) + group = root.require_group("images").require_group("image") + + child_store = make_zarr_store_from_group(group) + assert child_store.path == tmp_path / "store.zarr" / "images" / "image" From 389d8ec1c5d2a5cc2adc7cf09363521a263d63df Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Thu, 16 Apr 2026 13:50:27 +0200 Subject: [PATCH 35/51] no provider specific stuff --- src/spatialdata/_core/spatialdata.py | 11 +- src/spatialdata/_io/_utils.py | 120 +----------------- src/spatialdata/_store.py | 11 +- tests/io/remote_storage/conftest.py | 6 +- .../io/remote_storage/test_remote_storage.py | 30 ++--- .../remote_storage/test_resolve_zarr_store.py | 66 +--------- 6 files changed, 29 insertions(+), 215 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 9b2c997dd..ba973c590 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1056,12 +1056,12 @@ def _validate_can_safely_write_to_path( For :class:`upath.UPath`, only "store exists vs ``overwrite``" is checked. Local Dask-backing and subfolder checks are omitted because backing paths are filesystem-local and are not - compared to object-store keys; ``overwrite=True`` on remote URLs must be chosen carefully. + compared to object-store keys. Conflicts for remote targets are deferred to the backend/Zarr + write path; ``overwrite=True`` on remote URLs must be chosen carefully. 
""" from spatialdata._io._utils import ( _backed_elements_contained_in_path, _is_subfolder, - _remote_zarr_store_exists, _resolve_zarr_store, ) @@ -1076,13 +1076,6 @@ def _validate_can_safely_write_to_path( raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") if isinstance(file_path, UPath): - store = _resolve_zarr_store(file_path) - if _remote_zarr_store_exists(store) and not overwrite: - raise ValueError( - "The Zarr store already exists. Use `overwrite=True` to try overwriting the store. " - "Please note that only Zarr stores not currently in use by the current SpatialData object can be " - "overwritten." - ) return # Local Path: existing logic diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index c022e2701..72268fa66 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -24,7 +24,6 @@ from upath import UPath from upath.implementations.local import PosixUPath, WindowsUPath from xarray import DataArray, DataTree -from zarr.errors import GroupNotFoundError from zarr.storage import FsspecStore, LocalStore from spatialdata._core.spatialdata import SpatialData @@ -49,9 +48,6 @@ def _join_fsspec_store_path(store_path: str, relative_path: str) -> str: return f"{base}/{rel}" if rel else base -_CLOUD_OBJECT_STORE_PROTOCOLS: frozenset[str] = frozenset({"abfs", "adl", "az", "gcs", "gs", "s3", "s3a"}) - - def _unwrap_fsspec_sync_fs(fs: Any) -> Any: inner = getattr(fs, "sync_fs", None) if inner is not None and inner is not fs: @@ -59,92 +55,6 @@ def _unwrap_fsspec_sync_fs(fs: Any) -> Any: return fs -def _fsspec_protocols(core: Any) -> set[str]: - raw = getattr(core, "protocol", None) - if isinstance(raw, str): - return {raw} - if isinstance(raw, (list, tuple)): - return set(raw) - return set() - - -def _cloud_parquet_protocol_family(core: Any) -> Literal["azure", "gcs", "s3"] | None: - """Map fsspec filesystem protocol(s) to how we extract parquet ``storage_options`` 
(not by class name).""" - protos = _fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS - if not protos: - return None - if protos & {"s3", "s3a"}: - return "s3" - if protos & {"abfs", "adl", "az"}: - return "azure" - if protos & {"gcs", "gs"}: - return "gcs" - return None - - -def _check_fsspec_at_remote_store_open(fs: Any) -> None: - """If ``fs`` looks like S3/GCS/Azure, ensure we can build parquet ``storage_options`` for it.""" - core = _unwrap_fsspec_sync_fs(fs) - protos = _fsspec_protocols(core) & _CLOUD_OBJECT_STORE_PROTOCOLS - if not protos: - return - if _cloud_parquet_protocol_family(core) is None: - label = f"{type(core).__module__}.{type(core).__qualname__}" - raise ValueError( - f"Cannot derive parquet storage_options for filesystem {label!r} with protocol(s) {protos!r}. " - "Supported protocol families: S3 (s3, s3a), Azure (abfs, adl, az), GCS (gcs, gs). " - "Custom implementations should expose a matching ``protocol`` (see fsspec)." - ) - - -def _storage_options_from_fs(fs: Any) -> dict[str, Any]: - """Build storage_options dict from an fsspec filesystem for use with to_parquet/read_parquet. - - Unwraps ``sync_fs`` chains (e.g. async wrappers). Dispatches by **reported fsspec protocol** (``fs.protocol``), - not by concrete class name, so subclasses and thin wrappers that speak ``s3``/``gs``/``az`` still work as long as - they expose the credential attributes we copy (same shape as s3fs, gcsfs, adlfs). - """ - core = _unwrap_fsspec_sync_fs(fs) - family = _cloud_parquet_protocol_family(core) - if family is None: - label = f"{type(core).__module__}.{type(core).__qualname__}" - protos = _fsspec_protocols(core) - raise ValueError( - f"Cannot derive parquet storage_options from filesystem {label!r} (protocols {protos!r}). " - "Expected an object-store protocol among " - f"{sorted(_CLOUD_OBJECT_STORE_PROTOCOLS)}." 
- ) - out: dict[str, Any] = {} - if family == "azure": - if getattr(core, "connection_string", None): - out["connection_string"] = core.connection_string - elif getattr(core, "account_name", None) and getattr(core, "account_key", None): - out["account_name"] = core.account_name - out["account_key"] = core.account_key - if getattr(core, "anon", None) is not None: - out["anon"] = core.anon - elif family == "s3": - if getattr(core, "endpoint_url", None): - out["endpoint_url"] = core.endpoint_url - if getattr(core, "key", None): - out["key"] = core.key - if getattr(core, "secret", None): - out["secret"] = core.secret - if getattr(core, "anon", None) is not None: - out["anon"] = core.anon - elif family == "gcs": - if getattr(core, "token", None) is not None: - out["token"] = core.token - if getattr(core, "_endpoint", None): - out["endpoint_url"] = core._endpoint - if getattr(core, "project", None): - out["project"] = core.project - else: - raise AssertionError(f"Unhandled protocol family {family!r}") - - return out - - def _get_transformations_from_ngff_dict( list_of_encoded_ngff_transformations: list[dict[str, Any]], ) -> MappingToCoordinateSystem_t: @@ -549,27 +459,6 @@ def _is_element_self_contained( return all(_backed_elements_contained_in_path(path=element_path, object=element)) -def _is_azure_http_response_error(exc: BaseException) -> bool: - """Return True if exc is an Azure SDK HttpResponseError (e.g. emulator API mismatch).""" - t = type(exc) - return t.__name__ == "HttpResponseError" and (getattr(t, "__module__", "") or "").startswith("azure.") - - -def _remote_zarr_store_exists(store: zarr.storage.StoreLike) -> bool: - """Return True if the store contains a zarr group. Closes the store. 
Handles Azure emulator errors.""" - try: - zarr.open_group(store, mode="r") - return True - except (GroupNotFoundError, OSError, FileNotFoundError): - return False - except Exception as e: - if _is_azure_http_response_error(e): - return False - raise - finally: - store.close() - - def _ensure_async_fs(fs: Any) -> Any: """Return an async fsspec filesystem for use with zarr's FsspecStore. @@ -619,10 +508,8 @@ def _resolve_zarr_store( ------ TypeError If the input type is unsupported. - ValueError - If a `zarr.Group` has an unsupported store type, or if the fsspec filesystem uses a cloud - object-store protocol (S3, GCS, Azure, …) but is not a supported implementation for parquet - ``storage_options`` (see :func:`_check_fsspec_at_remote_store_open`). + ValueError + If a `zarr.Group` has an unsupported store type. """ # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): @@ -641,7 +528,6 @@ def _resolve_zarr_store( if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group - _check_fsspec_at_remote_store_open(path.store.fs) return FsspecStore( fs=_ensure_async_fs(path.store.fs), path=_join_fsspec_store_path(path.store.path, path.path), @@ -651,7 +537,6 @@ def _resolve_zarr_store( # Unwrap and apply the same async-fs + parquet guards as a direct FsspecStore on the group. 
inner = path.store.store if isinstance(inner, FsspecStore): - _check_fsspec_at_remote_store_open(inner.fs) return FsspecStore( fs=_ensure_async_fs(inner.fs), path=_join_fsspec_store_path(inner.path, path.path), @@ -664,7 +549,6 @@ def _resolve_zarr_store( raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, UPath): # if input is a remote UPath, map it to an FSStore (check before StoreLike to avoid UnionType isinstance) - _check_fsspec_at_remote_store_open(path.fs) return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) if isinstance(path, zarr.storage.StoreLike): # Already a concrete store (LocalStore, FsspecStore, MemoryStore, …). Do not pass it as ``fs=`` to diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index d7bd2544f..8c0c08ed9 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -50,9 +50,8 @@ def make_zarr_store( def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: from spatialdata._io._utils import ( - _check_fsspec_at_remote_store_open, _join_fsspec_store_path, - _storage_options_from_fs, + _unwrap_fsspec_sync_fs, ) store = group.store @@ -63,19 +62,13 @@ def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: if isinstance(store, LocalStore): return make_zarr_store(Path(store.root) / group.path) if isinstance(store, FsspecStore): - _check_fsspec_at_remote_store_open(store.fs) protocol = getattr(store.fs, "protocol", None) if isinstance(protocol, (list, tuple)): protocol = protocol[0] if protocol else "file" elif protocol is None: protocol = "file" - storage_options: dict[str, Any] - try: - storage_options = _storage_options_from_fs(store.fs) - except ValueError: - storage_options = {} path = _join_fsspec_store_path(store.path, group.path) - return make_zarr_store(f"{protocol}://{path}", storage_options=storage_options) + return make_zarr_store(UPath(f"{protocol}://{path}", fs=_unwrap_fsspec_sync_fs(store.fs))) raise 
ValueError(f"Unsupported store type or zarr.Group: {type(group.store)}") diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py index 0a0b608b9..5443ec4d6 100644 --- a/tests/io/remote_storage/conftest.py +++ b/tests/io/remote_storage/conftest.py @@ -1,6 +1,6 @@ """Pytest hooks for ``tests/io/remote_storage/`` only (not loaded from repo-root ``tests/conftest.py``). -Creates buckets/containers when remote emulators are running. Assumes emulators are already up +Creates backend fixtures when remote emulators are running. Assumes emulators are already up (e.g. Docker: ``docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators``). Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. @@ -99,7 +99,7 @@ def _apply_resilient_async_close_patches() -> None: def pytest_configure(config: pytest.Config) -> None: - """Apply patches for remote storage tests (resilient async close at shutdown).""" + """Apply remote-test-only patches for resilient async close at shutdown.""" _apply_resilient_async_close_patches() @@ -191,7 +191,7 @@ def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 10.0, che @pytest.fixture(scope="session") def _remote_storage_buckets_containers(): - """Create buckets/containers on running emulators so remote storage tests can run. + """Create backend fixtures on running emulators so remote storage tests can run. Run with emulators up, e.g.: docker run --rm -d -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators diff --git a/tests/io/remote_storage/test_remote_storage.py b/tests/io/remote_storage/test_remote_storage.py index 23906b9ec..a53ed3377 100644 --- a/tests/io/remote_storage/test_remote_storage.py +++ b/tests/io/remote_storage/test_remote_storage.py @@ -1,4 +1,4 @@ -"""Integration tests for remote storage (Azure, S3, GCS) using real emulators. +"""Integration tests for remote-backed storage using real backend emulators. Emulators must be running (e.g. 
Docker: docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. @@ -82,11 +82,11 @@ def _assert_read_identical(expected: SpatialData, upath: UPath, *, check_path: b class TestPathSetter: - """Test SpatialData.path setter with UPath objects.""" + """Test SpatialData.path setter with remote UPath objects.""" @GET_UPATH_PARAMS def test_path_setter_accepts_upath(self, get_upath) -> None: - """Test that SpatialData.path setter accepts UPath for remote storage. + """Test that SpatialData.path setter accepts backend-configured UPath objects. This test fails, reproducing issue #441: SpatialData.path setter only accepts None | str | Path, not UPath, preventing the use of remote storage. @@ -98,7 +98,7 @@ def test_path_setter_accepts_upath(self, get_upath) -> None: @GET_UPATH_PARAMS def test_write_with_upath_sets_path(self, get_upath) -> None: - """Test that writing to UPath sets SpatialData.path correctly. + """Test that writing to a remote UPath sets SpatialData.path correctly. This test fails because SpatialData.write() rejects UPath in _validate_can_safely_write_to_path() before it can set sdata.path. @@ -118,10 +118,10 @@ def test_path_setter_rejects_other_types(self) -> None: class TestRemoteStorage: - """Test end-to-end remote storage workflows with UPath. + """Test end-to-end remote storage workflows with backend-configured UPath objects. - Note: These tests require appropriate emulators running (Azurite for Azure, - moto for S3, fake-gcs-server for GCS). Tests will fail if emulators are not available. + Note: These tests require the backend emulators from ``tests/io/remote_storage/conftest.py`` + to be running. Tests will fail if the emulators are not available. 
""" @REMOTE_STORAGE_PARAMS @@ -138,9 +138,9 @@ def test_write_read_roundtrip_remote(self, full_sdata: SpatialData, get_upath, s assert isinstance(full_sdata.path, UPath) assert full_sdata.path == upath _assert_read_identical(full_sdata, upath) - # ``str(upath)`` drops storage options on the UPath; S3 against moto still works via - # ``AWS_*`` / ``AWS_ENDPOINT_URL`` from conftest. Azure/GCS strings would omit credentials - # or emulator endpoints, so we only assert the string-URL read path for S3 here. + # ``str(upath)`` drops the configured filesystem object. Some backends can still be reopened + # from ambient environment defaults, but others rely on the configured UPath, so we only + # assert the string-URL read path for S3 here. if storage_name == "s3": sdata_str_url = SpatialData.read(str(upath)) assert isinstance(sdata_str_url.path, UPath) @@ -150,7 +150,7 @@ def test_write_read_roundtrip_remote(self, full_sdata: SpatialData, get_upath, s def test_path_setter_with_remote_then_operations( self, full_sdata: SpatialData, get_upath, storage_name: str ) -> None: - """Test setting remote path, then performing operations. + """Test setting a remote path, then performing operations. This test verifies that after setting a remote path: 1. Path is correctly stored @@ -169,9 +169,8 @@ def test_path_setter_with_remote_then_operations( def test_overwrite_existing_remote_data(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: """Test overwriting existing data in remote storage. - Verifies that overwriting existing remote data works (path-exists handling) - and data integrity after overwrite. Round-trip is covered by - test_write_read_roundtrip_remote. + Verifies that backend-managed overwriting works and that the data remains + intact afterwards. Round-trip is covered by ``test_write_read_roundtrip_remote``. 
""" upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{uuid.uuid4().hex}.zarr") full_sdata.write(upath, overwrite=True) @@ -180,7 +179,7 @@ def test_overwrite_existing_remote_data(self, full_sdata: SpatialData, get_upath @REMOTE_STORAGE_PARAMS def test_write_element_to_remote_storage(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: - """Test writing individual elements to remote storage using write_element(). + """Test writing individual elements to remote storage using ``write_element()``. This test verifies that: 1. Setting path to remote UPath works @@ -202,6 +201,7 @@ def test_write_element_to_remote_storage(self, full_sdata: SpatialData, get_upat def test_read_from_remote_zarr_group_keeps_backing_for_followup_write( self, full_sdata: SpatialData, get_upath, storage_name: str ) -> None: + """Test that reading from a remote zarr.Group preserves enough backing info for a later write.""" upath = get_upath(container=f"test-{storage_name}", path=f"read-group-{uuid.uuid4().hex}.zarr") full_sdata.write(upath, overwrite=True) diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py index d3c03d789..304f235da 100644 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ b/tests/io/remote_storage/test_resolve_zarr_store.py @@ -1,19 +1,17 @@ -"""Unit tests for remote-storage-specific store resolution and credential handling. +"""Unit tests for remote store resolution helpers. -Covers only code paths used when reading/writing from remote backends (Azure, S3, GCS): +These cover generic code paths used when reading/writing through remote backends: - zarr.Group to ZarrStore normalization for remote-backed groups. -- _storage_options_from_fs for Azure and GCS (used when writing parquet to remote). 
""" from __future__ import annotations import tempfile -import pytest import zarr from zarr.storage import FsspecStore, LocalStore, MemoryStore -from spatialdata._io._utils import _resolve_zarr_store, _storage_options_from_fs +from spatialdata._io._utils import _resolve_zarr_store from spatialdata._store import make_zarr_store_from_group, open_read_store @@ -26,7 +24,7 @@ def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: def test_make_zarr_store_from_remote_group() -> None: - """Remote zarr.Group inputs keep a usable UPath plus storage options for follow-up reads.""" + """Remote zarr.Group inputs keep a usable UPath and reopen through the same protocol.""" import fsspec from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper @@ -37,61 +35,7 @@ def test_make_zarr_store_from_remote_group() -> None: group = root.require_group("points").require_group("points") zarr_store = make_zarr_store_from_group(group) + assert getattr(zarr_store.path.fs, "protocol", None) == "memory" with open_read_store(zarr_store) as store: assert isinstance(store, FsspecStore) - - -def test_storage_options_from_fs_azure_account_key() -> None: - """_storage_options_from_fs extracts Azure credentials for writing parquet to remote Azure Blob.""" - - class AzureBlobFileSystemMock: - protocol = "abfs" - account_name = "dev" - account_key = "key123" - connection_string = None - anon = None - - out = _storage_options_from_fs(AzureBlobFileSystemMock()) - assert out["account_name"] == "dev" - assert out["account_key"] == "key123" - - -def test_storage_options_from_fs_gcs_endpoint() -> None: - """_storage_options_from_fs extracts GCS endpoint and project for writing parquet to remote GCS.""" - - class GCSFileSystemMock: - protocol = "gs" - token = "anon" - _endpoint = "http://localhost:4443" - project = "test" - - out = _storage_options_from_fs(GCSFileSystemMock()) - assert out["token"] == "anon" - assert out["endpoint_url"] == "http://localhost:4443" - assert 
out["project"] == "test" - - -def test_storage_options_from_fs_s3_by_protocol_not_class_name() -> None: - """Subclasses / wrappers are accepted when ``protocol`` is s3 and attrs match s3fs-style credentials.""" - - class CustomS3Wrapper: - protocol = "s3" - endpoint_url = "http://127.0.0.1:9000" - key = "access" - secret = "secret" - anon = False - - out = _storage_options_from_fs(CustomS3Wrapper()) - assert out["endpoint_url"] == "http://127.0.0.1:9000" - assert out["key"] == "access" - assert out["secret"] == "secret" - assert out["anon"] is False - - -def test_storage_options_from_fs_requires_object_store_protocol() -> None: - class NoCloud: - protocol = "file" - - with pytest.raises(ValueError, match="Cannot derive parquet storage_options"): - _storage_options_from_fs(NoCloud()) From b587971b2651b96ed7b6d1eac4008c278622d205 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 12:53:31 +0200 Subject: [PATCH 36/51] error when overwrite=False and add remote storeage --- .github/workflows/test.yaml | 59 +---- pyproject.toml | 3 - src/spatialdata/_core/spatialdata.py | 25 +- tests/io/remote_storage/Dockerfile.emulators | 33 --- tests/io/remote_storage/conftest.py | 215 ------------------ .../io/remote_storage/test_remote_storage.py | 214 ----------------- .../remote_storage/test_resolve_zarr_store.py | 41 ---- tests/io/test_store.py | 29 +++ 8 files changed, 46 insertions(+), 573 deletions(-) delete mode 100644 tests/io/remote_storage/Dockerfile.emulators delete mode 100644 tests/io/remote_storage/conftest.py delete mode 100644 tests/io/remote_storage/test_remote_storage.py delete mode 100644 tests/io/remote_storage/test_resolve_zarr_store.py diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5849c4dfa..1635bdd2a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -59,7 +59,7 @@ jobs: PLATFORM: ${{ matrix.os }} DISPLAY: :42 run: | - uv run pytest --cov --color=yes --cov-report=xml 
--ignore=tests/io/remote_storage/ + uv run pytest --cov --color=yes --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: @@ -67,60 +67,3 @@ jobs: verbose: true env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - - test-remote-storage: - runs-on: ubuntu-latest - defaults: - run: - shell: bash - strategy: - fail-fast: false - matrix: - python: ["3.11", "3.13"] - env: - MPLBACKEND: agg - PLATFORM: ubuntu-latest - DISPLAY: :42 - GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT: "false" - steps: - - uses: actions/checkout@v6 - - uses: astral-sh/setup-uv@v7 - with: - version: "latest" - python-version: ${{ matrix.python }} - - name: Install dependencies - run: | - uv add dask - uv sync --group=test - - name: Build and start storage emulators - run: | - docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . - docker run --rm -d --name spatialdata-emulators \ - -p 5000:5000 -p 10000:10000 -p 4443:4443 \ - spatialdata-emulators - - name: Wait for emulator ports - run: | - echo "Waiting for S3 (5000), Azure (10000), GCS (4443)..." 
- python3 -c " - import socket, time - for _ in range(45): - try: - for p in (5000, 10000, 4443): - socket.create_connection(('127.0.0.1', p), timeout=2) - print('Emulators ready.') - break - except (socket.error, OSError): - time.sleep(2) - else: - raise SystemExit('Emulators did not become ready.') - " - - name: Test remote storage - run: | - uv run pytest tests/io/remote_storage/ --cov --color=yes --cov-report=xml - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 - with: - name: coverage-remote-storage-${{ matrix.python }} - verbose: true - env: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/pyproject.toml b/pyproject.toml index 83d1976da..07ec8140b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,9 +66,6 @@ dev = [ "bump2version", ] test = [ - "adlfs", - "gcsfs", - "moto[server]", "pytest", "pytest-cov", "pytest-mock", diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index ba973c590..e4b087d05 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1054,10 +1054,11 @@ def _validate_can_safely_write_to_path( """ Guard against unsafe writes for **local** paths (zarr check, Dask backing, subfolders). - For :class:`upath.UPath`, only "store exists vs ``overwrite``" is checked. Local Dask-backing - and subfolder checks are omitted because backing paths are filesystem-local and are not - compared to object-store keys. Conflicts for remote targets are deferred to the backend/Zarr - write path; ``overwrite=True`` on remote URLs must be chosen carefully. + For :class:`upath.UPath`, ``overwrite=False`` is rejected: we cannot reliably check + whether a remote store already exists (fsspec existence semantics vary by backend and + object stores have no directory concept), so the "fail if exists" contract cannot be + honored. Callers must pass ``overwrite=True`` to explicitly acknowledge that the write + may clobber pre-existing data at the target. 
""" from spatialdata._io._utils import ( _backed_elements_contained_in_path, @@ -1065,8 +1066,8 @@ def _validate_can_safely_write_to_path( _resolve_zarr_store, ) - # Hierarchical URIs (``scheme://…``) must become UPath: plain ``Path(str)`` breaks cloud URLs - # (S3-compatible stores, Azure ``abfs://`` / ``az://``, GCS ``gs://``, ``https://``, fsspec chains, etc.). + # Hierarchical URIs ("scheme://...") must become UPath: plain Path(str) breaks cloud URLs + # (S3-compatible stores, Azure abfs:// / az://, GCS gs://, https://, fsspec chains, etc.). if isinstance(file_path, str) and "://" in file_path: file_path = UPath(file_path) elif isinstance(file_path, str): @@ -1076,6 +1077,12 @@ def _validate_can_safely_write_to_path( raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") if isinstance(file_path, UPath): + if not overwrite: + raise NotImplementedError( + "Writing to a remote (UPath) target requires overwrite=True. " + "We cannot reliably check whether the remote store already exists, so the write " + "may clobber existing data; pass overwrite=True to acknowledge this." + ) return # Local Path: existing logic @@ -1158,9 +1165,9 @@ def write( The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). overwrite If `True`, overwrite the Zarr store if it already exists. If `False`, `write()` will fail if the Zarr store - already exists. For remote paths (:class:`upath.UPath`), the extra safeguards used for local paths (that - Dask-backed files are not inside the write target) are not applied; use ``overwrite=True`` only when you - are sure the destination store may be replaced. + already exists. For remote paths (:class:`upath.UPath`), ``overwrite=True`` is required because we cannot + reliably check whether the remote target exists; passing ``overwrite=False`` raises ``NotImplementedError``. + Pass ``overwrite=True`` to explicitly acknowledge that the write may clobber pre-existing data. 
consolidate_metadata If `True`, triggers :func:`zarr.convenience.consolidate_metadata`, which writes all the metadata in a single file at the root directory of the store. This makes the data cloud accessible, which is required for certain diff --git a/tests/io/remote_storage/Dockerfile.emulators b/tests/io/remote_storage/Dockerfile.emulators deleted file mode 100644 index 43b6835e6..000000000 --- a/tests/io/remote_storage/Dockerfile.emulators +++ /dev/null @@ -1,33 +0,0 @@ -# Storage emulators for tests in this directory (S3, Azure, GCS). -# Emulator URLs: S3 127.0.0.1:5000 | Azure 127.0.0.1:10000 | GCS 127.0.0.1:4443 -# -# Build (from project root): -# docker build -f tests/io/remote_storage/Dockerfile.emulators -t spatialdata-emulators . -# -# Run in background (detached): -# docker run --rm -d --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators -# -# Run in foreground (attach to terminal): -# docker run --rm --name spatialdata-emulators -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators -# -# Stop / remove: -# docker stop spatialdata-emulators -# docker rm -f spatialdata-emulators # if already stopped or to force-remove -FROM node:20-slim -RUN apt-get update && apt-get install -y --no-install-recommends \ - python3 python3-pip python3-venv curl ca-certificates \ - && rm -rf /var/lib/apt/lists/* -RUN npm install -g azurite -RUN python3 -m venv /opt/venv && /opt/venv/bin/pip install --no-cache-dir 'moto[server]' -ENV PATH="/opt/venv/bin:$PATH" -# fake-gcs-server must match the image CPU. `ARG TARGETARCH=amd64` can stay amd64 on arm64 builds. 
-RUN set -eux; \ - arch="$(uname -m)"; \ - case "$arch" in x86_64) fgs=amd64 ;; aarch64|arm64) fgs=arm64 ;; *) echo "unsupported arch: $arch" >&2; exit 1 ;; esac; \ - cd /tmp && curl -fsSL -o fgs.tgz "https://github.com/fsouza/fake-gcs-server/releases/download/v1.54.0/fake-gcs-server_1.54.0_linux_${fgs}.tar.gz" \ - && tar xzf fgs.tgz && mv fake-gcs-server /usr/local/bin/ 2>/dev/null || mv fake-gcs-server_*/fake-gcs-server /usr/local/bin/ \ - && chmod +x /usr/local/bin/fake-gcs-server && rm -f fgs.tgz -RUN mkdir -p /data -EXPOSE 5000 10000 4443 -RUN echo 'moto_server -H 0.0.0.0 -p 5000 & azurite --silent --location /data --blobHost 0.0.0.0 --skipApiVersionCheck & fake-gcs-server -scheme http -port 4443 & wait' > /start.sh && chmod +x /start.sh -CMD ["/bin/sh", "/start.sh"] diff --git a/tests/io/remote_storage/conftest.py b/tests/io/remote_storage/conftest.py deleted file mode 100644 index 5443ec4d6..000000000 --- a/tests/io/remote_storage/conftest.py +++ /dev/null @@ -1,215 +0,0 @@ -"""Pytest hooks for ``tests/io/remote_storage/`` only (not loaded from repo-root ``tests/conftest.py``). - -Creates backend fixtures when remote emulators are running. Assumes emulators are already up -(e.g. Docker: ``docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators``). -Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. - -``pytest_configure`` here patches ``fsspec.asyn.sync`` and ``gcsfs`` session teardown for this subtree -only; the library package itself does not apply those patches globally. 
-""" - -from __future__ import annotations - -import os - -os.environ.setdefault("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false") - -import socket -import time - -import pytest - - -def _ensure_gcs_emulator_env() -> None: - """Point google-cloud-storage / gcsfs defaults at fake-gcs-server (not production).""" - raw = os.environ.get("STORAGE_EMULATOR_HOST", "").strip() - if raw in ("", "default"): - os.environ["STORAGE_EMULATOR_HOST"] = "http://127.0.0.1:4443" - elif not raw.startswith(("http://", "https://")): - os.environ["STORAGE_EMULATOR_HOST"] = f"http://{raw}" - - -# Error messages from asyncio when closing sessions after the event loop is gone (e.g. at process exit) -_LOOP_GONE_ERRORS = ("different loop", "Loop is not running") - - -def _patch_fsspec_sync_for_shutdown() -> None: - """If fsspec.asyn.sync() runs at exit when the loop is gone, return None instead of raising. - - SpatialData does not patch ``fsspec.asyn.sync`` at import time (too broad for a library); this - hook runs only for pytest sessions that load this conftest (remote emulator tests). 
- """ - import fsspec.asyn as asyn_mod - - _orig = asyn_mod.sync - - def _wrapped(loop, func, *args, timeout=None, **kwargs): - try: - return _orig(loop, func, *args, timeout=timeout, **kwargs) - except RuntimeError as e: - if any(msg in str(e) for msg in _LOOP_GONE_ERRORS): - return None - raise - - asyn_mod.sync = _wrapped - - -def _patch_gcsfs_close_session_for_shutdown() -> None: - """If gcsfs close_session fails (loop gone), close the connector synchronously instead of raising.""" - import asyncio - - import fsspec - import fsspec.asyn as asyn_mod - import gcsfs.core - - @staticmethod - def _close_session(loop, session, asynchronous=False): - if session.closed: - return - try: - running = asyncio.get_running_loop() - except RuntimeError: - running = None - - use_force_close = False - if loop and loop.is_running(): - loop.create_task(session.close()) - elif running and running.is_running() and asynchronous: - running.create_task(session.close()) - elif asyn_mod.loop[0] is not None and asyn_mod.loop[0].is_running(): - try: - asyn_mod.sync(asyn_mod.loop[0], session.close, timeout=0.1) - except (RuntimeError, fsspec.FSTimeoutError): - use_force_close = True - else: - use_force_close = True - - if use_force_close: - connector = getattr(session, "_connector", None) - if connector is not None: - connector._close() - - gcsfs.core.GCSFileSystem.close_session = _close_session - - -def _apply_resilient_async_close_patches() -> None: - """Avoid RuntimeError tracebacks when aiohttp sessions are closed at process exit (loop already gone).""" - _patch_fsspec_sync_for_shutdown() - _patch_gcsfs_close_session_for_shutdown() - - -def pytest_configure(config: pytest.Config) -> None: - """Apply remote-test-only patches for resilient async close at shutdown.""" - _apply_resilient_async_close_patches() - - -EMULATOR_PORTS = {"s3": 5000, "azure": 10000, "gcs": 4443} -S3_BUCKETS = ("bucket", "test-azure", "test-s3", "test-gcs") -AZURE_CONTAINERS = ("test-container", "test-azure", 
"test-s3", "test-gcs") -GCS_BUCKETS = ("bucket", "test-azure", "test-s3", "test-gcs") - -AZURITE_CONNECTION_STRING = ( - "DefaultEndpointsProtocol=http;" - "AccountName=devstoreaccount1;" - "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" - "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" -) - - -def _port_open(host: str = "127.0.0.1", port: int | None = None, timeout: float = 2.0) -> bool: - if port is None: - return False - try: - with socket.create_connection((host, port), timeout=timeout): - return True - except (OSError, TimeoutError): - return False - - -def _ensure_s3_buckets(host: str) -> None: - if not _port_open(host, EMULATOR_PORTS["s3"]): - return - os.environ.setdefault("AWS_ENDPOINT_URL", "http://127.0.0.1:5000") - os.environ.setdefault("AWS_ACCESS_KEY_ID", "testing") - os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "testing") - import boto3 - from botocore.config import Config - - client = boto3.client( - "s3", - endpoint_url=os.environ["AWS_ENDPOINT_URL"], - aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"], - aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"], - region_name="us-east-1", - config=Config(signature_version="s3v4"), - ) - existing = {b["Name"] for b in client.list_buckets().get("Buckets", [])} - for name in S3_BUCKETS: - if name not in existing: - client.create_bucket(Bucket=name) - - -def _ensure_azure_containers(host: str) -> None: - if not _port_open(host, EMULATOR_PORTS["azure"]): - return - from azure.storage.blob import BlobServiceClient - - client = BlobServiceClient.from_connection_string(AZURITE_CONNECTION_STRING) - existing = {c.name for c in client.list_containers()} - for name in AZURE_CONTAINERS: - if name not in existing: - client.create_container(name) - - -def _ensure_gcs_buckets(host: str) -> None: - if not _port_open(host, EMULATOR_PORTS["gcs"]): - return - os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://127.0.0.1:4443") - from 
google.auth.credentials import AnonymousCredentials - from google.cloud import storage - - client = storage.Client(credentials=AnonymousCredentials(), project="test") - existing = {b.name for b in client.list_buckets()} - for name in GCS_BUCKETS: - if name not in existing: - client.create_bucket(name) - - -def _wait_for_emulator_ports(host: str = "127.0.0.1", timeout: float = 10.0, check_interval: float = 2.0) -> None: - """Wait until all three emulator ports accept connections (e.g. after docker run).""" - deadline = time.monotonic() + timeout - while time.monotonic() < deadline: - if all(_port_open(host, EMULATOR_PORTS[p]) for p in ("s3", "azure", "gcs")): - return - time.sleep(check_interval) - raise RuntimeError( - f"Emulators did not become ready within {timeout}s. " - "Ensure the container is running: docker run --rm -d -p 5000:5000 " - "-p 10000:10000 -p 4443:4443 spatialdata-emulators" - ) - - -@pytest.fixture(scope="session") -def _remote_storage_buckets_containers(): - """Create backend fixtures on running emulators so remote storage tests can run. 
- - Run with emulators up, e.g.: - docker run --rm -d -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators - Then: pytest tests/io/test_remote_storage.py -v - """ - host = "127.0.0.1" - _wait_for_emulator_ports(host) - _ensure_s3_buckets(host) - _ensure_azure_containers(host) - _ensure_gcs_buckets(host) - yield - - -def pytest_collection_modifyitems(config: pytest.Config, items: list) -> None: - """Inject bucket/container creation for test_remote_storage.py.""" - if any("remote_storage" in str(getattr(item, "path", None) or getattr(item, "fspath", "")) for item in items): - _ensure_gcs_emulator_env() - for item in items: - path = getattr(item, "path", None) or getattr(item, "fspath", None) - if path and "test_remote_storage" in str(path): - item.add_marker(pytest.mark.usefixtures("_remote_storage_buckets_containers")) diff --git a/tests/io/remote_storage/test_remote_storage.py b/tests/io/remote_storage/test_remote_storage.py deleted file mode 100644 index a53ed3377..000000000 --- a/tests/io/remote_storage/test_remote_storage.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Integration tests for remote-backed storage using real backend emulators. - -Emulators must be running (e.g. Docker: docker run -p 5000:5000 -p 10000:10000 -p 4443:4443 spatialdata-emulators). -Ports: S3/moto 5000, Azure/Azurite 10000, GCS/fake-gcs-server 4443. -tests/io/remote_storage/conftest.py creates buckets/containers when emulators are up. - -All remote paths use uuid.uuid4().hex so each test run writes to a unique location. -""" - -from __future__ import annotations - -import os -import uuid - -import pytest -import zarr -from upath import UPath - -from spatialdata import SpatialData -from spatialdata._store import make_zarr_store, open_read_store -from spatialdata.testing import assert_spatial_data_objects_are_identical - -# Azure emulator connection string (Azurite default). 
-# https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string -AZURE_CONNECTION_STRING = ( - "DefaultEndpointsProtocol=http;" - "AccountName=devstoreaccount1;" - "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" - "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;" -) - - -def _get_azure_upath(container: str = "test-container", path: str = "test.zarr") -> UPath: - """Create Azure UPath for testing with Azurite (local emulator).""" - return UPath(f"az://{container}/{path}", connection_string=AZURE_CONNECTION_STRING) - - -def _get_s3_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: - """Create S3 UPath for testing (moto emulator at 5000).""" - endpoint = os.environ.get("AWS_ENDPOINT_URL", "http://127.0.0.1:5000") - if endpoint: - return UPath( - f"s3://{container}/{path}", - endpoint_url=endpoint, - key=os.environ.get("AWS_ACCESS_KEY_ID", "testing"), - secret=os.environ.get("AWS_SECRET_ACCESS_KEY", "testing"), - ) - return UPath(f"s3://{container}/{path}", anon=True) - - -def _get_gcs_upath(container: str = "bucket", path: str = "test.zarr") -> UPath: - """Create GCS UPath for testing with fake-gcs-server (port 4443).""" - os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://127.0.0.1:4443") - return UPath( - f"gs://{container}/{path}", - endpoint_url=os.environ["STORAGE_EMULATOR_HOST"], - token="anon", - project="test", - ) - - -GET_UPATH_PARAMS = pytest.mark.parametrize( - "get_upath", [_get_azure_upath, _get_s3_upath, _get_gcs_upath], ids=["azure", "s3", "gcs"] -) -REMOTE_STORAGE_PARAMS = pytest.mark.parametrize( - "get_upath,storage_name", - [(_get_azure_upath, "azure"), (_get_s3_upath, "s3"), (_get_gcs_upath, "gcs")], - ids=["azure", "s3", "gcs"], -) - -# Ensure buckets/containers exist on emulators before any test (see tests/io/remote_storage/conftest.py). 
-pytestmark = pytest.mark.usefixtures("_remote_storage_buckets_containers") - - -def _assert_read_identical(expected: SpatialData, upath: UPath, *, check_path: bool = True) -> None: - """Read SpatialData from upath and assert it equals expected; optionally assert path.""" - sdata_read = SpatialData.read(upath) - if check_path: - assert isinstance(sdata_read.path, UPath) - assert sdata_read.path == upath - assert_spatial_data_objects_are_identical(expected, sdata_read) - - -class TestPathSetter: - """Test SpatialData.path setter with remote UPath objects.""" - - @GET_UPATH_PARAMS - def test_path_setter_accepts_upath(self, get_upath) -> None: - """Test that SpatialData.path setter accepts backend-configured UPath objects. - - This test fails, reproducing issue #441: SpatialData.path setter only accepts - None | str | Path, not UPath, preventing the use of remote storage. - """ - sdata = SpatialData() - upath = get_upath(path=f"test-accept-{uuid.uuid4().hex}.zarr") - sdata.path = upath - assert sdata.path == upath - - @GET_UPATH_PARAMS - def test_write_with_upath_sets_path(self, get_upath) -> None: - """Test that writing to a remote UPath sets SpatialData.path correctly. - - This test fails because SpatialData.write() rejects UPath in - _validate_can_safely_write_to_path() before it can set sdata.path. - """ - sdata = SpatialData() - upath = get_upath(path=f"test-write-path-{uuid.uuid4().hex}.zarr") - sdata.write(upath) - assert isinstance(sdata.path, UPath) - - def test_path_setter_rejects_other_types(self) -> None: - """Test that SpatialData.path setter rejects other types.""" - sdata = SpatialData() - with pytest.raises(TypeError, match="Path must be.*str.*Path"): - sdata.path = 123 - with pytest.raises(TypeError, match="Path must be.*str.*Path"): - sdata.path = {"not": "a path"} - - -class TestRemoteStorage: - """Test end-to-end remote storage workflows with backend-configured UPath objects. 
- - Note: These tests require the backend emulators from ``tests/io/remote_storage/conftest.py`` - to be running. Tests will fail if the emulators are not available. - """ - - @REMOTE_STORAGE_PARAMS - def test_write_read_roundtrip_remote(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: - """Test writing and reading SpatialData to/from remote storage. - - This test verifies the full workflow: - 1. Write SpatialData to remote storage using UPath - 2. Read SpatialData from remote storage using UPath - 3. Verify data integrity (round-trip) - """ - upath = get_upath(container=f"test-{storage_name}", path=f"roundtrip-{uuid.uuid4().hex}.zarr") - full_sdata.write(upath, overwrite=True) - assert isinstance(full_sdata.path, UPath) - assert full_sdata.path == upath - _assert_read_identical(full_sdata, upath) - # ``str(upath)`` drops the configured filesystem object. Some backends can still be reopened - # from ambient environment defaults, but others rely on the configured UPath, so we only - # assert the string-URL read path for S3 here. - if storage_name == "s3": - sdata_str_url = SpatialData.read(str(upath)) - assert isinstance(sdata_str_url.path, UPath) - assert_spatial_data_objects_are_identical(full_sdata, sdata_str_url) - - @REMOTE_STORAGE_PARAMS - def test_path_setter_with_remote_then_operations( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: - """Test setting a remote path, then performing operations. - - This test verifies that after setting a remote path: - 1. Path is correctly stored - 2. Write operations work - 3. 
Read operations work - """ - upath = get_upath(container=f"test-{storage_name}", path=f"operations-{uuid.uuid4().hex}.zarr") - full_sdata.path = upath - assert full_sdata.path == upath - assert full_sdata.is_backed() is True - full_sdata.write(overwrite=True) - assert full_sdata.path == upath - _assert_read_identical(full_sdata, upath) - - @REMOTE_STORAGE_PARAMS - def test_overwrite_existing_remote_data(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: - """Test overwriting existing data in remote storage. - - Verifies that backend-managed overwriting works and that the data remains - intact afterwards. Round-trip is covered by ``test_write_read_roundtrip_remote``. - """ - upath = get_upath(container=f"test-{storage_name}", path=f"overwrite-{uuid.uuid4().hex}.zarr") - full_sdata.write(upath, overwrite=True) - full_sdata.write(upath, overwrite=True) - _assert_read_identical(full_sdata, upath, check_path=False) - - @REMOTE_STORAGE_PARAMS - def test_write_element_to_remote_storage(self, full_sdata: SpatialData, get_upath, storage_name: str) -> None: - """Test writing individual elements to remote storage using ``write_element()``. - - This test verifies that: - 1. Setting path to remote UPath works - 2. write_element() works with remote storage - 3. 
Written elements can be read back correctly - """ - upath = get_upath(container=f"test-{storage_name}", path=f"write-element-{uuid.uuid4().hex}.zarr") - # Create empty SpatialData and write to remote storage - empty_sdata = SpatialData() - empty_sdata.write(upath, overwrite=True) - full_sdata.path = upath - assert full_sdata.path == upath - # Write each element type individually - for _element_type, element_name, _ in full_sdata.gen_elements(): - full_sdata.write_element(element_name, overwrite=True) - _assert_read_identical(full_sdata, upath, check_path=False) - - @REMOTE_STORAGE_PARAMS - def test_read_from_remote_zarr_group_keeps_backing_for_followup_write( - self, full_sdata: SpatialData, get_upath, storage_name: str - ) -> None: - """Test that reading from a remote zarr.Group preserves enough backing info for a later write.""" - upath = get_upath(container=f"test-{storage_name}", path=f"read-group-{uuid.uuid4().hex}.zarr") - full_sdata.write(upath, overwrite=True) - - with open_read_store(make_zarr_store(upath)) as store: - group = zarr.open_group(store=store, mode="r") - sdata_from_group = SpatialData.read(group) - - assert isinstance(sdata_from_group.path, UPath) - sdata_from_group.write(overwrite=True) - _assert_read_identical(full_sdata, upath, check_path=False) diff --git a/tests/io/remote_storage/test_resolve_zarr_store.py b/tests/io/remote_storage/test_resolve_zarr_store.py deleted file mode 100644 index 304f235da..000000000 --- a/tests/io/remote_storage/test_resolve_zarr_store.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Unit tests for remote store resolution helpers. - -These cover generic code paths used when reading/writing through remote backends: -- zarr.Group to ZarrStore normalization for remote-backed groups. 
-""" - -from __future__ import annotations - -import tempfile - -import zarr -from zarr.storage import FsspecStore, LocalStore, MemoryStore - -from spatialdata._io._utils import _resolve_zarr_store -from spatialdata._store import make_zarr_store_from_group, open_read_store - - -def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: - """StoreLike inputs must not be wrapped as FsspecStore(fs=store) — that is only for async filesystems.""" - mem = MemoryStore() - assert _resolve_zarr_store(mem) is mem - loc = LocalStore(tempfile.mkdtemp()) - assert _resolve_zarr_store(loc) is loc - - -def test_make_zarr_store_from_remote_group() -> None: - """Remote zarr.Group inputs keep a usable UPath and reopen through the same protocol.""" - import fsspec - from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper - - fs = fsspec.filesystem("memory") - async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) - base = FsspecStore(async_fs, path="/") - root = zarr.open_group(store=base, mode="a") - group = root.require_group("points").require_group("points") - - zarr_store = make_zarr_store_from_group(group) - assert getattr(zarr_store.path.fs, "protocol", None) == "memory" - - with open_read_store(zarr_store) as store: - assert isinstance(store, FsspecStore) diff --git a/tests/io/test_store.py b/tests/io/test_store.py index 000721d5b..3871898df 100644 --- a/tests/io/test_store.py +++ b/tests/io/test_store.py @@ -1,10 +1,13 @@ from __future__ import annotations +import tempfile from pathlib import Path import zarr from upath import UPath +from zarr.storage import FsspecStore, LocalStore, MemoryStore +from spatialdata._io._utils import _resolve_zarr_store from spatialdata._store import ( make_zarr_store, make_zarr_store_from_group, @@ -50,3 +53,29 @@ def test_make_zarr_store_from_local_group(tmp_path: Path) -> None: child_store = make_zarr_store_from_group(group) assert child_store.path == tmp_path / "store.zarr" / "images" / "image" + + +def 
test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: + """StoreLike inputs must not be wrapped as FsspecStore(fs=store) -- that is only for async filesystems.""" + mem = MemoryStore() + assert _resolve_zarr_store(mem) is mem + loc = LocalStore(tempfile.mkdtemp()) + assert _resolve_zarr_store(loc) is loc + + +def test_make_zarr_store_from_remote_group() -> None: + """Remote zarr.Group inputs keep a usable UPath and reopen through the same protocol.""" + import fsspec + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + fs = fsspec.filesystem("memory") + async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) + base = FsspecStore(async_fs, path="/") + root = zarr.open_group(store=base, mode="a") + group = root.require_group("points").require_group("points") + + zarr_store = make_zarr_store_from_group(group) + assert getattr(zarr_store.path.fs, "protocol", None) == "memory" + + with open_read_store(zarr_store) as store: + assert isinstance(store, FsspecStore) From db4f2865660b9383c330fe63859b29a610d0c373 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 13:46:14 +0200 Subject: [PATCH 37/51] add tests for abstractions and fix bugs --- src/spatialdata/_core/spatialdata.py | 15 +- src/spatialdata/_io/_utils.py | 27 ++- src/spatialdata/_io/io_points.py | 15 +- src/spatialdata/_store.py | 12 +- tests/io/test_store.py | 17 ++ tests/io/test_store_abstractions.py | 283 +++++++++++++++++++++++++++ 6 files changed, 354 insertions(+), 15 deletions(-) create mode 100644 tests/io/test_store_abstractions.py diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index e4b087d05..5c5230c90 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1060,6 +1060,8 @@ def _validate_can_safely_write_to_path( honored. Callers must pass ``overwrite=True`` to explicitly acknowledge that the write may clobber pre-existing data at the target. 
""" + from upath.implementations.local import PosixUPath, WindowsUPath + from spatialdata._io._utils import ( _backed_elements_contained_in_path, _is_subfolder, @@ -1076,8 +1078,17 @@ def _validate_can_safely_write_to_path( if not isinstance(file_path, (Path, UPath)): raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") - if isinstance(file_path, UPath): - if not overwrite: + # Local UPath variants (PosixUPath / WindowsUPath) wrap a plain filesystem path; they + # have reliable existence semantics and must go through the same local validation as + # Path. Only *remote* UPath (cloud / http / memory / etc.) falls through the remote guard. + is_remote_upath = isinstance(file_path, UPath) and not isinstance(file_path, (PosixUPath, WindowsUPath)) + + if is_remote_upath: + # The overwrite opt-in only applies at the top-level store entry. Per-element writes + # issued internally by ``write()`` (and incremental ``write_element`` calls into an + # existing store) must not re-trigger the guard on every sub-key, or writing to a + # remote target would be impossible. + if not overwrite and not saving_an_element: raise NotImplementedError( "Writing to a remote (UPath) target requires overwrite=True. " "We cannot reliably check whether the remote store already exists, so the write " diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 72268fa66..360a40b4e 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -480,7 +480,10 @@ def _ensure_async_fs(fs: Any) -> Any: def _resolve_zarr_store( - path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, **kwargs: Any + path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, + *, + read_only: bool = False, + **kwargs: Any, ) -> zarr.storage.StoreLike: """ Normalize different Zarr store inputs into a usable store instance. 
@@ -496,9 +499,14 @@ def _resolve_zarr_store( path The input representing a Zarr store or group. Can be a filesystem path, remote path, existing store, or Zarr group. + read_only + If ``True``, constructed ``LocalStore`` / ``FsspecStore`` instances are built with + ``read_only=True``. Stores that already exist (when ``path`` is a ``StoreLike`` or + a ``zarr.Group`` whose wrapped store is not reconstructable) are returned as-is; + the caller is responsible for opening them at the right mode. **kwargs Additional keyword arguments forwarded to the underlying store - constructor (e.g. `mode`, `storage_options`). + constructor. Returns ------- @@ -511,7 +519,6 @@ def _resolve_zarr_store( ValueError If a `zarr.Group` has an unsupported store type. """ - # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): path = UPath(path) @@ -531,28 +538,30 @@ def _resolve_zarr_store( return FsspecStore( fs=_ensure_async_fs(path.store.fs), path=_join_fsspec_store_path(path.store.path, path.path), + read_only=read_only, **kwargs, ) if _cms is not None and isinstance(path.store, _cms): - # Unwrap and apply the same async-fs + parquet guards as a direct FsspecStore on the group. + # Unwrap and apply the same async-fs guards as a direct FsspecStore on the group. 
inner = path.store.store if isinstance(inner, FsspecStore): return FsspecStore( fs=_ensure_async_fs(inner.fs), path=_join_fsspec_store_path(inner.path, path.path), + read_only=read_only, **kwargs, ) if isinstance(inner, LocalStore): store_path = UPath(inner.root) / path.path - return LocalStore(store_path.path) + return LocalStore(store_path.path, read_only=read_only) return inner raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, UPath): - # if input is a remote UPath, map it to an FSStore (check before StoreLike to avoid UnionType isinstance) - return FsspecStore(_ensure_async_fs(path.fs), path=path.path, **kwargs) + # Check before StoreLike to avoid UnionType isinstance. + return FsspecStore(_ensure_async_fs(path.fs), path=path.path, read_only=read_only, **kwargs) if isinstance(path, zarr.storage.StoreLike): - # Already a concrete store (LocalStore, FsspecStore, MemoryStore, …). Do not pass it as ``fs=`` to - # FsspecStore — that only accepts an async fsspec filesystem and raises on stores (e.g. ``async_impl``). + # Already a concrete store (LocalStore, FsspecStore, MemoryStore, ...). Do not pass it as ``fs=`` to + # FsspecStore -- that only accepts an async fsspec filesystem and raises on stores (e.g. ``async_impl``). return path raise TypeError(f"Unsupported type: {type(path)}") diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index a774d8be4..9a70b4b1f 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -36,13 +36,24 @@ def _read_points( points_format = PointsFormats[version] parquet_store = zarr_store.child("points.parquet") - # cache on remote file needed for parquet reader to work - # TODO: allow reading in the metadata without caching all the data + # Passing filesystem= to read_parquet makes pyarrow convert dictionary columns into pandas + # categoricals eagerly per partition and marks them known=True with an empty category list. 
+ # This happens for ANY pyarrow filesystem (both LocalFileSystem and PyFileSystem(FSSpecHandler(.)) + # return the same broken categorical), so it is a property of the filesystem= handoff itself, + # not of local-vs-remote. Left as is, it would make write_points' cat.as_known() a no-op and + # the next to_parquet(filesystem=.) would fail with a per-partition schema mismatch + # (dictionary vs dictionary). We demote the categoricals back to + # "unknown" right here so that write_points recomputes categories consistently across partitions. + # TODO: allow reading in the metadata without materializing the data. points = read_parquet( parquet_store.arrow_path(), filesystem=parquet_store.arrow_filesystem(), ) assert isinstance(points, DaskDataFrame) + for column_name in points.columns: + c = points[column_name] + if c.dtype == "category" and c.cat.known: + points[column_name] = c.cat.as_unknown() if points.index.name == "__null_dask_index__": points = points.rename_axis(None) diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index 8c0c08ed9..6fc1069e8 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -74,9 +74,16 @@ def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: @contextmanager def open_read_store(store: ZarrStore) -> Any: + """Open ``store`` as a read-only backend store. + + The resolved zarr store is constructed with ``read_only=True`` so that the underlying + ``LocalStore`` / ``FsspecStore`` refuses writes at the store layer (not just at the group's + ``mode="r"`` level). This also lets remote read-only backends (e.g. public HTTPS zarrs) + skip any write-capability probe that fsspec may otherwise perform. 
+ """ from spatialdata._io._utils import _resolve_zarr_store - resolved_store = _resolve_zarr_store(store.path) + resolved_store = _resolve_zarr_store(store.path, read_only=True) try: yield resolved_store finally: @@ -85,9 +92,10 @@ def open_read_store(store: ZarrStore) -> Any: @contextmanager def open_write_store(store: ZarrStore) -> Any: + """Open ``store`` as a writable backend store (``read_only=False``).""" from spatialdata._io._utils import _resolve_zarr_store - resolved_store = _resolve_zarr_store(store.path) + resolved_store = _resolve_zarr_store(store.path, read_only=False) try: yield resolved_store finally: diff --git a/tests/io/test_store.py b/tests/io/test_store.py index 3871898df..d9ef877e6 100644 --- a/tests/io/test_store.py +++ b/tests/io/test_store.py @@ -63,6 +63,23 @@ def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: assert _resolve_zarr_store(loc) is loc +def test_resolve_zarr_store_forwards_read_only_local(tmp_path: Path) -> None: + """``_resolve_zarr_store(..., read_only=True)`` must reach the LocalStore constructor.""" + store = _resolve_zarr_store(tmp_path / "store.zarr", read_only=True) + assert isinstance(store, LocalStore) + assert store.read_only is True + + +def test_resolve_zarr_store_forwards_read_only_remote() -> None: + """``_resolve_zarr_store(..., read_only=True)`` must reach the FsspecStore constructor.""" + from fsspec.implementations.memory import MemoryFileSystem + + upath = UPath("memory://ro-remote.zarr", fs=MemoryFileSystem(skip_instance_cache=True)) + store = _resolve_zarr_store(upath, read_only=True) + assert isinstance(store, FsspecStore) + assert store.read_only is True + + def test_make_zarr_store_from_remote_group() -> None: """Remote zarr.Group inputs keep a usable UPath and reopen through the same protocol.""" import fsspec diff --git a/tests/io/test_store_abstractions.py b/tests/io/test_store_abstractions.py new file mode 100644 index 000000000..78f9d5963 --- /dev/null +++ 
b/tests/io/test_store_abstractions.py @@ -0,0 +1,283 @@ +"""Abstraction stress tests for ``SpatialData`` io against a memory-backed ``UPath``. + +These tests exercise the same read/write code paths that would be hit by a real remote +backend (S3/Azure/GCS/HTTPS), using only ``fsspec.filesystem("memory")`` and a thin +no-listing wrapper to approximate HTTP-like semantics. No emulators, no network. + +The file is deliberately scoped to the **public interface** (``SpatialData.read`` / +``SpatialData.write``) plus tamper-evident inspection of the underlying fsspec backend; +the lower-level ``ZarrStore`` / ``_resolve_zarr_store`` plumbing is unit-tested separately +in ``tests/io/test_store.py``. + +Coverage goals (generic, not provider-specific): +- ``SpatialData.read`` does not mutate backend bytes (tamper-evident snapshot equality). +- Full write / write-read-write round-trip through a remote-backed ``UPath`` for images, + labels, shapes, points, and a full sdata. The write-read-write cycle specifically pins + the categorical-schema invariant that the arrow-filesystem migration (this PR) had to + re-establish in ``_read_points``. +- Writing to a ``UPath`` lands the root metadata artifact in the backend. The read-time + consumption of consolidated metadata is an xfail placeholder for the cloud-native + follow-up. +- A ``MemoryFileSystem`` subclass that refuses listing proves that ``SpatialData.read`` + does not depend on directory listing for basic elements (the precondition for serving + public HTTPS zarrs). + +These tests are strictly stronger than moto/s3 emulator coverage: they need no external +process, no subprocess, no network, and they pin the exact abstraction boundary that the +cloud-native follow-up must not regress. 
+""" + +from __future__ import annotations + +import pytest +from fsspec.implementations.memory import MemoryFileSystem +from upath import UPath + +from spatialdata import SpatialData +from spatialdata.testing import assert_spatial_data_objects_are_identical + + +def _fresh_memory_upath(key: str) -> UPath: + """Build a UPath bound to a fresh (per-test) in-memory fsspec filesystem. + + ``skip_instance_cache=True`` ensures every test gets an isolated memory backend so + tests cannot leak state across each other. + """ + fs = MemoryFileSystem(skip_instance_cache=True) + return UPath(f"memory://{key}.zarr", fs=fs) + + +# --------------------------------------------------------------------------- +# SpatialData.read is side-effect-free against the backend. +# --------------------------------------------------------------------------- + + +class TestReadIsSideEffectFree: + """``SpatialData.read`` must not mutate a single byte of the backend store. + + Using a memory filesystem as a tamper-evident substrate, we snapshot every key+bytes + before and after the read and assert full equality. This is strictly a public-interface + invariant: if ``read_zarr`` (or any element reader) ever silently wrote to a remote + backend, this test is the first to catch it. The lower-level guarantee that + ``_resolve_zarr_store`` forwards ``read_only=True`` to the backend store is unit-tested + separately in ``tests/io/test_store.py``. 
+ """ + + def test_spatialdata_read_does_not_mutate_backend(self, images: SpatialData) -> None: + upath = _fresh_memory_upath("read-only-invariant") + images.write(upath, overwrite=True) + + fs = upath.fs + + def snapshot() -> dict[str, bytes]: + return {key: fs.cat_file(key) for key in fs.find(upath.path)} + + before = snapshot() + SpatialData.read(upath) + after = snapshot() + + assert before.keys() == after.keys(), ( + f"read added/removed backend keys; added={after.keys() - before.keys()}, " + f"removed={before.keys() - after.keys()}" + ) + # Equality on bytes (not just on keys) is what makes this tamper-evident: even a + # same-size rewrite of the same key would be caught. + assert before == after, "read mutated bytes in the backend store" + + +# --------------------------------------------------------------------------- +# Full SpatialData round-trip through a memory-backed UPath: the generic +# remote-backend stress test. +# --------------------------------------------------------------------------- + + +class TestMemoryUPathRoundtrip: + """Round-trip ``SpatialData`` objects through a memory-backed ``UPath``. + + Every code path from ``make_zarr_store`` -> ``_resolve_zarr_store`` -> + ``open_write_store`` / ``open_read_store`` -> ``zarr.open_group(FsspecStore)`` -> + ``io_raster`` / ``io_shapes`` / ``io_points`` / ``io_table`` is exercised identically + to how it would be against S3/Azure/GCS. If any of these regresses for remote backends, + one of these tests must break. + + Note that ``overwrite=True`` is required on every ``write()`` call that targets a + ``UPath`` (per the guard in ``_validate_can_safely_write_to_path``): remote existence + checks are unreliable across fsspec backends, so the caller must explicitly opt in. 
+ """ + + def test_roundtrip_images_only(self, images: SpatialData) -> None: + upath = _fresh_memory_upath("images") + images.write(upath, overwrite=True) + read = SpatialData.read(upath) + assert_spatial_data_objects_are_identical(images, read) + + def test_roundtrip_labels_only(self, labels: SpatialData) -> None: + upath = _fresh_memory_upath("labels") + labels.write(upath, overwrite=True) + read = SpatialData.read(upath) + assert_spatial_data_objects_are_identical(labels, read) + + def test_roundtrip_shapes_only(self, shapes: SpatialData) -> None: + upath = _fresh_memory_upath("shapes") + shapes.write(upath, overwrite=True) + read = SpatialData.read(upath) + assert_spatial_data_objects_are_identical(shapes, read) + + def test_roundtrip_points_only(self, points: SpatialData) -> None: + upath = _fresh_memory_upath("points") + points.write(upath, overwrite=True) + read = SpatialData.read(upath) + assert_spatial_data_objects_are_identical(points, read) + + def test_write_read_write_points_preserves_categorical_schema( + self, points: SpatialData + ) -> None: + """Regression guard for the arrow-filesystem categorical round-trip. + + This PR migrated points io to ``to_parquet`` / ``read_parquet`` with + ``filesystem=arrow_fs``. ``read_parquet(filesystem=arrow_fs)`` eagerly pandas-ifies + pyarrow dictionaries into ``CategoricalDtype`` marked ``known=True`` with an empty + category list -- that would defeat ``write_points``'s ``as_known()`` normalization + and a subsequent ``to_parquet(filesystem=arrow_fs)`` would fail with a per-partition + schema mismatch (``dictionary`` vs ``dictionary``). The + fix lives in ``_read_points`` (demote such categoricals to unknown so that + ``write_points`` recomputes categories across partitions); this test pins it. 
+ """ + upath1 = _fresh_memory_upath("points-rt1") + upath2 = _fresh_memory_upath("points-rt2") + points.write(upath1, overwrite=True) + read = SpatialData.read(upath1) + read.write(upath2, overwrite=True) + round_tripped = SpatialData.read(upath2) + assert_spatial_data_objects_are_identical(points, round_tripped) + + def test_write_read_write_full_sdata(self, full_sdata: SpatialData) -> None: + """End-to-end guard: a full sdata round-trips write -> read -> write cleanly. + + Pinned for the same reason as the points-only variant above: the arrow-filesystem + migration in this PR had to re-establish the categorical-schema invariant on the + read side so that write does not fail on the second pass. + """ + upath1 = _fresh_memory_upath("full-rt1") + upath2 = _fresh_memory_upath("full-rt2") + full_sdata.write(upath1, overwrite=True) + read = SpatialData.read(upath1) + read.write(upath2, overwrite=True) + round_tripped = SpatialData.read(upath2) + assert_spatial_data_objects_are_identical(full_sdata, round_tripped) + + def test_roundtrip_full_sdata(self, full_sdata: SpatialData) -> None: + upath = _fresh_memory_upath("full") + full_sdata.write(upath, overwrite=True) + read = SpatialData.read(upath) + assert_spatial_data_objects_are_identical(full_sdata, read) + + +# --------------------------------------------------------------------------- +# Consolidated metadata on read. +# --------------------------------------------------------------------------- + + +class TestConsolidatedMetadataOnRead: + """Writing produces a consolidated-metadata artifact; the read path does not consume it yet. + + The follow-up cloud-native PR will thread ``use_consolidated=True`` through + ``open_read_store`` / ``read_zarr``. When that lands, the xfail here flips to a pass + and the assertion becomes strict. 
+ """ + + def test_write_produces_root_metadata_on_memory_upath(self, images: SpatialData) -> None: + upath = _fresh_memory_upath("consolidated") + images.write(upath, overwrite=True) + fs = upath.fs + # The root metadata artifact differs by zarr version: zarr v3 writes ``zarr.json`` + # at every group, zarr v2 writes ``.zmetadata`` at the consolidated root. Accepting + # either keeps the test valid across versions and asserts that the write path + # actually reaches the memory backend. + root_keys = [p.rsplit("/", 1)[-1] for p in fs.find(upath.path)] + assert "zarr.json" in root_keys or ".zmetadata" in root_keys, root_keys + + @pytest.mark.xfail( + reason=( + "read_zarr opens the root group with zarr.open_group(store, mode='r') without " + "use_consolidated=True, so a consolidated metadata artifact is ignored on remote " + "reads. The cloud-native follow-up will thread use_consolidated through open_read_store." + ), + strict=True, + ) + def test_read_zarr_opens_via_consolidated_metadata(self, images: SpatialData) -> None: + upath = _fresh_memory_upath("consolidated-read") + images.write(upath, overwrite=True) + + # Count store GETs on the memory fs to detect that consolidated metadata is used: + # without consolidation, reading one image requires many small zarr.json / .zgroup GETs. + fs = upath.fs + original_cat_file = fs._cat_file + call_count = {"n": 0} + + def counting_cat_file(path, *args, **kwargs): + call_count["n"] += 1 + return original_cat_file(path, *args, **kwargs) + + fs._cat_file = counting_cat_file + try: + SpatialData.read(upath) + finally: + fs._cat_file = original_cat_file + + # With consolidated metadata, we expect very few small-metadata GETs for a + # trivial 1-image sdata. Without it, typical count is >> 10. The exact bound is + # a documented, loose sanity check, not a micro-benchmark. 
+ assert call_count["n"] < 10, ( + f"expected consolidated metadata to reduce GETs, saw {call_count['n']}" + ) + + +# --------------------------------------------------------------------------- +# HTTP-like read-only filesystem: simulates a remote that does not support listing. +# --------------------------------------------------------------------------- + + +class _NoListMemoryFileSystem(MemoryFileSystem): + """MemoryFileSystem that refuses directory listing, approximating HTTPS zarr semantics. + + Public HTTPS zarr reads cannot do ``ls`` / ``find`` on an arbitrary prefix; they can + only GET known keys. This wrapper fails any listing operation so we can prove that + our read path does not rely on listing -- the precondition for public HTTPS datasets + to be readable. + """ + + def _ls(self, path, detail=True, **kwargs): # type: ignore[override] + raise NotImplementedError("listing disabled to simulate HTTP-like semantics") + + def ls(self, path, detail=True, **kwargs): # type: ignore[override] + raise NotImplementedError("listing disabled to simulate HTTP-like semantics") + + def find(self, path, **kwargs): # type: ignore[override] + raise NotImplementedError("listing disabled to simulate HTTP-like semantics") + + +class TestHttpLikeReadOnlyStore: + """Approximate HTTPS zarr semantics: a read-only filesystem that refuses listing. + + The point is not to re-test zarr's FsspecStore but to catch the case where our own + ``read_zarr`` implementation (or an element reader) assumes it can list a directory. + That is exactly the pattern that breaks when pointed at a real public HTTPS zarr. + """ + + def test_read_sdata_from_no_list_fs(self, images: SpatialData, tmp_path) -> None: + # Write locally, then copy bytes into a no-list memory fs so that the backend + # resembles a public HTTPS zarr: every known key is readable but listing is disabled. 
local_path = tmp_path / "local.zarr" + images.write(local_path) + + no_list_fs = _NoListMemoryFileSystem(skip_instance_cache=True) + remote_root = "no-list.zarr" + for p in local_path.rglob("*"): + if p.is_file(): + rel = p.relative_to(local_path).as_posix() + no_list_fs.pipe_file(f"{remote_root}/{rel}", p.read_bytes()) + + upath = UPath(f"memory://{remote_root}", fs=no_list_fs) + read = SpatialData.read(upath) + assert_spatial_data_objects_are_identical(images, read) From 5bffb4da2dcef2b71e7be6685690ad4237f1bb64 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 13:53:33 +0200 Subject: [PATCH 38/51] give readonly stores --- src/spatialdata/_io/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 360a40b4e..ec949f799 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -524,14 +524,14 @@ def _resolve_zarr_store( if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use LocalStore - return LocalStore(path.path) + return LocalStore(path.path, read_only=read_only) if isinstance(path, zarr.Group): _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) # if the input is a zarr.Group, wrap it with a store if isinstance(path.store, LocalStore): store_path = UPath(path.store.root) / path.path - return LocalStore(store_path.path) + return LocalStore(store_path.path, read_only=read_only) if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group From 4be931b91114390d1318f670babb4b5e5b078fd0 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 13:58:25 +0200 Subject: [PATCH 39/51] test clarity --- tests/io/test_store_abstractions.py | 37 ++++++++++++++++------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/tests/io/test_store_abstractions.py
b/tests/io/test_store_abstractions.py index 78f9d5963..378d1cc4a 100644 --- a/tests/io/test_store_abstractions.py +++ b/tests/io/test_store_abstractions.py @@ -15,9 +15,10 @@ labels, shapes, points, and a full sdata. The write-read-write cycle specifically pins the categorical-schema invariant that the arrow-filesystem migration (this PR) had to re-establish in ``_read_points``. -- Writing to a ``UPath`` lands the root metadata artifact in the backend. The read-time - consumption of consolidated metadata is an xfail placeholder for the cloud-native - follow-up. +- Writing to a ``UPath`` lands the root metadata artifact in the backend. Reading via + consolidated metadata is left as a failing test on purpose: the invariant is stated, + but the fix (threading ``use_consolidated=True`` through ``read_zarr`` / the store + opener) is intentionally open for review discussion rather than silently suppressed. - A ``MemoryFileSystem`` subclass that refuses listing proves that ``SpatialData.read`` does not depend on directory listing for basic elements (the precondition for serving public HTTPS zarrs). @@ -181,9 +182,12 @@ def test_roundtrip_full_sdata(self, full_sdata: SpatialData) -> None: class TestConsolidatedMetadataOnRead: """Writing produces a consolidated-metadata artifact; the read path does not consume it yet. - The follow-up cloud-native PR will thread ``use_consolidated=True`` through - ``open_read_store`` / ``read_zarr``. When that lands, the xfail here flips to a pass - and the assertion becomes strict. + The second test in this class is intentionally left to fail (not xfail-ed): it pins + the invariant we want -- that reading a remote store uses the consolidated metadata + artifact so small-GET traffic stays bounded -- and leaves the implementation detail + (threading ``use_consolidated=True`` through ``read_zarr`` / ``open_read_store``) + open for reviewer discussion. 
Please comment on the right place to wire it in; we + would rather the gap be visible than hidden behind ``@pytest.mark.xfail``. """ def test_write_produces_root_metadata_on_memory_upath(self, images: SpatialData) -> None: @@ -197,33 +201,32 @@ def test_write_produces_root_metadata_on_memory_upath(self, images: SpatialData) root_keys = [p.rsplit("/", 1)[-1] for p in fs.find(upath.path)] assert "zarr.json" in root_keys or ".zmetadata" in root_keys, root_keys - @pytest.mark.xfail( - reason=( - "read_zarr opens the root group with zarr.open_group(store, mode='r') without " - "use_consolidated=True, so a consolidated metadata artifact is ignored on remote " - "reads. The cloud-native follow-up will thread use_consolidated through open_read_store." - ), - strict=True, - ) def test_read_zarr_opens_via_consolidated_metadata(self, images: SpatialData) -> None: + # Left to fail intentionally: read_zarr currently opens the root group with + # zarr.open_group(store, mode="r") without use_consolidated=True, so a written + # consolidated-metadata artifact is ignored on read. The fix site (wiring + # use_consolidated through open_read_store / read_zarr) is left open for review + # discussion rather than hidden behind @pytest.mark.xfail. upath = _fresh_memory_upath("consolidated-read") images.write(upath, overwrite=True) # Count store GETs on the memory fs to detect that consolidated metadata is used: # without consolidation, reading one image requires many small zarr.json / .zgroup GETs. + # We monkeypatch the public ``cat_file`` method (the one MemoryFileSystem actually + # exposes); targeting ``_cat_file`` would silently miss every call. 
fs = upath.fs - original_cat_file = fs._cat_file + original_cat_file = fs.cat_file call_count = {"n": 0} def counting_cat_file(path, *args, **kwargs): call_count["n"] += 1 return original_cat_file(path, *args, **kwargs) - fs._cat_file = counting_cat_file + fs.cat_file = counting_cat_file try: SpatialData.read(upath) finally: - fs._cat_file = original_cat_file + fs.cat_file = original_cat_file # With consolidated metadata, we expect very few small-metadata GETs for a # trivial 1-image sdata. Without it, typical count is >> 10. The exact bound is From b800d05d557e8751b57465528969c07cc12ccb4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 12:00:22 +0000 Subject: [PATCH 40/51] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/_io/io_shapes.py | 1 + tests/io/test_store_abstractions.py | 9 ++------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index 7344cd90e..c2128fe19 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -154,6 +154,7 @@ def _write_shapes_v01(shapes: GeoDataFrame, group: zarr.Group, element_format: F attrs["version"] = element_format.spatialdata_format_version return attrs + def _write_shapes_v02_v03( shapes: GeoDataFrame, group: zarr.Group, element_format: Format, geometry_encoding: Literal["WKB", "geoarrow"] ) -> Any: diff --git a/tests/io/test_store_abstractions.py b/tests/io/test_store_abstractions.py index 378d1cc4a..996c75b17 100644 --- a/tests/io/test_store_abstractions.py +++ b/tests/io/test_store_abstractions.py @@ -30,7 +30,6 @@ from __future__ import annotations -import pytest from fsspec.implementations.memory import MemoryFileSystem from upath import UPath @@ -130,9 +129,7 @@ def test_roundtrip_points_only(self, points: SpatialData) -> None: read = SpatialData.read(upath) 
assert_spatial_data_objects_are_identical(points, read) - def test_write_read_write_points_preserves_categorical_schema( - self, points: SpatialData - ) -> None: + def test_write_read_write_points_preserves_categorical_schema(self, points: SpatialData) -> None: """Regression guard for the arrow-filesystem categorical round-trip. This PR migrated points io to ``to_parquet`` / ``read_parquet`` with @@ -231,9 +228,7 @@ def counting_cat_file(path, *args, **kwargs): # With consolidated metadata, we expect very few small-metadata GETs for a # trivial 1-image sdata. Without it, typical count is >> 10. The exact bound is # a documented, loose sanity check, not a micro-benchmark. - assert call_count["n"] < 10, ( - f"expected consolidated metadata to reduce GETs, saw {call_count['n']}" - ) + assert call_count["n"] < 10, f"expected consolidated metadata to reduce GETs, saw {call_count['n']}" # --------------------------------------------------------------------------- From cb1d0d5447a2ca11ca8e704bac1b01550ff9a7dd Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 14:02:53 +0200 Subject: [PATCH 41/51] restore pointer --- docs/tutorials/notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks index 9cf35b236..8774b0d92 160000 --- a/docs/tutorials/notebooks +++ b/docs/tutorials/notebooks @@ -1 +1 @@ -Subproject commit 9cf35b236c4fdbce01a7c9e83f20256738b9a8fd +Subproject commit 8774b0d927e1d5ad38aec8f545c7bf0591c77fe7 From e6fba590ac164fac708d91d7580778230109cf5c Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 14:05:00 +0200 Subject: [PATCH 42/51] mypy plus notebook pointer --- src/spatialdata/_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index 6fc1069e8..f76606bf4 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -3,14 +3,14 @@ from contextlib import 
contextmanager from dataclasses import dataclass, replace from pathlib import Path -from typing import Any +from typing import Any, TypeAlias import pyarrow.fs as pafs import zarr from upath import UPath from zarr.storage import FsspecStore, LocalStore -PathLike = Path | UPath +PathLike: TypeAlias = Path | UPath def _normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: From 2c4a579a4ca4c78675d4a55189cb195a54f251ae Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 14:19:41 +0200 Subject: [PATCH 43/51] refactor helpers --- src/spatialdata/_core/spatialdata.py | 191 ++++++++++----------------- src/spatialdata/_io/_utils.py | 113 +++------------- src/spatialdata/_io/io_points.py | 44 ++---- src/spatialdata/_io/io_raster.py | 10 +- src/spatialdata/_io/io_shapes.py | 23 ++-- src/spatialdata/_io/io_table.py | 11 +- src/spatialdata/_io/io_zarr.py | 153 +++++++++++---------- src/spatialdata/_store.py | 32 +---- tests/io/test_store.py | 28 ---- 9 files changed, 192 insertions(+), 413 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 5c5230c90..739b225fe 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -30,7 +30,6 @@ validate_table_attr_keys, ) from spatialdata._logging import logger -from spatialdata._store import ZarrStore, make_zarr_store, open_read_store, open_write_store from spatialdata._types import ArrayLike, Raster_T from spatialdata._utils import _deprecation_alias from spatialdata.models import ( @@ -122,8 +121,7 @@ def __init__( tables: dict[str, AnnData] | Tables | None = None, attrs: Mapping[Any, Any] | None = None, ) -> None: - self._path: Path | UPath | None = None - self._zarr_store: ZarrStore | None = None + self._path: Path | None = None self._shared_keys: set[str | None] = set() self._images: Images = Images(shared_keys=self._shared_keys) @@ -550,34 +548,16 @@ def is_backed(self) -> bool: 
return self.path is not None @property - def path(self) -> Path | UPath | None: - """Path to the Zarr storage (always :class:`pathlib.Path` or :class:`upath.UPath` when set).""" + def path(self) -> Path | None: + """Path to the Zarr storage.""" return self._path @path.setter - def path(self, value: str | Path | UPath | None) -> None: - if value is None: - self._set_zarr_store(None) + def path(self, value: Path | None) -> None: + if value is None or isinstance(value, str | Path): + self._path = value else: - self._set_zarr_store(make_zarr_store(value)) - - def _set_zarr_store(self, zarr_store: ZarrStore | None) -> None: - self._zarr_store = zarr_store - self._path = None if zarr_store is None else zarr_store.path - - def _get_zarr_store(self) -> ZarrStore | None: - if self._zarr_store is not None: - return self._zarr_store - if self.path is None: - return None - self._zarr_store = make_zarr_store(self.path) - return self._zarr_store - - def _require_zarr_store(self) -> ZarrStore: - zarr_store = self._get_zarr_store() - if zarr_store is None: - raise ValueError("The SpatialData object is not backed by a Zarr store.") - return zarr_store + raise TypeError("Path must be `None`, a `str` or a `Path` object.") def locate_element(self, element: SpatialElement) -> list[str]: """ @@ -1002,7 +982,13 @@ def elements_paths_on_disk(self) -> list[str]: ------- A list of paths of the elements saved in the Zarr store. 
""" - zarr_store = self._require_zarr_store() + from spatialdata._io._utils import _resolve_zarr_store + + if self.path is None: + raise ValueError("The SpatialData object is not backed by a Zarr store.") + + store = _resolve_zarr_store(self.path) + root = zarr.open_group(store=store, mode="r") elements_in_zarr = [] def find_groups(obj: zarr.Group, path: str) -> None: @@ -1011,14 +997,13 @@ def find_groups(obj: zarr.Group, path: str) -> None: if isinstance(obj, zarr.Group) and path.count("/") == 1: elements_in_zarr.append(path) - with open_read_store(zarr_store) as store: - root = zarr.open_group(store=store, mode="r") - for element_type in root: - if element_type in ["images", "labels", "points", "shapes", "tables"]: - for element_name in root[element_type]: - path = f"{element_type}/{element_name}" - elements_in_zarr.append(path) + for element_type in root: + if element_type in ["images", "labels", "points", "shapes", "tables"]: + for element_name in root[element_type]: + path = f"{element_type}/{element_name}" + elements_in_zarr.append(path) # root.visit(lambda path: find_groups(root[path], path)) + store.close() return elements_in_zarr def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: @@ -1047,56 +1032,18 @@ def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: def _validate_can_safely_write_to_path( self, - file_path: str | Path | UPath, + file_path: str | Path, overwrite: bool = False, saving_an_element: bool = False, ) -> None: - """ - Guard against unsafe writes for **local** paths (zarr check, Dask backing, subfolders). - - For :class:`upath.UPath`, ``overwrite=False`` is rejected: we cannot reliably check - whether a remote store already exists (fsspec existence semantics vary by backend and - object stores have no directory concept), so the "fail if exists" contract cannot be - honored. 
Callers must pass ``overwrite=True`` to explicitly acknowledge that the write - may clobber pre-existing data at the target. - """ - from upath.implementations.local import PosixUPath, WindowsUPath + from spatialdata._io._utils import _backed_elements_contained_in_path, _is_subfolder, _resolve_zarr_store - from spatialdata._io._utils import ( - _backed_elements_contained_in_path, - _is_subfolder, - _resolve_zarr_store, - ) - - # Hierarchical URIs ("scheme://...") must become UPath: plain Path(str) breaks cloud URLs - # (S3-compatible stores, Azure abfs:// / az://, GCS gs://, https://, fsspec chains, etc.). - if isinstance(file_path, str) and "://" in file_path: - file_path = UPath(file_path) - elif isinstance(file_path, str): + if isinstance(file_path, str): file_path = Path(file_path) - if not isinstance(file_path, (Path, UPath)): - raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") - - # Local UPath variants (PosixUPath / WindowsUPath) wrap a plain filesystem path; they - # have reliable existence semantics and must go through the same local validation as - # Path. Only *remote* UPath (cloud / http / memory / etc.) falls through the remote guard. - is_remote_upath = isinstance(file_path, UPath) and not isinstance(file_path, (PosixUPath, WindowsUPath)) - - if is_remote_upath: - # The overwrite opt-in only applies at the top-level store entry. Per-element writes - # issued internally by ``write()`` (and incremental ``write_element`` calls into an - # existing store) must not re-trigger the guard on every sub-key, or writing to a - # remote target would be impossible. - if not overwrite and not saving_an_element: - raise NotImplementedError( - "Writing to a remote (UPath) target requires overwrite=True. " - "We cannot reliably check whether the remote store already exists, so the write " - "may clobber existing data; pass overwrite=True to acknowledge this." 
- ) - return + if not isinstance(file_path, Path): + raise ValueError(f"file_path must be a string or a Path object, type(file_path) = {type(file_path)}.") - # Local Path: existing logic # TODO: add test for this if os.path.exists(file_path): store = _resolve_zarr_store(file_path) @@ -1125,13 +1072,8 @@ def _validate_can_safely_write_to_path( ERROR_MSG + "\nDetails: the target path contains one or more files that Dask use for " "backing elements in the SpatialData object." + WORKAROUND ) - # Subfolder checks only for local paths (Path); skip when self.path is UPath - if ( - self.path is not None - and isinstance(self.path, Path) - and ( - _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) - ) + if self.path is not None and ( + _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) ): if saving_an_element and _is_subfolder(parent=self.path, child=file_path): raise ValueError( @@ -1160,7 +1102,7 @@ def _validate_all_elements(self) -> None: @_deprecation_alias(format="sdata_formats", version="0.7.0") def write( self, - file_path: str | Path | UPath | None = None, + file_path: str | Path, overwrite: bool = False, consolidate_metadata: bool = True, update_sdata_path: bool = True, @@ -1173,12 +1115,10 @@ def write( Parameters ---------- file_path - The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). + The path to the Zarr store to write to. overwrite If `True`, overwrite the Zarr store if it already exists. If `False`, `write()` will fail if the Zarr store - already exists. For remote paths (:class:`upath.UPath`), ``overwrite=True`` is required because we cannot - reliably check whether the remote target exists; passing ``overwrite=False`` raises ``NotImplementedError``. - Pass ``overwrite=True`` to explicitly acknowledge that the write may clobber pre-existing data. + already exists. 
consolidate_metadata If `True`, triggers :func:`zarr.convenience.consolidate_metadata`, which writes all the metadata in a single file at the root directory of the store. This makes the data cloud accessible, which is required for certain @@ -1216,23 +1156,21 @@ def write( Whether to use the WKB or geoarrow encoding for GeoParquet. See :meth:`geopandas.GeoDataFrame.to_parquet` for details. If None, uses the value from :attr:`spatialdata.settings.shapes_geometry_encoding`. """ + from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import _parse_formats parsed = _parse_formats(sdata_formats) - if file_path is None: - if self.path is None: - raise ValueError("file_path must be provided when SpatialData.path is not set.") - file_path = self.path - zarr_store = make_zarr_store(file_path) - file_path = zarr_store.path + if isinstance(file_path, str): + file_path = Path(file_path) self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() - with open_write_store(zarr_store) as store: - zarr_format = parsed["SpatialData"].zarr_format - zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) - self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) + store = _resolve_zarr_store(file_path) + zarr_format = parsed["SpatialData"].zarr_format + zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) + self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) + store.close() for element_type, element_name, element in self.gen_elements(): self._write_element( @@ -1246,7 +1184,7 @@ def write( ) if self.path != file_path and update_sdata_path: - self._set_zarr_store(zarr_store) + self.path = file_path if consolidate_metadata: self.write_consolidated_metadata() @@ -1254,7 +1192,7 @@ def write( def _write_element( self, element: SpatialElement | AnnData, - zarr_container_path: Path | UPath, + 
zarr_container_path: Path, element_type: str, element_name: str, overwrite: bool, @@ -1263,8 +1201,10 @@ def _write_element( ) -> None: from spatialdata._io.io_zarr import _get_groups_for_element - if not isinstance(zarr_container_path, (Path, UPath)): - raise ValueError(f"zarr_container_path must be a Path or UPath, got {type(zarr_container_path).__name__}.") + if not isinstance(zarr_container_path, Path): + raise ValueError( + f"zarr_container_path must be a Path object, type(zarr_container_path) = {type(zarr_container_path)}." + ) file_path_of_element = zarr_container_path / element_type / element_name self._validate_can_safely_write_to_path( file_path=file_path_of_element, overwrite=overwrite, saving_an_element=True @@ -1483,12 +1423,13 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: "more elements in the SpatialData object. Deleting the data would corrupt the SpatialData object." ) - zarr_store = self._require_zarr_store() + from spatialdata._io._utils import _resolve_zarr_store # delete the element - with open_write_store(zarr_store) as store: - root = zarr.open_group(store=store, mode="r+", use_consolidated=False) - del root[element_type][element_name] + store = _resolve_zarr_store(self.path) + root = zarr.open_group(store=store, mode="r+", use_consolidated=False) + del root[element_type][element_name] + store.close() if self.has_consolidated_metadata(): self.write_consolidated_metadata() @@ -1511,11 +1452,14 @@ def write_consolidated_metadata(self) -> None: _write_consolidated_metadata(self.path) def has_consolidated_metadata(self) -> bool: + from spatialdata._io._utils import _resolve_zarr_store + return_value = False - with open_read_store(self._require_zarr_store()) as store: - group = zarr.open_group(store, mode="r") - if getattr(group.metadata, "consolidated_metadata", None): - return_value = True + store = _resolve_zarr_store(self.path) + group = zarr.open_group(store, mode="r") + if getattr(group.metadata, 
"consolidated_metadata", None): + return_value = True + store.close() return return_value def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[str, SpatialElement | AnnData] | None: @@ -1545,7 +1489,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # check if the element exists in the Zarr storage if not _group_for_element_exists( - zarr_path=self.path, + zarr_path=Path(self.path), element_type=element_type, element_name=element_name, ): @@ -1559,7 +1503,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # warn the users if the element is not self-contained, that is, it is Dask-backed by files outside the Zarr # group for the element - element_zarr_path = self.path / element_type / element_name + element_zarr_path = Path(self.path) / element_type / element_name if not _is_element_self_contained(element=element, element_path=element_zarr_path): logger.info( f"Element {element_type}/{element_name} is not self-contained. 
The metadata will be" @@ -1600,7 +1544,7 @@ def write_channel_names(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have the check in the conditional if element_type == "images" and self.path is not None: _, _, element_group = _get_groups_for_element( - zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False + zarr_path=Path(self.path), element_type=element_type, element_name=element_name, use_consolidated=False ) from spatialdata._io._utils import overwrite_channel_names @@ -1644,7 +1588,7 @@ def write_transformations(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have a conditional assert self.path is not None _, _, element_group = _get_groups_for_element( - zarr_path=self.path, + zarr_path=Path(self.path), element_type=element_type, element_name=element_name, use_consolidated=False, @@ -1701,17 +1645,18 @@ def write_attrs( sdata_format: SpatialDataContainerFormatType | None = None, zarr_group: zarr.Group | None = None, ) -> None: + from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import CurrentSpatialDataContainerFormat, SpatialDataContainerFormatType sdata_format = sdata_format if sdata_format is not None else CurrentSpatialDataContainerFormat() assert isinstance(sdata_format, SpatialDataContainerFormatType) + store = None + if zarr_group is None: assert self.is_backed(), "The SpatialData object must be backed by a Zarr store to write attrs." 
- with open_write_store(self._require_zarr_store()) as store: - zarr_group = zarr.open_group(store=store, mode="r+") - self.write_attrs(sdata_format=sdata_format, zarr_group=zarr_group) - return + store = _resolve_zarr_store(self.path) + zarr_group = zarr.open_group(store=store, mode="r+") version = sdata_format.spatialdata_format_version version_specific_attrs = sdata_format.attrs_to_dict() @@ -1722,6 +1667,9 @@ def write_attrs( except TypeError as e: raise TypeError("Invalid attribute in SpatialData.attrs") from e + if store is not None: + store.close() + def write_metadata( self, element_name: str | None = None, @@ -2008,8 +1956,7 @@ def h(s: str) -> str: descr = "SpatialData object" if self.path is not None: - path_descr = str(self.path) if isinstance(self.path, UPath) else self.path.resolve() - descr += f", with associated Zarr store: {path_descr}" + descr += f", with associated Zarr store: {self.path.resolve()}" non_empty_elements = self._non_empty_elements() last_element_index = len(non_empty_elements) - 1 diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index ec949f799..6690d1118 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import filecmp -import json import os.path import re import sys @@ -39,22 +38,6 @@ from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes -def _join_fsspec_store_path(store_path: str, relative_path: str) -> str: - """Combine FsspecStore root with a zarr group path using POSIX ``/`` (fsspec keys; safe on Windows).""" - base = str(store_path).replace("\\", "/").rstrip("/") - rel = str(relative_path).replace("\\", "/").lstrip("/") - if not base: - return f"/{rel}" if rel else "/" - return f"{base}/{rel}" if rel else base - - -def _unwrap_fsspec_sync_fs(fs: Any) -> Any: - inner = getattr(fs, "sync_fs", None) - if inner is not None and inner is not fs: - return 
_unwrap_fsspec_sync_fs(inner) - return fs - - def _get_transformations_from_ngff_dict( list_of_encoded_ngff_transformations: list[dict[str, Any]], ) -> MappingToCoordinateSystem_t: @@ -387,9 +370,7 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No files.append(os.path.realpath(parquet_file)) -def _backed_elements_contained_in_path( - path: Path | UPath, object: SpatialData | SpatialElement | AnnData -) -> list[bool]: +def _backed_elements_contained_in_path(path: Path, object: SpatialData | SpatialElement | AnnData) -> list[bool]: """ Return the list of boolean values indicating if backing files for an object are child directory of a path. @@ -408,16 +389,9 @@ def _backed_elements_contained_in_path( ----- If an object does not have a Dask computational graph, it will return an empty list. It is possible for a single SpatialElement to contain multiple files in their Dask computational graph. - - For a remote ``path`` (:class:`upath.UPath`), this always returns an empty list: Dask backing paths - are resolved as local filesystem paths, so they cannot be compared to object-store locations. - :meth:`spatialdata.SpatialData.write` therefore skips the local "backing files in target" guard - for remote targets; ``overwrite=True`` on a remote URL must be used only when overwriting is safe. 
""" - if isinstance(path, UPath): - return [] if not isinstance(path, Path): - raise TypeError(f"Expected a Path or UPath object, got {type(path)}") + raise TypeError(f"Expected a Path object, got {type(path)}") return [_is_subfolder(parent=path, child=Path(fp)) for fp in get_dask_backing_files(object)] @@ -446,44 +420,16 @@ def _is_subfolder(parent: Path, child: Path) -> bool: def _is_element_self_contained( - element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, - element_path: Path | UPath, + element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, element_path: Path ) -> bool: - """Whether element Dask graphs only reference files under ``element_path`` (local) or N/A (remote).""" - if isinstance(element_path, UPath): - # Backing-file paths are local; cannot relate them to remote keys—assume OK for this heuristic. - return True if isinstance(element, DaskDataFrame): pass # TODO when running test_save_transformations it seems that for the same element this is called multiple times return all(_backed_elements_contained_in_path(path=element_path, object=element)) -def _ensure_async_fs(fs: Any) -> Any: - """Return an async fsspec filesystem for use with zarr's FsspecStore. - - Zarr's FsspecStore expects an async filesystem. If the given fs is synchronous, - it is converted using fsspec's public API (async instance or AsyncFileSystemWrapper) - so that ZarrUserWarning is not raised. 
- """ - if getattr(fs, "asynchronous", False): - return fs - import fsspec - - if getattr(fs, "async_impl", False): - fs_dict = json.loads(fs.to_json()) - fs_dict["asynchronous"] = True - return fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) - from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper - - return AsyncFileSystemWrapper(fs, asynchronous=True) - - def _resolve_zarr_store( - path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, - *, - read_only: bool = False, - **kwargs: Any, + path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, **kwargs: Any ) -> zarr.storage.StoreLike: """ Normalize different Zarr store inputs into a usable store instance. @@ -499,14 +445,9 @@ def _resolve_zarr_store( path The input representing a Zarr store or group. Can be a filesystem path, remote path, existing store, or Zarr group. - read_only - If ``True``, constructed ``LocalStore`` / ``FsspecStore`` instances are built with - ``read_only=True``. Stores that already exist (when ``path`` is a ``StoreLike`` or - a ``zarr.Group`` whose wrapped store is not reconstructable) are returned as-is; - the caller is responsible for opening them at the right mode. **kwargs Additional keyword arguments forwarded to the underlying store - constructor. + constructor (e.g. `mode`, `storage_options`). Returns ------- @@ -516,53 +457,37 @@ def _resolve_zarr_store( ------ TypeError If the input type is unsupported. - ValueError + ValueError If a `zarr.Group` has an unsupported store type. 
""" + # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): + # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use LocalStore - return LocalStore(path.path, read_only=read_only) + return LocalStore(path.path) if isinstance(path, zarr.Group): - _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) # if the input is a zarr.Group, wrap it with a store if isinstance(path.store, LocalStore): store_path = UPath(path.store.root) / path.path - return LocalStore(store_path.path, read_only=read_only) + return LocalStore(store_path.path) if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group - return FsspecStore( - fs=_ensure_async_fs(path.store.fs), - path=_join_fsspec_store_path(path.store.path, path.path), - read_only=read_only, - **kwargs, - ) - if _cms is not None and isinstance(path.store, _cms): - # Unwrap and apply the same async-fs guards as a direct FsspecStore on the group. - inner = path.store.store - if isinstance(inner, FsspecStore): - return FsspecStore( - fs=_ensure_async_fs(inner.fs), - path=_join_fsspec_store_path(inner.path, path.path), - read_only=read_only, - **kwargs, - ) - if isinstance(inner, LocalStore): - store_path = UPath(inner.root) / path.path - return LocalStore(store_path.path, read_only=read_only) - return inner + return FsspecStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) + if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): + # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store + return path.store.store raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") - if isinstance(path, UPath): - # Check before StoreLike to avoid UnionType isinstance. 
- return FsspecStore(_ensure_async_fs(path.fs), path=path.path, read_only=read_only, **kwargs) if isinstance(path, zarr.storage.StoreLike): - # Already a concrete store (LocalStore, FsspecStore, MemoryStore, ...). Do not pass it as ``fs=`` to - # FsspecStore -- that only accepts an async fsspec filesystem and raises on stores (e.g. ``async_impl``). - return path + # if the input already a store, wrap it in an FSStore + return FsspecStore(path, **kwargs) + if isinstance(path, UPath): + # if input is a remote UPath, map it to an FSStore + return FsspecStore(path.path, fs=path.fs, **kwargs) raise TypeError(f"Unsupported type: {type(path)}") diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 9a70b4b1f..b47fc418c 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -6,16 +6,13 @@ from dask.dataframe import DataFrame as DaskDataFrame from dask.dataframe import read_parquet from ome_zarr.format import Format -from upath import UPath from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, - _resolve_zarr_store, _write_metadata, overwrite_coordinate_transformations_non_raster, ) from spatialdata._io.format import CurrentPointsFormat, PointsFormats, _parse_version -from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group from spatialdata.models import get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -24,38 +21,21 @@ def _read_points( - store: str | Path | UPath | ZarrStore, + store: str | Path, ) -> DaskDataFrame: - """Read points from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" - zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) - resolved_store = _resolve_zarr_store(zarr_store.path) - f = zarr.open(resolved_store, mode="r") + """Read points from a zarr store.""" + f = zarr.open(store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is 
not None points_format = PointsFormats[version] - parquet_store = zarr_store.child("points.parquet") - # Passing filesystem= to read_parquet makes pyarrow convert dictionary columns into pandas - # categoricals eagerly per partition and marks them known=True with an empty category list. - # This happens for ANY pyarrow filesystem (both LocalFileSystem and PyFileSystem(FSSpecHandler(.)) - # return the same broken categorical), so it is a property of the filesystem= handoff itself, - # not of local-vs-remote. Left as is, it would make write_points' cat.as_known() a no-op and - # the next to_parquet(filesystem=.) would fail with a per-partition schema mismatch - # (dictionary vs dictionary). We demote the categoricals back to - # "unknown" right here so that write_points recomputes categories consistently across partitions. - # TODO: allow reading in the metadata without materializing the data. - points = read_parquet( - parquet_store.arrow_path(), - filesystem=parquet_store.arrow_filesystem(), - ) + store_root = f.store_path.store.root + path = store_root / f.path / "points.parquet" + # cache on remote file needed for parquet reader to work + # TODO: allow reading in the metadata without caching all the data + points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) assert isinstance(points, DaskDataFrame) - for column_name in points.columns: - c = points[column_name] - if c.dtype == "category" and c.cat.known: - points[column_name] = c.cat.as_unknown() - if points.index.name == "__null_dask_index__": - points = points.rename_axis(None) transformations = _get_transformations_from_ngff_dict(f.attrs.asdict()["coordinateTransformations"]) _set_transformations(points, transformations) @@ -88,7 +68,8 @@ def write_points( axes = get_axes_names(points) transformations = _get_transformations(points) - parquet_store = make_zarr_store_from_group(group).child("points.parquet") + store_root = group.store_path.store.root + path = store_root / 
group.path / "points.parquet" # The following code iterates through all columns in the 'points' DataFrame. If the column's datatype is # 'category', it checks whether the categories of this column are known. If not, it explicitly converts the @@ -103,10 +84,7 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - points_without_transform.to_parquet( - parquet_store.arrow_path(), - filesystem=parquet_store.arrow_filesystem(), - ) + points_without_transform.to_parquet(path) attrs = element_format.attrs_to_dict(points.attrs) attrs["version"] = element_format.spatialdata_format_version diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index 7eaf04d57..a8b2ab2ce 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -16,12 +16,10 @@ from ome_zarr.writer import write_labels as write_labels_ngff from ome_zarr.writer import write_multiscale as write_multiscale_ngff from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff -from upath import UPath from xarray import DataArray, DataTree from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, - _resolve_zarr_store, overwrite_coordinate_transformations_raster, ) from spatialdata._io.format import ( @@ -29,7 +27,6 @@ RasterFormatType, get_ome_zarr_format, ) -from spatialdata._store import ZarrStore, make_zarr_store from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names from spatialdata.models.models import ATTRS_KEY @@ -163,14 +160,13 @@ def _prepare_storage_options( def _read_multiscale( - store: str | Path | UPath | ZarrStore, raster_type: Literal["image", "labels"], reader_format: Format + store: str | Path, raster_type: Literal["image", "labels"], reader_format: Format ) -> DataArray | DataTree: + assert isinstance(store, str | Path) assert raster_type in ["image", "labels"] - zarr_store = store if 
isinstance(store, ZarrStore) else make_zarr_store(store) - resolved_store = _resolve_zarr_store(zarr_store.path) nodes: list[Node] = [] - image_loc = ZarrLocation(resolved_store, fmt=reader_format) + image_loc = ZarrLocation(store, fmt=reader_format) if exists := image_loc.exists(): image_reader = Reader(image_loc)() image_nodes = list(image_reader) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index c2128fe19..b07256273 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -9,11 +9,9 @@ from natsort import natsorted from ome_zarr.format import Format from shapely import from_ragged_array, to_ragged_array -from upath import UPath from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, - _resolve_zarr_store, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -25,7 +23,6 @@ ShapesFormatV03, _parse_version, ) -from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group from spatialdata.models import ShapesModel, get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -34,12 +31,10 @@ def _read_shapes( - store: str | Path | UPath | ZarrStore, + store: str | Path, ) -> GeoDataFrame: - """Read shapes from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" - zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) - resolved_store = _resolve_zarr_store(zarr_store.path) - f = zarr.open(resolved_store, mode="r") + """Read shapes from a zarr store.""" + f = zarr.open(store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None shape_format = ShapesFormats[version] @@ -59,9 +54,9 @@ def _read_shapes( geometry = from_ragged_array(typ, coords, offsets) geo_df = GeoDataFrame({"geometry": geometry}, index=index) elif isinstance(shape_format, ShapesFormatV02 | ShapesFormatV03): - parquet_store = zarr_store.child("shapes.parquet") - with 
parquet_store.arrow_filesystem().open_input_file(parquet_store.arrow_path()) as src: - geo_df = read_parquet(src) + store_root = f.store_path.store.root + path = Path(store_root) / f.path / "shapes.parquet" + geo_df = read_parquet(path) else: raise ValueError( f"Unsupported shapes format {shape_format} from version {version}. Please update the spatialdata library." @@ -174,13 +169,13 @@ def _write_shapes_v02_v03( """ from spatialdata.models._utils import TRANSFORM_KEY - parquet_store = make_zarr_store_from_group(group).child("shapes.parquet") + store_root = group.store_path.store.root + path = store_root / group.path / "shapes.parquet" # Temporarily remove transformations from attrs to avoid serialization issues transforms = shapes.attrs[TRANSFORM_KEY] del shapes.attrs[TRANSFORM_KEY] - with parquet_store.arrow_filesystem().open_output_stream(parquet_store.arrow_path()) as sink: - shapes.to_parquet(sink, geometry_encoding=geometry_encoding) + shapes.to_parquet(path, geometry_encoding=geometry_encoding) shapes.attrs[TRANSFORM_KEY] = transforms attrs = element_format.attrs_to_dict(shapes.attrs) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index a37e62a4e..8cd7b8385 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -8,9 +8,7 @@ from anndata import read_zarr as read_anndata_zarr from anndata._io.specs import write_elem as write_adata from ome_zarr.format import Format -from upath import UPath -from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import ( CurrentTablesFormat, TablesFormats, @@ -18,16 +16,13 @@ TablesFormatV02, _parse_version, ) -from spatialdata._store import ZarrStore, make_zarr_store from spatialdata.models import TableModel, get_table_keys -def _read_table(store: str | Path | UPath | ZarrStore) -> AnnData: - zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) - resolved_store = _resolve_zarr_store(zarr_store.path) - table = 
read_anndata_zarr(resolved_store) +def _read_table(store: str | Path) -> AnnData: + table = read_anndata_zarr(str(store)) - f = zarr.open(resolved_store, mode="r") + f = zarr.open(store, mode="r") version = _parse_version(f, expect_attrs_key=False) assert version is not None table_format = TablesFormats[version] diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 456374804..4c410fab0 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,12 +1,13 @@ from __future__ import annotations +import os import warnings from collections.abc import Callable from json import JSONDecodeError from pathlib import Path from typing import Any, Literal, cast -import zarr +import zarr.storage from anndata import AnnData from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -26,13 +27,12 @@ from spatialdata._io.io_shapes import _read_shapes from spatialdata._io.io_table import _read_table from spatialdata._logging import logger -from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_read_store from spatialdata._types import Raster_T def _read_zarr_group_spatialdata_element( root_group: zarr.Group, - root_store: ZarrStore, + root_store_path: str, sdata_version: Literal["0.1", "0.2"], selector: set[str], read_func: Callable[..., Any], @@ -54,7 +54,7 @@ def _read_zarr_group_spatialdata_element( # skip hidden files like .zgroup or .zmetadata continue elem_group = group[subgroup_name] - elem_store = root_store.child(elem_group.path) + elem_group_path = os.path.join(root_store_path, elem_group.path) with handle_read_errors( on_bad_files, location=f"{group.path}/{subgroup_name}", @@ -70,12 +70,12 @@ def _read_zarr_group_spatialdata_element( if element_type in ["image", "labels"]: reader_format = get_raster_format_for_read(elem_group, sdata_version) element = read_func( - elem_store, + elem_group_path, cast(Literal["image", "labels"], element_type), 
reader_format, ) elif element_type in ["shapes", "points", "tables"]: - element = read_func(elem_store) + element = read_func(elem_group_path) else: raise ValueError(f"Unknown element type {element_type}") element_container[subgroup_name] = element @@ -153,7 +153,24 @@ def read_zarr( ------- A SpatialData object. """ - zarr_store = make_zarr_store_from_group(store) if isinstance(store, zarr.Group) else make_zarr_store(store) + from spatialdata._io._utils import _resolve_zarr_store + + resolved_store = _resolve_zarr_store(store) + root_group = zarr.open_group(resolved_store, mode="r") + # the following is the SpatialDataContainerFormat version + if "spatialdata_attrs" not in root_group.metadata.attributes: + # backward compatibility for pre-versioned SpatialData zarr stores + sdata_version: Literal["0.1", "0.2"] = "0.1" + else: + sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] + if sdata_version == "0.1": + warnings.warn( + "SpatialData is not stored in the most current format. If you want to use Zarr v3" + ", please write the store to a new location using `sdata.write()`.", + UserWarning, + stacklevel=2, + ) + root_store_path = root_group.store.root images: dict[str, Raster_T] = {} labels: dict[str, Raster_T] = {} @@ -161,66 +178,50 @@ def read_zarr( shapes: dict[str, GeoDataFrame] = {} tables: dict[str, AnnData] = {} - with open_read_store(zarr_store) as resolved_store: - root_group = zarr.open_group(resolved_store, mode="r") - # the following is the SpatialDataContainerFormat version - if "spatialdata_attrs" not in root_group.metadata.attributes: - # backward compatibility for pre-versioned SpatialData zarr stores - sdata_version: Literal["0.1", "0.2"] = "0.1" - else: - sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] - if sdata_version == "0.1": - warnings.warn( - "SpatialData is not stored in the most current format. 
If you want to use Zarr v3" - ", please write the store to a new location using `sdata.write()`.", - UserWarning, - stacklevel=2, - ) - - selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or []) - logger.debug(f"Reading selection {selector}") - - # we could make this more readable. One can get lost when looking at this dict and iteration over the items - group_readers: dict[ - Literal["images", "labels", "shapes", "points", "tables"], - tuple[ - Callable[..., Any], - Literal["image", "labels", "shapes", "points", "tables"], - dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData], - ], - ] = { - # ome-zarr-py needs a kwargs that has "image" has key. So here we have "image" and not "images" - "images": (_read_multiscale, "image", images), - "labels": (_read_multiscale, "labels", labels), - "points": (_read_points, "points", points), - "shapes": (_read_shapes, "shapes", shapes), - "tables": (_read_table, "tables", tables), - } - for group_name, ( - read_func, - element_type, - element_container, - ) in group_readers.items(): - _read_zarr_group_spatialdata_element( - root_group=root_group, - root_store=zarr_store, - sdata_version=sdata_version, - selector=selector, - read_func=read_func, - group_name=group_name, - element_type=element_type, - element_container=element_container, - on_bad_files=on_bad_files, - ) - - # read attrs metadata - attrs = root_group.attrs.asdict() - if "spatialdata_attrs" in attrs: - # when refactoring the read_zarr function into reading componenets separately (and according to the version), - # we can move the code below (.pop()) into attrs_from_dict() - attrs.pop("spatialdata_attrs") - else: - attrs = None + selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or []) + logger.debug(f"Reading selection {selector}") + + # we could make this more readable. 
One can get lost when looking at this dict and iteration over the items + group_readers: dict[ + Literal["images", "labels", "shapes", "points", "tables"], + tuple[ + Callable[..., Any], + Literal["image", "labels", "shapes", "points", "tables"], + dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData], + ], + ] = { + # ome-zarr-py needs a kwargs that has "image" has key. So here we have "image" and not "images" + "images": (_read_multiscale, "image", images), + "labels": (_read_multiscale, "labels", labels), + "points": (_read_points, "points", points), + "shapes": (_read_shapes, "shapes", shapes), + "tables": (_read_table, "tables", tables), + } + for group_name, ( + read_func, + element_type, + element_container, + ) in group_readers.items(): + _read_zarr_group_spatialdata_element( + root_group=root_group, + root_store_path=root_store_path, + sdata_version=sdata_version, + selector=selector, + read_func=read_func, + group_name=group_name, + element_type=element_type, + element_container=element_container, + on_bad_files=on_bad_files, + ) + + # read attrs metadata + attrs = root_group.attrs.asdict() + if "spatialdata_attrs" in attrs: + # when refactoring the read_zarr function into reading componenets separately (and according to the version), + # we can move the code below (.pop()) into attrs_from_dict() + attrs.pop("spatialdata_attrs") + else: + attrs = None sdata = SpatialData( images=images, @@ -230,12 +231,12 @@ def read_zarr( tables=tables, attrs=attrs, ) - sdata._set_zarr_store(zarr_store) + sdata.path = resolved_store.root return sdata def _get_groups_for_element( - zarr_path: Path | UPath, element_type: str, element_name: str, use_consolidated: bool = True + zarr_path: Path, element_type: str, element_name: str, use_consolidated: bool = True ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. 
@@ -264,8 +265,8 @@ def _get_groups_for_element( ------- The Zarr groups for the root, element_type and element for a specific element. """ - if not isinstance(zarr_path, (Path, UPath)): - raise ValueError("zarr_path should be a Path or UPath object") + if not isinstance(zarr_path, Path): + raise ValueError("zarr_path should be a Path object") if element_type not in [ "images", @@ -288,7 +289,7 @@ def _get_groups_for_element( return root_group, element_type_group, element_name_group -def _group_for_element_exists(zarr_path: Path | UPath, element_type: str, element_name: str) -> bool: +def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: str) -> bool: """ Check if the group for an element exists. @@ -318,13 +319,9 @@ def _group_for_element_exists(zarr_path: Path | UPath, element_type: str, elemen return exists -def _write_consolidated_metadata(path: Path | UPath | str | None) -> None: +def _write_consolidated_metadata(path: Path | str | None) -> None: if path is not None: - if isinstance(path, UPath): - store = _resolve_zarr_store(path) - f = zarr.open_group(store, mode="r+", use_consolidated=False) - else: - f = zarr.open_group(path, mode="r+", use_consolidated=False) + f = zarr.open_group(path, mode="r+", use_consolidated=False) # .parquet files are not recognized as proper zarr and thus throw a warning. This does not affect SpatialData. # and therefore we silence it for our users as they can't do anything about this. 
# TODO check with remote PR whether we can prevent this warning at least for points data and whether with zarrv3 diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index f76606bf4..b231e71ce 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -6,14 +6,12 @@ from typing import Any, TypeAlias import pyarrow.fs as pafs -import zarr from upath import UPath -from zarr.storage import FsspecStore, LocalStore PathLike: TypeAlias = Path | UPath -def _normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: +def normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: if isinstance(path, str): return UPath(path, **(storage_options or {})) if "://" in path else Path(path) if isinstance(path, (Path, UPath)): @@ -26,7 +24,7 @@ class ZarrStore: path: PathLike def with_path(self, path: str | PathLike) -> ZarrStore: - return replace(self, path=_normalize_path(path)) + return replace(self, path=normalize_path(path)) def child(self, path: str | PathLike) -> ZarrStore: return self.with_path(self.path / path) @@ -45,31 +43,7 @@ def make_zarr_store( *, storage_options: dict[str, Any] | None = None, ) -> ZarrStore: - return ZarrStore(path=_normalize_path(path, storage_options)) - - -def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: - from spatialdata._io._utils import ( - _join_fsspec_store_path, - _unwrap_fsspec_sync_fs, - ) - - store = group.store - _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) - if _cms is not None and isinstance(store, _cms): - store = store.store - - if isinstance(store, LocalStore): - return make_zarr_store(Path(store.root) / group.path) - if isinstance(store, FsspecStore): - protocol = getattr(store.fs, "protocol", None) - if isinstance(protocol, (list, tuple)): - protocol = protocol[0] if protocol else "file" - elif protocol is None: - protocol = "file" - path = _join_fsspec_store_path(store.path, group.path) - 
return make_zarr_store(UPath(f"{protocol}://{path}", fs=_unwrap_fsspec_sync_fs(store.fs))) - raise ValueError(f"Unsupported store type or zarr.Group: {type(group.store)}") + return ZarrStore(path=normalize_path(path, storage_options)) @contextmanager diff --git a/tests/io/test_store.py b/tests/io/test_store.py index d9ef877e6..d7a18cfdc 100644 --- a/tests/io/test_store.py +++ b/tests/io/test_store.py @@ -10,7 +10,6 @@ from spatialdata._io._utils import _resolve_zarr_store from spatialdata._store import ( make_zarr_store, - make_zarr_store_from_group, open_read_store, open_write_store, ) @@ -44,17 +43,6 @@ def test_open_read_and_write_store_roundtrip(tmp_path: Path) -> None: assert group.attrs["answer"] == 42 -def test_make_zarr_store_from_local_group(tmp_path: Path) -> None: - zarr_store = make_zarr_store(tmp_path / "store.zarr") - - with open_write_store(zarr_store) as store: - root = zarr.create_group(store=store, overwrite=True) - group = root.require_group("images").require_group("image") - - child_store = make_zarr_store_from_group(group) - assert child_store.path == tmp_path / "store.zarr" / "images" / "image" - - def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: """StoreLike inputs must not be wrapped as FsspecStore(fs=store) -- that is only for async filesystems.""" mem = MemoryStore() @@ -80,19 +68,3 @@ def test_resolve_zarr_store_forwards_read_only_remote() -> None: assert store.read_only is True -def test_make_zarr_store_from_remote_group() -> None: - """Remote zarr.Group inputs keep a usable UPath and reopen through the same protocol.""" - import fsspec - from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper - - fs = fsspec.filesystem("memory") - async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) - base = FsspecStore(async_fs, path="/") - root = zarr.open_group(store=base, mode="a") - group = root.require_group("points").require_group("points") - - zarr_store = make_zarr_store_from_group(group) - assert 
getattr(zarr_store.path.fs, "protocol", None) == "memory" - - with open_read_store(zarr_store) as store: - assert isinstance(store, FsspecStore) From 43269f6e6788b5928ca7e8d5992e42d698646d97 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 12:19:43 +0000 Subject: [PATCH 44/51] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/io/test_store.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/io/test_store.py b/tests/io/test_store.py index d7a18cfdc..a50c8423f 100644 --- a/tests/io/test_store.py +++ b/tests/io/test_store.py @@ -66,5 +66,3 @@ def test_resolve_zarr_store_forwards_read_only_remote() -> None: store = _resolve_zarr_store(upath, read_only=True) assert isinstance(store, FsspecStore) assert store.read_only is True - - From c706bbf4208cdb8a8450f17b2dae2a40d837a200 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 14:24:02 +0200 Subject: [PATCH 45/51] Revert "refactor helpers" This reverts commit 2c4a579a4ca4c78675d4a55189cb195a54f251ae. 
--- src/spatialdata/_core/spatialdata.py | 191 +++++++++++++++++---------- src/spatialdata/_io/_utils.py | 113 +++++++++++++--- src/spatialdata/_io/io_points.py | 44 ++++-- src/spatialdata/_io/io_raster.py | 10 +- src/spatialdata/_io/io_shapes.py | 23 ++-- src/spatialdata/_io/io_table.py | 11 +- src/spatialdata/_io/io_zarr.py | 153 ++++++++++----------- src/spatialdata/_store.py | 32 ++++- tests/io/test_store.py | 30 +++++ 9 files changed, 415 insertions(+), 192 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 739b225fe..5c5230c90 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -30,6 +30,7 @@ validate_table_attr_keys, ) from spatialdata._logging import logger +from spatialdata._store import ZarrStore, make_zarr_store, open_read_store, open_write_store from spatialdata._types import ArrayLike, Raster_T from spatialdata._utils import _deprecation_alias from spatialdata.models import ( @@ -121,7 +122,8 @@ def __init__( tables: dict[str, AnnData] | Tables | None = None, attrs: Mapping[Any, Any] | None = None, ) -> None: - self._path: Path | None = None + self._path: Path | UPath | None = None + self._zarr_store: ZarrStore | None = None self._shared_keys: set[str | None] = set() self._images: Images = Images(shared_keys=self._shared_keys) @@ -548,16 +550,34 @@ def is_backed(self) -> bool: return self.path is not None @property - def path(self) -> Path | None: - """Path to the Zarr storage.""" + def path(self) -> Path | UPath | None: + """Path to the Zarr storage (always :class:`pathlib.Path` or :class:`upath.UPath` when set).""" return self._path @path.setter - def path(self, value: Path | None) -> None: - if value is None or isinstance(value, str | Path): - self._path = value + def path(self, value: str | Path | UPath | None) -> None: + if value is None: + self._set_zarr_store(None) else: - raise TypeError("Path must be `None`, a `str` or a `Path` object.") + 
self._set_zarr_store(make_zarr_store(value)) + + def _set_zarr_store(self, zarr_store: ZarrStore | None) -> None: + self._zarr_store = zarr_store + self._path = None if zarr_store is None else zarr_store.path + + def _get_zarr_store(self) -> ZarrStore | None: + if self._zarr_store is not None: + return self._zarr_store + if self.path is None: + return None + self._zarr_store = make_zarr_store(self.path) + return self._zarr_store + + def _require_zarr_store(self) -> ZarrStore: + zarr_store = self._get_zarr_store() + if zarr_store is None: + raise ValueError("The SpatialData object is not backed by a Zarr store.") + return zarr_store def locate_element(self, element: SpatialElement) -> list[str]: """ @@ -982,13 +1002,7 @@ def elements_paths_on_disk(self) -> list[str]: ------- A list of paths of the elements saved in the Zarr store. """ - from spatialdata._io._utils import _resolve_zarr_store - - if self.path is None: - raise ValueError("The SpatialData object is not backed by a Zarr store.") - - store = _resolve_zarr_store(self.path) - root = zarr.open_group(store=store, mode="r") + zarr_store = self._require_zarr_store() elements_in_zarr = [] def find_groups(obj: zarr.Group, path: str) -> None: @@ -997,13 +1011,14 @@ def find_groups(obj: zarr.Group, path: str) -> None: if isinstance(obj, zarr.Group) and path.count("/") == 1: elements_in_zarr.append(path) - for element_type in root: - if element_type in ["images", "labels", "points", "shapes", "tables"]: - for element_name in root[element_type]: - path = f"{element_type}/{element_name}" - elements_in_zarr.append(path) + with open_read_store(zarr_store) as store: + root = zarr.open_group(store=store, mode="r") + for element_type in root: + if element_type in ["images", "labels", "points", "shapes", "tables"]: + for element_name in root[element_type]: + path = f"{element_type}/{element_name}" + elements_in_zarr.append(path) # root.visit(lambda path: find_groups(root[path], path)) - store.close() return elements_in_zarr 
def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: @@ -1032,18 +1047,56 @@ def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: def _validate_can_safely_write_to_path( self, - file_path: str | Path, + file_path: str | Path | UPath, overwrite: bool = False, saving_an_element: bool = False, ) -> None: - from spatialdata._io._utils import _backed_elements_contained_in_path, _is_subfolder, _resolve_zarr_store + """ + Guard against unsafe writes for **local** paths (zarr check, Dask backing, subfolders). + + For :class:`upath.UPath`, ``overwrite=False`` is rejected: we cannot reliably check + whether a remote store already exists (fsspec existence semantics vary by backend and + object stores have no directory concept), so the "fail if exists" contract cannot be + honored. Callers must pass ``overwrite=True`` to explicitly acknowledge that the write + may clobber pre-existing data at the target. + """ + from upath.implementations.local import PosixUPath, WindowsUPath - if isinstance(file_path, str): + from spatialdata._io._utils import ( + _backed_elements_contained_in_path, + _is_subfolder, + _resolve_zarr_store, + ) + + # Hierarchical URIs ("scheme://...") must become UPath: plain Path(str) breaks cloud URLs + # (S3-compatible stores, Azure abfs:// / az://, GCS gs://, https://, fsspec chains, etc.). 
+ if isinstance(file_path, str) and "://" in file_path: + file_path = UPath(file_path) + elif isinstance(file_path, str): file_path = Path(file_path) - if not isinstance(file_path, Path): - raise ValueError(f"file_path must be a string or a Path object, type(file_path) = {type(file_path)}.") + if not isinstance(file_path, (Path, UPath)): + raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") + + # Local UPath variants (PosixUPath / WindowsUPath) wrap a plain filesystem path; they + # have reliable existence semantics and must go through the same local validation as + # Path. Only *remote* UPath (cloud / http / memory / etc.) falls through the remote guard. + is_remote_upath = isinstance(file_path, UPath) and not isinstance(file_path, (PosixUPath, WindowsUPath)) + + if is_remote_upath: + # The overwrite opt-in only applies at the top-level store entry. Per-element writes + # issued internally by ``write()`` (and incremental ``write_element`` calls into an + # existing store) must not re-trigger the guard on every sub-key, or writing to a + # remote target would be impossible. + if not overwrite and not saving_an_element: + raise NotImplementedError( + "Writing to a remote (UPath) target requires overwrite=True. " + "We cannot reliably check whether the remote store already exists, so the write " + "may clobber existing data; pass overwrite=True to acknowledge this." + ) + return + # Local Path: existing logic # TODO: add test for this if os.path.exists(file_path): store = _resolve_zarr_store(file_path) @@ -1072,8 +1125,13 @@ def _validate_can_safely_write_to_path( ERROR_MSG + "\nDetails: the target path contains one or more files that Dask use for " "backing elements in the SpatialData object." 
+ WORKAROUND ) - if self.path is not None and ( - _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + # Subfolder checks only for local paths (Path); skip when self.path is UPath + if ( + self.path is not None + and isinstance(self.path, Path) + and ( + _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + ) ): if saving_an_element and _is_subfolder(parent=self.path, child=file_path): raise ValueError( @@ -1102,7 +1160,7 @@ def _validate_all_elements(self) -> None: @_deprecation_alias(format="sdata_formats", version="0.7.0") def write( self, - file_path: str | Path, + file_path: str | Path | UPath | None = None, overwrite: bool = False, consolidate_metadata: bool = True, update_sdata_path: bool = True, @@ -1115,10 +1173,12 @@ def write( Parameters ---------- file_path - The path to the Zarr store to write to. + The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). overwrite If `True`, overwrite the Zarr store if it already exists. If `False`, `write()` will fail if the Zarr store - already exists. + already exists. For remote paths (:class:`upath.UPath`), ``overwrite=True`` is required because we cannot + reliably check whether the remote target exists; passing ``overwrite=False`` raises ``NotImplementedError``. + Pass ``overwrite=True`` to explicitly acknowledge that the write may clobber pre-existing data. consolidate_metadata If `True`, triggers :func:`zarr.convenience.consolidate_metadata`, which writes all the metadata in a single file at the root directory of the store. This makes the data cloud accessible, which is required for certain @@ -1156,21 +1216,23 @@ def write( Whether to use the WKB or geoarrow encoding for GeoParquet. See :meth:`geopandas.GeoDataFrame.to_parquet` for details. If None, uses the value from :attr:`spatialdata.settings.shapes_geometry_encoding`. 
""" - from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import _parse_formats parsed = _parse_formats(sdata_formats) - if isinstance(file_path, str): - file_path = Path(file_path) + if file_path is None: + if self.path is None: + raise ValueError("file_path must be provided when SpatialData.path is not set.") + file_path = self.path + zarr_store = make_zarr_store(file_path) + file_path = zarr_store.path self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() - store = _resolve_zarr_store(file_path) - zarr_format = parsed["SpatialData"].zarr_format - zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) - self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) - store.close() + with open_write_store(zarr_store) as store: + zarr_format = parsed["SpatialData"].zarr_format + zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) + self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) for element_type, element_name, element in self.gen_elements(): self._write_element( @@ -1184,7 +1246,7 @@ def write( ) if self.path != file_path and update_sdata_path: - self.path = file_path + self._set_zarr_store(zarr_store) if consolidate_metadata: self.write_consolidated_metadata() @@ -1192,7 +1254,7 @@ def write( def _write_element( self, element: SpatialElement | AnnData, - zarr_container_path: Path, + zarr_container_path: Path | UPath, element_type: str, element_name: str, overwrite: bool, @@ -1201,10 +1263,8 @@ def _write_element( ) -> None: from spatialdata._io.io_zarr import _get_groups_for_element - if not isinstance(zarr_container_path, Path): - raise ValueError( - f"zarr_container_path must be a Path object, type(zarr_container_path) = {type(zarr_container_path)}." 
- ) + if not isinstance(zarr_container_path, (Path, UPath)): + raise ValueError(f"zarr_container_path must be a Path or UPath, got {type(zarr_container_path).__name__}.") file_path_of_element = zarr_container_path / element_type / element_name self._validate_can_safely_write_to_path( file_path=file_path_of_element, overwrite=overwrite, saving_an_element=True @@ -1423,13 +1483,12 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: "more elements in the SpatialData object. Deleting the data would corrupt the SpatialData object." ) - from spatialdata._io._utils import _resolve_zarr_store + zarr_store = self._require_zarr_store() # delete the element - store = _resolve_zarr_store(self.path) - root = zarr.open_group(store=store, mode="r+", use_consolidated=False) - del root[element_type][element_name] - store.close() + with open_write_store(zarr_store) as store: + root = zarr.open_group(store=store, mode="r+", use_consolidated=False) + del root[element_type][element_name] if self.has_consolidated_metadata(): self.write_consolidated_metadata() @@ -1452,14 +1511,11 @@ def write_consolidated_metadata(self) -> None: _write_consolidated_metadata(self.path) def has_consolidated_metadata(self) -> bool: - from spatialdata._io._utils import _resolve_zarr_store - return_value = False - store = _resolve_zarr_store(self.path) - group = zarr.open_group(store, mode="r") - if getattr(group.metadata, "consolidated_metadata", None): - return_value = True - store.close() + with open_read_store(self._require_zarr_store()) as store: + group = zarr.open_group(store, mode="r") + if getattr(group.metadata, "consolidated_metadata", None): + return_value = True return return_value def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[str, SpatialElement | AnnData] | None: @@ -1489,7 +1545,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # check if the element exists in the Zarr storage if not 
_group_for_element_exists( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, ): @@ -1503,7 +1559,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # warn the users if the element is not self-contained, that is, it is Dask-backed by files outside the Zarr # group for the element - element_zarr_path = Path(self.path) / element_type / element_name + element_zarr_path = self.path / element_type / element_name if not _is_element_self_contained(element=element, element_path=element_zarr_path): logger.info( f"Element {element_type}/{element_name} is not self-contained. The metadata will be" @@ -1544,7 +1600,7 @@ def write_channel_names(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have the check in the conditional if element_type == "images" and self.path is not None: _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), element_type=element_type, element_name=element_name, use_consolidated=False + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False ) from spatialdata._io._utils import overwrite_channel_names @@ -1588,7 +1644,7 @@ def write_transformations(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have a conditional assert self.path is not None _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False, @@ -1645,18 +1701,17 @@ def write_attrs( sdata_format: SpatialDataContainerFormatType | None = None, zarr_group: zarr.Group | None = None, ) -> None: - from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import CurrentSpatialDataContainerFormat, SpatialDataContainerFormatType sdata_format = sdata_format if sdata_format is not None else 
CurrentSpatialDataContainerFormat() assert isinstance(sdata_format, SpatialDataContainerFormatType) - store = None - if zarr_group is None: assert self.is_backed(), "The SpatialData object must be backed by a Zarr store to write attrs." - store = _resolve_zarr_store(self.path) - zarr_group = zarr.open_group(store=store, mode="r+") + with open_write_store(self._require_zarr_store()) as store: + zarr_group = zarr.open_group(store=store, mode="r+") + self.write_attrs(sdata_format=sdata_format, zarr_group=zarr_group) + return version = sdata_format.spatialdata_format_version version_specific_attrs = sdata_format.attrs_to_dict() @@ -1667,9 +1722,6 @@ def write_attrs( except TypeError as e: raise TypeError("Invalid attribute in SpatialData.attrs") from e - if store is not None: - store.close() - def write_metadata( self, element_name: str | None = None, @@ -1956,7 +2008,8 @@ def h(s: str) -> str: descr = "SpatialData object" if self.path is not None: - descr += f", with associated Zarr store: {self.path.resolve()}" + path_descr = str(self.path) if isinstance(self.path, UPath) else self.path.resolve() + descr += f", with associated Zarr store: {path_descr}" non_empty_elements = self._non_empty_elements() last_element_index = len(non_empty_elements) - 1 diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 6690d1118..ec949f799 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import filecmp +import json import os.path import re import sys @@ -38,6 +39,22 @@ from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes +def _join_fsspec_store_path(store_path: str, relative_path: str) -> str: + """Combine FsspecStore root with a zarr group path using POSIX ``/`` (fsspec keys; safe on Windows).""" + base = str(store_path).replace("\\", "/").rstrip("/") + rel = str(relative_path).replace("\\", "/").lstrip("/") + if not base: + return 
f"/{rel}" if rel else "/" + return f"{base}/{rel}" if rel else base + + +def _unwrap_fsspec_sync_fs(fs: Any) -> Any: + inner = getattr(fs, "sync_fs", None) + if inner is not None and inner is not fs: + return _unwrap_fsspec_sync_fs(inner) + return fs + + def _get_transformations_from_ngff_dict( list_of_encoded_ngff_transformations: list[dict[str, Any]], ) -> MappingToCoordinateSystem_t: @@ -370,7 +387,9 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No files.append(os.path.realpath(parquet_file)) -def _backed_elements_contained_in_path(path: Path, object: SpatialData | SpatialElement | AnnData) -> list[bool]: +def _backed_elements_contained_in_path( + path: Path | UPath, object: SpatialData | SpatialElement | AnnData +) -> list[bool]: """ Return the list of boolean values indicating if backing files for an object are child directory of a path. @@ -389,9 +408,16 @@ def _backed_elements_contained_in_path(path: Path, object: SpatialData | Spatial ----- If an object does not have a Dask computational graph, it will return an empty list. It is possible for a single SpatialElement to contain multiple files in their Dask computational graph. + + For a remote ``path`` (:class:`upath.UPath`), this always returns an empty list: Dask backing paths + are resolved as local filesystem paths, so they cannot be compared to object-store locations. + :meth:`spatialdata.SpatialData.write` therefore skips the local "backing files in target" guard + for remote targets; ``overwrite=True`` on a remote URL must be used only when overwriting is safe. 
""" + if isinstance(path, UPath): + return [] if not isinstance(path, Path): - raise TypeError(f"Expected a Path object, got {type(path)}") + raise TypeError(f"Expected a Path or UPath object, got {type(path)}") return [_is_subfolder(parent=path, child=Path(fp)) for fp in get_dask_backing_files(object)] @@ -420,16 +446,44 @@ def _is_subfolder(parent: Path, child: Path) -> bool: def _is_element_self_contained( - element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, element_path: Path + element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, + element_path: Path | UPath, ) -> bool: + """Whether element Dask graphs only reference files under ``element_path`` (local) or N/A (remote).""" + if isinstance(element_path, UPath): + # Backing-file paths are local; cannot relate them to remote keys—assume OK for this heuristic. + return True if isinstance(element, DaskDataFrame): pass # TODO when running test_save_transformations it seems that for the same element this is called multiple times return all(_backed_elements_contained_in_path(path=element_path, object=element)) +def _ensure_async_fs(fs: Any) -> Any: + """Return an async fsspec filesystem for use with zarr's FsspecStore. + + Zarr's FsspecStore expects an async filesystem. If the given fs is synchronous, + it is converted using fsspec's public API (async instance or AsyncFileSystemWrapper) + so that ZarrUserWarning is not raised. 
+ """ + if getattr(fs, "asynchronous", False): + return fs + import fsspec + + if getattr(fs, "async_impl", False): + fs_dict = json.loads(fs.to_json()) + fs_dict["asynchronous"] = True + return fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + return AsyncFileSystemWrapper(fs, asynchronous=True) + + def _resolve_zarr_store( - path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, **kwargs: Any + path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, + *, + read_only: bool = False, + **kwargs: Any, ) -> zarr.storage.StoreLike: """ Normalize different Zarr store inputs into a usable store instance. @@ -445,9 +499,14 @@ def _resolve_zarr_store( path The input representing a Zarr store or group. Can be a filesystem path, remote path, existing store, or Zarr group. + read_only + If ``True``, constructed ``LocalStore`` / ``FsspecStore`` instances are built with + ``read_only=True``. Stores that already exist (when ``path`` is a ``StoreLike`` or + a ``zarr.Group`` whose wrapped store is not reconstructable) are returned as-is; + the caller is responsible for opening them at the right mode. **kwargs Additional keyword arguments forwarded to the underlying store - constructor (e.g. `mode`, `storage_options`). + constructor. Returns ------- @@ -457,37 +516,53 @@ def _resolve_zarr_store( ------ TypeError If the input type is unsupported. - ValueError + ValueError If a `zarr.Group` has an unsupported store type. 
""" - # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): - # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use LocalStore - return LocalStore(path.path) + return LocalStore(path.path, read_only=read_only) if isinstance(path, zarr.Group): + _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) # if the input is a zarr.Group, wrap it with a store if isinstance(path.store, LocalStore): store_path = UPath(path.store.root) / path.path - return LocalStore(store_path.path) + return LocalStore(store_path.path, read_only=read_only) if isinstance(path.store, FsspecStore): # if the store within the zarr.Group is an FSStore, return it # but extend the path of the store with that of the zarr.Group - return FsspecStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) - if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): - # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store - return path.store.store + return FsspecStore( + fs=_ensure_async_fs(path.store.fs), + path=_join_fsspec_store_path(path.store.path, path.path), + read_only=read_only, + **kwargs, + ) + if _cms is not None and isinstance(path.store, _cms): + # Unwrap and apply the same async-fs guards as a direct FsspecStore on the group. 
+ inner = path.store.store + if isinstance(inner, FsspecStore): + return FsspecStore( + fs=_ensure_async_fs(inner.fs), + path=_join_fsspec_store_path(inner.path, path.path), + read_only=read_only, + **kwargs, + ) + if isinstance(inner, LocalStore): + store_path = UPath(inner.root) / path.path + return LocalStore(store_path.path, read_only=read_only) + return inner raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") - if isinstance(path, zarr.storage.StoreLike): - # if the input already a store, wrap it in an FSStore - return FsspecStore(path, **kwargs) if isinstance(path, UPath): - # if input is a remote UPath, map it to an FSStore - return FsspecStore(path.path, fs=path.fs, **kwargs) + # Check before StoreLike to avoid UnionType isinstance. + return FsspecStore(_ensure_async_fs(path.fs), path=path.path, read_only=read_only, **kwargs) + if isinstance(path, zarr.storage.StoreLike): + # Already a concrete store (LocalStore, FsspecStore, MemoryStore, ...). Do not pass it as ``fs=`` to + # FsspecStore -- that only accepts an async fsspec filesystem and raises on stores (e.g. ``async_impl``). 
+ return path raise TypeError(f"Unsupported type: {type(path)}") diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index b47fc418c..9a70b4b1f 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -6,13 +6,16 @@ from dask.dataframe import DataFrame as DaskDataFrame from dask.dataframe import read_parquet from ome_zarr.format import Format +from upath import UPath from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, _write_metadata, overwrite_coordinate_transformations_non_raster, ) from spatialdata._io.format import CurrentPointsFormat, PointsFormats, _parse_version +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group from spatialdata.models import get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -21,21 +24,38 @@ def _read_points( - store: str | Path, + store: str | Path | UPath | ZarrStore, ) -> DaskDataFrame: - """Read points from a zarr store.""" - f = zarr.open(store, mode="r") + """Read points from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None points_format = PointsFormats[version] - store_root = f.store_path.store.root - path = store_root / f.path / "points.parquet" - # cache on remote file needed for parquet reader to work - # TODO: allow reading in the metadata without caching all the data - points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) + parquet_store = zarr_store.child("points.parquet") + # Passing filesystem= to read_parquet makes pyarrow convert dictionary columns into pandas + # categoricals eagerly per partition and marks them known=True with an empty 
category list. + # This happens for ANY pyarrow filesystem (both LocalFileSystem and PyFileSystem(FSSpecHandler(.)) + # return the same broken categorical), so it is a property of the filesystem= handoff itself, + # not of local-vs-remote. Left as is, it would make write_points' cat.as_known() a no-op and + # the next to_parquet(filesystem=.) would fail with a per-partition schema mismatch + # (dictionary vs dictionary). We demote the categoricals back to + # "unknown" right here so that write_points recomputes categories consistently across partitions. + # TODO: allow reading in the metadata without materializing the data. + points = read_parquet( + parquet_store.arrow_path(), + filesystem=parquet_store.arrow_filesystem(), + ) assert isinstance(points, DaskDataFrame) + for column_name in points.columns: + c = points[column_name] + if c.dtype == "category" and c.cat.known: + points[column_name] = c.cat.as_unknown() + if points.index.name == "__null_dask_index__": + points = points.rename_axis(None) transformations = _get_transformations_from_ngff_dict(f.attrs.asdict()["coordinateTransformations"]) _set_transformations(points, transformations) @@ -68,8 +88,7 @@ def write_points( axes = get_axes_names(points) transformations = _get_transformations(points) - store_root = group.store_path.store.root - path = store_root / group.path / "points.parquet" + parquet_store = make_zarr_store_from_group(group).child("points.parquet") # The following code iterates through all columns in the 'points' DataFrame. If the column's datatype is # 'category', it checks whether the categories of this column are known. 
If not, it explicitly converts the @@ -84,7 +103,10 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - points_without_transform.to_parquet(path) + points_without_transform.to_parquet( + parquet_store.arrow_path(), + filesystem=parquet_store.arrow_filesystem(), + ) attrs = element_format.attrs_to_dict(points.attrs) attrs["version"] = element_format.spatialdata_format_version diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index a8b2ab2ce..7eaf04d57 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -16,10 +16,12 @@ from ome_zarr.writer import write_labels as write_labels_ngff from ome_zarr.writer import write_multiscale as write_multiscale_ngff from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff +from upath import UPath from xarray import DataArray, DataTree from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, overwrite_coordinate_transformations_raster, ) from spatialdata._io.format import ( @@ -27,6 +29,7 @@ RasterFormatType, get_ome_zarr_format, ) +from spatialdata._store import ZarrStore, make_zarr_store from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names from spatialdata.models.models import ATTRS_KEY @@ -160,13 +163,14 @@ def _prepare_storage_options( def _read_multiscale( - store: str | Path, raster_type: Literal["image", "labels"], reader_format: Format + store: str | Path | UPath | ZarrStore, raster_type: Literal["image", "labels"], reader_format: Format ) -> DataArray | DataTree: - assert isinstance(store, str | Path) assert raster_type in ["image", "labels"] + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) nodes: list[Node] = [] - image_loc = ZarrLocation(store, fmt=reader_format) + image_loc = 
ZarrLocation(resolved_store, fmt=reader_format) if exists := image_loc.exists(): image_reader = Reader(image_loc)() image_nodes = list(image_reader) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index b07256273..c2128fe19 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -9,9 +9,11 @@ from natsort import natsorted from ome_zarr.format import Format from shapely import from_ragged_array, to_ragged_array +from upath import UPath from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -23,6 +25,7 @@ ShapesFormatV03, _parse_version, ) +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group from spatialdata.models import ShapesModel, get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -31,10 +34,12 @@ def _read_shapes( - store: str | Path, + store: str | Path | UPath | ZarrStore, ) -> GeoDataFrame: - """Read shapes from a zarr store.""" - f = zarr.open(store, mode="r") + """Read shapes from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) + f = zarr.open(resolved_store, mode="r") version = _parse_version(f, expect_attrs_key=True) assert version is not None shape_format = ShapesFormats[version] @@ -54,9 +59,9 @@ def _read_shapes( geometry = from_ragged_array(typ, coords, offsets) geo_df = GeoDataFrame({"geometry": geometry}, index=index) elif isinstance(shape_format, ShapesFormatV02 | ShapesFormatV03): - store_root = f.store_path.store.root - path = Path(store_root) / f.path / "shapes.parquet" - geo_df = read_parquet(path) + parquet_store = zarr_store.child("shapes.parquet") + with parquet_store.arrow_filesystem().open_input_file(parquet_store.arrow_path()) as src: + 
geo_df = read_parquet(src) else: raise ValueError( f"Unsupported shapes format {shape_format} from version {version}. Please update the spatialdata library." @@ -169,13 +174,13 @@ def _write_shapes_v02_v03( """ from spatialdata.models._utils import TRANSFORM_KEY - store_root = group.store_path.store.root - path = store_root / group.path / "shapes.parquet" + parquet_store = make_zarr_store_from_group(group).child("shapes.parquet") # Temporarily remove transformations from attrs to avoid serialization issues transforms = shapes.attrs[TRANSFORM_KEY] del shapes.attrs[TRANSFORM_KEY] - shapes.to_parquet(path, geometry_encoding=geometry_encoding) + with parquet_store.arrow_filesystem().open_output_stream(parquet_store.arrow_path()) as sink: + shapes.to_parquet(sink, geometry_encoding=geometry_encoding) shapes.attrs[TRANSFORM_KEY] = transforms attrs = element_format.attrs_to_dict(shapes.attrs) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 8cd7b8385..a37e62a4e 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -8,7 +8,9 @@ from anndata import read_zarr as read_anndata_zarr from anndata._io.specs import write_elem as write_adata from ome_zarr.format import Format +from upath import UPath +from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import ( CurrentTablesFormat, TablesFormats, @@ -16,13 +18,16 @@ TablesFormatV02, _parse_version, ) +from spatialdata._store import ZarrStore, make_zarr_store from spatialdata.models import TableModel, get_table_keys -def _read_table(store: str | Path) -> AnnData: - table = read_anndata_zarr(str(store)) +def _read_table(store: str | Path | UPath | ZarrStore) -> AnnData: + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) + table = read_anndata_zarr(resolved_store) - f = zarr.open(store, mode="r") + f = zarr.open(resolved_store, mode="r") version = 
_parse_version(f, expect_attrs_key=False) assert version is not None table_format = TablesFormats[version] diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 4c410fab0..456374804 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,13 +1,12 @@ from __future__ import annotations -import os import warnings from collections.abc import Callable from json import JSONDecodeError from pathlib import Path from typing import Any, Literal, cast -import zarr.storage +import zarr from anndata import AnnData from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -27,12 +26,13 @@ from spatialdata._io.io_shapes import _read_shapes from spatialdata._io.io_table import _read_table from spatialdata._logging import logger +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_read_store from spatialdata._types import Raster_T def _read_zarr_group_spatialdata_element( root_group: zarr.Group, - root_store_path: str, + root_store: ZarrStore, sdata_version: Literal["0.1", "0.2"], selector: set[str], read_func: Callable[..., Any], @@ -54,7 +54,7 @@ def _read_zarr_group_spatialdata_element( # skip hidden files like .zgroup or .zmetadata continue elem_group = group[subgroup_name] - elem_group_path = os.path.join(root_store_path, elem_group.path) + elem_store = root_store.child(elem_group.path) with handle_read_errors( on_bad_files, location=f"{group.path}/{subgroup_name}", @@ -70,12 +70,12 @@ def _read_zarr_group_spatialdata_element( if element_type in ["image", "labels"]: reader_format = get_raster_format_for_read(elem_group, sdata_version) element = read_func( - elem_group_path, + elem_store, cast(Literal["image", "labels"], element_type), reader_format, ) elif element_type in ["shapes", "points", "tables"]: - element = read_func(elem_group_path) + element = read_func(elem_store) else: raise ValueError(f"Unknown element type {element_type}") 
element_container[subgroup_name] = element @@ -153,24 +153,7 @@ def read_zarr( ------- A SpatialData object. """ - from spatialdata._io._utils import _resolve_zarr_store - - resolved_store = _resolve_zarr_store(store) - root_group = zarr.open_group(resolved_store, mode="r") - # the following is the SpatialDataContainerFormat version - if "spatialdata_attrs" not in root_group.metadata.attributes: - # backward compatibility for pre-versioned SpatialData zarr stores - sdata_version: Literal["0.1", "0.2"] = "0.1" - else: - sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] - if sdata_version == "0.1": - warnings.warn( - "SpatialData is not stored in the most current format. If you want to use Zarr v3" - ", please write the store to a new location using `sdata.write()`.", - UserWarning, - stacklevel=2, - ) - root_store_path = root_group.store.root + zarr_store = make_zarr_store_from_group(store) if isinstance(store, zarr.Group) else make_zarr_store(store) images: dict[str, Raster_T] = {} labels: dict[str, Raster_T] = {} @@ -178,50 +161,66 @@ def read_zarr( shapes: dict[str, GeoDataFrame] = {} tables: dict[str, AnnData] = {} - selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or []) - logger.debug(f"Reading selection {selector}") - - # we could make this more readable. One can get lost when looking at this dict and iteration over the items - group_readers: dict[ - Literal["images", "labels", "shapes", "points", "tables"], - tuple[ - Callable[..., Any], - Literal["image", "labels", "shapes", "points", "tables"], - dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData], - ], - ] = { - # ome-zarr-py needs a kwargs that has "image" has key. 
So here we have "image" and not "images" - "images": (_read_multiscale, "image", images), - "labels": (_read_multiscale, "labels", labels), - "points": (_read_points, "points", points), - "shapes": (_read_shapes, "shapes", shapes), - "tables": (_read_table, "tables", tables), - } - for group_name, ( - read_func, - element_type, - element_container, - ) in group_readers.items(): - _read_zarr_group_spatialdata_element( - root_group=root_group, - root_store_path=root_store_path, - sdata_version=sdata_version, - selector=selector, - read_func=read_func, - group_name=group_name, - element_type=element_type, - element_container=element_container, - on_bad_files=on_bad_files, - ) - - # read attrs metadata - attrs = root_group.attrs.asdict() - if "spatialdata_attrs" in attrs: - # when refactoring the read_zarr function into reading componenets separately (and according to the version), - # we can move the code below (.pop()) into attrs_from_dict() - attrs.pop("spatialdata_attrs") - else: - attrs = None + with open_read_store(zarr_store) as resolved_store: + root_group = zarr.open_group(resolved_store, mode="r") + # the following is the SpatialDataContainerFormat version + if "spatialdata_attrs" not in root_group.metadata.attributes: + # backward compatibility for pre-versioned SpatialData zarr stores + sdata_version: Literal["0.1", "0.2"] = "0.1" + else: + sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] + if sdata_version == "0.1": + warnings.warn( + "SpatialData is not stored in the most current format. If you want to use Zarr v3" + ", please write the store to a new location using `sdata.write()`.", + UserWarning, + stacklevel=2, + ) + + selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or []) + logger.debug(f"Reading selection {selector}") + + # we could make this more readable. 
One can get lost when looking at this dict and iteration over the items + group_readers: dict[ + Literal["images", "labels", "shapes", "points", "tables"], + tuple[ + Callable[..., Any], + Literal["image", "labels", "shapes", "points", "tables"], + dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData], + ], + ] = { + # ome-zarr-py needs a kwargs that has "image" has key. So here we have "image" and not "images" + "images": (_read_multiscale, "image", images), + "labels": (_read_multiscale, "labels", labels), + "points": (_read_points, "points", points), + "shapes": (_read_shapes, "shapes", shapes), + "tables": (_read_table, "tables", tables), + } + for group_name, ( + read_func, + element_type, + element_container, + ) in group_readers.items(): + _read_zarr_group_spatialdata_element( + root_group=root_group, + root_store=zarr_store, + sdata_version=sdata_version, + selector=selector, + read_func=read_func, + group_name=group_name, + element_type=element_type, + element_container=element_container, + on_bad_files=on_bad_files, + ) + + # read attrs metadata + attrs = root_group.attrs.asdict() + if "spatialdata_attrs" in attrs: + # when refactoring the read_zarr function into reading componenets separately (and according to the version), + # we can move the code below (.pop()) into attrs_from_dict() + attrs.pop("spatialdata_attrs") + else: + attrs = None sdata = SpatialData( images=images, @@ -231,12 +230,12 @@ def read_zarr( tables=tables, attrs=attrs, ) - sdata.path = resolved_store.root + sdata._set_zarr_store(zarr_store) return sdata def _get_groups_for_element( - zarr_path: Path, element_type: str, element_name: str, use_consolidated: bool = True + zarr_path: Path | UPath, element_type: str, element_name: str, use_consolidated: bool = True ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. 
@@ -265,8 +264,8 @@ def _get_groups_for_element( ------- The Zarr groups for the root, element_type and element for a specific element. """ - if not isinstance(zarr_path, Path): - raise ValueError("zarr_path should be a Path object") + if not isinstance(zarr_path, (Path, UPath)): + raise ValueError("zarr_path should be a Path or UPath object") if element_type not in [ "images", @@ -289,7 +288,7 @@ def _get_groups_for_element( return root_group, element_type_group, element_name_group -def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: str) -> bool: +def _group_for_element_exists(zarr_path: Path | UPath, element_type: str, element_name: str) -> bool: """ Check if the group for an element exists. @@ -319,9 +318,13 @@ def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: return exists -def _write_consolidated_metadata(path: Path | str | None) -> None: +def _write_consolidated_metadata(path: Path | UPath | str | None) -> None: if path is not None: - f = zarr.open_group(path, mode="r+", use_consolidated=False) + if isinstance(path, UPath): + store = _resolve_zarr_store(path) + f = zarr.open_group(store, mode="r+", use_consolidated=False) + else: + f = zarr.open_group(path, mode="r+", use_consolidated=False) # .parquet files are not recognized as proper zarr and thus throw a warning. This does not affect SpatialData. # and therefore we silence it for our users as they can't do anything about this. 
# TODO check with remote PR whether we can prevent this warning at least for points data and whether with zarrv3 diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index b231e71ce..f76606bf4 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -6,12 +6,14 @@ from typing import Any, TypeAlias import pyarrow.fs as pafs +import zarr from upath import UPath +from zarr.storage import FsspecStore, LocalStore PathLike: TypeAlias = Path | UPath -def normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: +def _normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: if isinstance(path, str): return UPath(path, **(storage_options or {})) if "://" in path else Path(path) if isinstance(path, (Path, UPath)): @@ -24,7 +26,7 @@ class ZarrStore: path: PathLike def with_path(self, path: str | PathLike) -> ZarrStore: - return replace(self, path=normalize_path(path)) + return replace(self, path=_normalize_path(path)) def child(self, path: str | PathLike) -> ZarrStore: return self.with_path(self.path / path) @@ -43,7 +45,31 @@ def make_zarr_store( *, storage_options: dict[str, Any] | None = None, ) -> ZarrStore: - return ZarrStore(path=normalize_path(path, storage_options)) + return ZarrStore(path=_normalize_path(path, storage_options)) + + +def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: + from spatialdata._io._utils import ( + _join_fsspec_store_path, + _unwrap_fsspec_sync_fs, + ) + + store = group.store + _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) + if _cms is not None and isinstance(store, _cms): + store = store.store + + if isinstance(store, LocalStore): + return make_zarr_store(Path(store.root) / group.path) + if isinstance(store, FsspecStore): + protocol = getattr(store.fs, "protocol", None) + if isinstance(protocol, (list, tuple)): + protocol = protocol[0] if protocol else "file" + elif protocol is None: + protocol = "file" 
+ path = _join_fsspec_store_path(store.path, group.path) + return make_zarr_store(UPath(f"{protocol}://{path}", fs=_unwrap_fsspec_sync_fs(store.fs))) + raise ValueError(f"Unsupported store type or zarr.Group: {type(group.store)}") @contextmanager diff --git a/tests/io/test_store.py b/tests/io/test_store.py index a50c8423f..d9ef877e6 100644 --- a/tests/io/test_store.py +++ b/tests/io/test_store.py @@ -10,6 +10,7 @@ from spatialdata._io._utils import _resolve_zarr_store from spatialdata._store import ( make_zarr_store, + make_zarr_store_from_group, open_read_store, open_write_store, ) @@ -43,6 +44,17 @@ def test_open_read_and_write_store_roundtrip(tmp_path: Path) -> None: assert group.attrs["answer"] == 42 +def test_make_zarr_store_from_local_group(tmp_path: Path) -> None: + zarr_store = make_zarr_store(tmp_path / "store.zarr") + + with open_write_store(zarr_store) as store: + root = zarr.create_group(store=store, overwrite=True) + group = root.require_group("images").require_group("image") + + child_store = make_zarr_store_from_group(group) + assert child_store.path == tmp_path / "store.zarr" / "images" / "image" + + def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: """StoreLike inputs must not be wrapped as FsspecStore(fs=store) -- that is only for async filesystems.""" mem = MemoryStore() @@ -66,3 +78,21 @@ def test_resolve_zarr_store_forwards_read_only_remote() -> None: store = _resolve_zarr_store(upath, read_only=True) assert isinstance(store, FsspecStore) assert store.read_only is True + + +def test_make_zarr_store_from_remote_group() -> None: + """Remote zarr.Group inputs keep a usable UPath and reopen through the same protocol.""" + import fsspec + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + fs = fsspec.filesystem("memory") + async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) + base = FsspecStore(async_fs, path="/") + root = zarr.open_group(store=base, mode="a") + group = 
root.require_group("points").require_group("points") + + zarr_store = make_zarr_store_from_group(group) + assert getattr(zarr_store.path.fs, "protocol", None) == "memory" + + with open_read_store(zarr_store) as store: + assert isinstance(store, FsspecStore) From 318c0bdba9d390ac81f0395a72b33d19e2986076 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 14:26:16 +0200 Subject: [PATCH 46/51] too long line --- src/spatialdata/_io/io_zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 456374804..534bd7d91 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -216,7 +216,7 @@ def read_zarr( # read attrs metadata attrs = root_group.attrs.asdict() if "spatialdata_attrs" in attrs: - # when refactoring the read_zarr function into reading componenets separately (and according to the version), + # when refactoring the read_zarr function into reading componenets separately (and according to the version) # we can move the code below (.pop()) into attrs_from_dict() attrs.pop("spatialdata_attrs") else: From b14b2a823178550a61512f4d8f98d0ba961cf28c Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 14:35:37 +0200 Subject: [PATCH 47/51] if _cms is not None and isinstance(path.store, _cms): is always none since we use v3 --- src/spatialdata/_io/_utils.py | 44 ++++++++--------------------------- src/spatialdata/_store.py | 30 ++++++++++++++---------- 2 files changed, 28 insertions(+), 46 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index ec949f799..29efbe66a 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -12,7 +12,7 @@ from contextlib import contextmanager from enum import Enum from functools import singledispatch -from pathlib import Path +from pathlib import Path, PurePosixPath from typing import Any, Literal import zarr @@ -39,20 +39,10 @@ from 
spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes -def _join_fsspec_store_path(store_path: str, relative_path: str) -> str: - """Combine FsspecStore root with a zarr group path using POSIX ``/`` (fsspec keys; safe on Windows).""" - base = str(store_path).replace("\\", "/").rstrip("/") - rel = str(relative_path).replace("\\", "/").lstrip("/") - if not base: - return f"/{rel}" if rel else "/" - return f"{base}/{rel}" if rel else base - - -def _unwrap_fsspec_sync_fs(fs: Any) -> Any: - inner = getattr(fs, "sync_fs", None) - if inner is not None and inner is not fs: - return _unwrap_fsspec_sync_fs(inner) - return fs +def join_fsspec_store_path(store_path: str, relative_path: str) -> str: + """Append a relative zarr-group path to an FsspecStore root, yielding a fsspec key.""" + rel = relative_path.lstrip("/") + return str(PurePosixPath(store_path) / rel) if rel else store_path def _get_transformations_from_ngff_dict( @@ -527,34 +517,20 @@ def _resolve_zarr_store( return LocalStore(path.path, read_only=read_only) if isinstance(path, zarr.Group): - _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) - # if the input is a zarr.Group, wrap it with a store + # Re-wrap the group's store at the group's subpath. Note: zarr v3 no longer ships + # ``ConsolidatedMetadataStore`` (v2 wrapped the backend in a store; v3 surfaces + # consolidated metadata as a field on ``GroupMetadata`` instead), so we only need to + # handle the two concrete backends below. 
if isinstance(path.store, LocalStore): store_path = UPath(path.store.root) / path.path return LocalStore(store_path.path, read_only=read_only) if isinstance(path.store, FsspecStore): - # if the store within the zarr.Group is an FSStore, return it - # but extend the path of the store with that of the zarr.Group return FsspecStore( fs=_ensure_async_fs(path.store.fs), - path=_join_fsspec_store_path(path.store.path, path.path), + path=join_fsspec_store_path(path.store.path, path.path), read_only=read_only, **kwargs, ) - if _cms is not None and isinstance(path.store, _cms): - # Unwrap and apply the same async-fs guards as a direct FsspecStore on the group. - inner = path.store.store - if isinstance(inner, FsspecStore): - return FsspecStore( - fs=_ensure_async_fs(inner.fs), - path=_join_fsspec_store_path(inner.path, path.path), - read_only=read_only, - **kwargs, - ) - if isinstance(inner, LocalStore): - store_path = UPath(inner.root) / path.path - return LocalStore(store_path.path, read_only=read_only) - return inner raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") if isinstance(path, UPath): # Check before StoreLike to avoid UnionType isinstance. 
diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index f76606bf4..0d7e1c551 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -13,7 +13,7 @@ PathLike: TypeAlias = Path | UPath -def _normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: +def normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike: if isinstance(path, str): return UPath(path, **(storage_options or {})) if "://" in path else Path(path) if isinstance(path, (Path, UPath)): @@ -26,7 +26,7 @@ class ZarrStore: path: PathLike def with_path(self, path: str | PathLike) -> ZarrStore: - return replace(self, path=_normalize_path(path)) + return replace(self, path=normalize_path(path)) def child(self, path: str | PathLike) -> ZarrStore: return self.with_path(self.path / path) @@ -45,19 +45,16 @@ def make_zarr_store( *, storage_options: dict[str, Any] | None = None, ) -> ZarrStore: - return ZarrStore(path=_normalize_path(path, storage_options)) + return ZarrStore(path=normalize_path(path, storage_options)) def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: - from spatialdata._io._utils import ( - _join_fsspec_store_path, - _unwrap_fsspec_sync_fs, - ) + from spatialdata._io._utils import join_fsspec_store_path + # zarr v3 does not wrap stores with a ``ConsolidatedMetadataStore`` (that was a v2-only + # concept); consolidated metadata is now a field on ``GroupMetadata``. So the group's + # ``.store`` is already the concrete backend store -- no unwrapping required. 
store = group.store - _cms = getattr(zarr.storage, "ConsolidatedMetadataStore", None) - if _cms is not None and isinstance(store, _cms): - store = store.store if isinstance(store, LocalStore): return make_zarr_store(Path(store.root) / group.path) @@ -67,8 +64,17 @@ def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: protocol = protocol[0] if protocol else "file" elif protocol is None: protocol = "file" - path = _join_fsspec_store_path(store.path, group.path) - return make_zarr_store(UPath(f"{protocol}://{path}", fs=_unwrap_fsspec_sync_fs(store.fs))) + # zarr's FsspecStore wraps sync filesystems via ``AsyncFileSystemWrapper`` (exposed on + # ``.sync_fs``). Walk that chain to recover the original sync fs so we can hand it back + # to UPath, which expects a sync filesystem. + fs = store.fs + while True: + inner = getattr(fs, "sync_fs", None) + if inner is None or inner is fs: + break + fs = inner + path = join_fsspec_store_path(store.path, group.path) + return make_zarr_store(UPath(f"{protocol}://{path}", fs=fs)) raise ValueError(f"Unsupported store type or zarr.Group: {type(group.store)}") From 958dd1ed5971b5290957a00a7e8211ad001dbed5 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 14:38:09 +0200 Subject: [PATCH 48/51] add clear comments --- src/spatialdata/_store.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index 0d7e1c551..ab307e716 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -64,9 +64,13 @@ def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: protocol = protocol[0] if protocol else "file" elif protocol is None: protocol = "file" - # zarr's FsspecStore wraps sync filesystems via ``AsyncFileSystemWrapper`` (exposed on - # ``.sync_fs``). Walk that chain to recover the original sync fs so we can hand it back - # to UPath, which expects a sync filesystem. 
+ # Recover the original SYNC filesystem from ``store.fs``. zarr v3's FsspecStore requires + # an async fs, so when callers pass a sync fs (e.g. ``MemoryFileSystem``) we wrap it via + # ``AsyncFileSystemWrapper``, which preserves the original on ``.sync_fs``. We must + # unwrap here because the resulting UPath flows into ``ZarrStore.arrow_filesystem()``, + # i.e. ``pafs.FSSpecHandler(fs)`` -- and pyarrow's handler is strictly sync. Feeding it + # an async-wrapped fs raises ``RuntimeError: Loop is not running`` at read/write time. + # The ``while`` loop tolerates (hypothetical) multi-layer wrapping across zarr versions. fs = store.fs while True: inner = getattr(fs, "sync_fs", None) From d8ad5bb692bf2f1bebbc3e5f66a4c1f11b811627 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 17:11:54 +0200 Subject: [PATCH 49/51] fix: restore get_dask_backing_files for post-unpinning dask task-spec Dask's task-graph shape changed in PR #1006 ("unpinning dask", commit 53b9438a): parquet reads are now Task objects wrapping ``_fragment_to_table`` with a ``FragmentWrapper`` in kwargs or inside fused-expression subgraphs, instead of the legacy dicts with a ``piece`` tuple. This broke ``_search_for_backing_files_recursively`` in two ways: 1. ``"piece" in v.args[0]`` raised ``TypeError: argument of type 'Task' is not iterable`` before the fallback branch ever ran -- affecting every test that writes+reads points (``test_points``, ``test_roundtrip[points]``, ``test_io_and_lazy_loading_points``). 2. Fused expressions use key prefix ``readparquetpyarrowfs-fused-values-*`` (not ``read_parquet-*``) with the FragmentWrapper nested inside lists of tuples inside a subgraph dict, so even after fixing (1) the parquet file was never discovered -- affecting ``test_self_contained``. 
Replace the ad-hoc ``v.args[0]["piece"]`` / ``v.args[0].values()`` logic with a uniformly recursive helper ``_extract_parquet_paths_from_task`` that walks Mappings, Sequences, ``.args`` and ``.kwargs``, detecting FragmentWrapper via the ``.fragment.path`` attribute chain (no private dask_expr import) and still validating the legacy ``piece`` tuple shape for backward compatibility. Broaden the outer key-prefix match to any key containing "parquet"; ``.endswith(".parquet")`` inside the extractor keeps false positives out. Validated: 130 passed / 1 failed on tests/io/test_readwrite.py + test_store.py + test_store_abstractions.py (up from 113 / 18 on baseline; the one remaining failure is the intentionally-exposed consolidated- metadata-on-read gap, unrelated to this change). Made-with: Cursor --- src/spatialdata/_io/_utils.py | 98 ++++++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 25 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 29efbe66a..683aa47c5 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -324,6 +324,69 @@ def _find_piece_dict(obj: dict[str, tuple[str | None]] | Task) -> dict[str, tupl return None +def _extract_parquet_paths_from_task(obj: Any) -> list[str]: + """Recursively extract parquet file paths from a dask ``read_parquet`` task. + + Dask's task-graph shape changed between the version pinned before PR #1006 ("unpinning + dask", commit 53b9438a) and the current one; we tolerate both: + + - Legacy shape: a dict ``{"piece": (parquet_file, None, None)}`` somewhere in the args + (possibly wrapped in other dicts for mixed points+images element graphs). The trailing + elements of the ``piece`` tuple encode row-group / filter constraints; we only support + unfiltered reads (hence the validation on ``check0`` / ``check1``). 
+ - Current shape: a ``dask.dataframe.dask_expr.io.parquet.FragmentWrapper`` whose + ``.fragment.path`` is the parquet file (from ``dask_expr.io.parquet.ReadParquetPyarrowFS``). + The wrapper may live in Task ``kwargs["fragment_wrapper"]`` for simple reads, but in fused + expressions (``readparquetpyarrowfs-fused-*``) it is nested inside lists and tuples + inside a subgraph dict, so we walk every container uniformly rather than targeting named + kwargs. + + ``FragmentWrapper`` is detected via the ``.fragment.path`` attribute chain instead of an + isinstance check to avoid importing private dask_expr internals; the ``endswith(".parquet")`` + guard keeps false positives from random objects out of the result. + """ + found: list[str] = [] + + frag = getattr(obj, "fragment", None) + if frag is not None: + path = getattr(frag, "path", None) + if isinstance(path, str) and path.endswith(".parquet"): + found.append(path) + + if isinstance(obj, Mapping): + if "piece" in obj: + piece = obj["piece"] + if isinstance(piece, tuple) and len(piece) >= 1 and isinstance(piece[0], str): + parquet_file = piece[0] + check0 = piece[1] if len(piece) > 1 else None + check1 = piece[2] if len(piece) > 2 else None + if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: + raise ValueError( + f"Unable to parse the parquet file from the dask task {obj!r}. Please report this bug." 
+ ) + found.append(parquet_file) + for v in obj.values(): + found.extend(_extract_parquet_paths_from_task(v)) + return found + + if isinstance(obj, (list, tuple)): + for item in obj: + found.extend(_extract_parquet_paths_from_task(item)) + return found + + kwargs = getattr(obj, "kwargs", None) + if isinstance(kwargs, Mapping): + for v in kwargs.values(): + found.extend(_extract_parquet_paths_from_task(v)) + + args = getattr(obj, "args", None) + if isinstance(args, (list, tuple)): + for a in args: + found.extend(_extract_parquet_paths_from_task(a)) + + return found + + def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> None: # see the types allowed for the dask graph here: https://docs.dask.org/en/stable/spec.html @@ -349,32 +412,17 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No # LocalStore.store does not have an attribute path, but we keep it like this for backward compat. path = getattr(v.store, "path", None) if getattr(v.store, "path", None) else v.store.root files.append(str(UPath(path).resolve())) - elif name.startswith("read-parquet") or name.startswith("read_parquet"): - # Here v is a read_parquet task with arguments and the only value is a dictionary. - if "piece" in v.args[0]: - # https://github.com/dask/dask/blob/ff2488aec44d641696e0b7aa41ed9e995c710705/dask/dataframe/io/parquet/core.py#L870 - parquet_file, check0, check1 = v.args[0]["piece"] - if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: - raise ValueError( - f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " - f"report this bug." 
- ) + elif "parquet" in name.lower(): + # Matches every dask task-key that wraps a parquet read across versions: + # - legacy ``read-parquet-`` / ``read_parquet-`` (pre PR #1006), + # - current ``read_parquet-`` plus fused-expression forms such as + # ``readparquetpyarrowfs-fused-values-`` produced by + # ``dask_expr.io.parquet.ReadParquetPyarrowFS`` when a parquet column is + # combined with other arrays (see ``test_self_contained``). + # Any false-positive key that matches but carries no parquet payload is filtered + # inside ``_extract_parquet_paths_from_task`` (paths must ``endswith(".parquet")``). + for parquet_file in _extract_parquet_paths_from_task(v): files.append(os.path.realpath(parquet_file)) - else: - # This occurs when for example points and images are mixed, the main task still starts with - # read_parquet, but the execution happens through a subgraph which we iterate over to get the - # actual read_parquet task. - for task in v.args[0].values(): - # Recursively go through tasks, this is required because differences between dask versions. - piece_dict = _find_piece_dict(task) - if isinstance(piece_dict, dict) and "piece" in piece_dict: - parquet_file, check0, check1 = piece_dict["piece"] # type: ignore[misc] - if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: - raise ValueError( - f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " - f"report this bug." 
- ) - files.append(os.path.realpath(parquet_file)) def _backed_elements_contained_in_path( From 909b0f3e9b9ec3e202ab36a81d9a5338c39a5ccf Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 17:18:34 +0200 Subject: [PATCH 50/51] add github links --- src/spatialdata/_io/_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 683aa47c5..497f516d9 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -327,8 +327,10 @@ def _find_piece_dict(obj: dict[str, tuple[str | None]] | Task) -> dict[str, tupl def _extract_parquet_paths_from_task(obj: Any) -> list[str]: """Recursively extract parquet file paths from a dask ``read_parquet`` task. - Dask's task-graph shape changed between the version pinned before PR #1006 ("unpinning - dask", commit 53b9438a) and the current one; we tolerate both: + Dask's task-graph shape changed between the version pinned before scverse/spatialdata + PR #1006 (https://github.com/scverse/spatialdata/pull/1006 "unpinning dask", commit + 53b9438a https://github.com/scverse/spatialdata/commit/53b9438a328c5fc2a451d2c8afab439b945ba2b8) + and the current one; we tolerate both. - Legacy shape: a dict ``{"piece": (parquet_file, None, None)}`` somewhere in the args (possibly wrapped in other dicts for mixed points+images element graphs). 
The trailing @@ -414,7 +416,8 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No files.append(str(UPath(path).resolve())) elif "parquet" in name.lower(): # Matches every dask task-key that wraps a parquet read across versions: - # - legacy ``read-parquet-`` / ``read_parquet-`` (pre PR #1006), + # - legacy ``read-parquet-`` / ``read_parquet-`` (pre scverse/ + # spatialdata PR #1006, https://github.com/scverse/spatialdata/pull/1006), # - current ``read_parquet-`` plus fused-expression forms such as # ``readparquetpyarrowfs-fused-values-`` produced by # ``dask_expr.io.parquet.ReadParquetPyarrowFS`` when a parquet column is From 12f2489eec27f48754397834cbb95f359be9c6ae Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Fri, 17 Apr 2026 17:46:47 +0200 Subject: [PATCH 51/51] ome-zarr needs to be consolidation aware --- src/spatialdata/_io/_utils.py | 19 ++++++++++- src/spatialdata/_io/io_points.py | 4 +-- src/spatialdata/_io/io_shapes.py | 4 +-- src/spatialdata/_io/io_table.py | 4 +-- src/spatialdata/_io/io_zarr.py | 30 +++++++++++++++-- src/spatialdata/_store.py | 31 +++++++++++++++++ tests/io/test_store_abstractions.py | 52 +++++++++++++++++------------ 7 files changed, 113 insertions(+), 31 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 497f516d9..57b2ec642 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -356,6 +356,11 @@ def _extract_parquet_paths_from_task(obj: Any) -> list[str]: found.append(path) if isinstance(obj, Mapping): + # TODO(legacy-dask): the ``"piece"`` branch targets the pre-PR-#1006 dask graph shape + # (``dask/dataframe/io/parquet/core.py`` produced ``{"piece": (file, rg, filters)}``). The + # current dask pin (``dask>=2025.12.0``) no longer emits this shape at runtime; the branch + # is kept only as a safety net for users forcing an older dask via pip. 
Remove once the + # lower pin is bumped past the PR-#1006 cut-off and CI covers only the new shape. if "piece" in obj: piece = obj["piece"] if isinstance(piece, tuple) and len(piece) >= 1 and isinstance(piece[0], str): @@ -376,6 +381,12 @@ def _extract_parquet_paths_from_task(obj: Any) -> list[str]: found.extend(_extract_parquet_paths_from_task(item)) return found + # TODO(dask-task-api): the ``kwargs`` / ``args`` getattr probes here rely on the Task wrapper + # object introduced alongside PR #1006. The attribute contract is not documented as public + # (``dask.dataframe.dask_expr``), so we access it defensively via getattr and traverse every + # container uniformly. If dask stabilises a public accessor (e.g. ``task.iter_leaves()`` or an + # expr-level ``file_paths`` property) or if ``FragmentWrapper`` becomes importable from a + # stable namespace, replace the attribute-chain walk with a typed call and drop the getattrs. kwargs = getattr(obj, "kwargs", None) if isinstance(kwargs, Mapping): for v in kwargs.values(): @@ -411,7 +422,13 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No name = k if name is not None: if name.startswith("original-from-zarr"): - # LocalStore.store does not have an attribute path, but we keep it like this for backward compat. + # TODO(zarr-v3-store-path): the ``getattr(..., "path", None)`` fallback dates + # back to zarr v2, where ``DirectoryStore`` exposed ``.path`` and the v3 + # ``LocalStore`` exposes ``.root`` instead. With the current pin + # (``zarr>=3.0.0``) the getattr branch is never taken for local backends -- it + # only covers exotic third-party stores that still mimic the v2 attribute. + # Once we are confident no such shim stores are in use, collapse this to just + # ``v.store.root`` and drop the getattr probe. 
path = getattr(v.store, "path", None) if getattr(v.store, "path", None) else v.store.root files.append(str(UPath(path).resolve())) elif "parquet" in name.lower(): diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index 9a70b4b1f..32ef88478 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -15,7 +15,7 @@ overwrite_coordinate_transformations_non_raster, ) from spatialdata._io.format import CurrentPointsFormat, PointsFormats, _parse_version -from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_zarr_for_read from spatialdata.models import get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -29,7 +29,7 @@ def _read_points( """Read points from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) resolved_store = _resolve_zarr_store(zarr_store.path) - f = zarr.open(resolved_store, mode="r") + f = open_zarr_for_read(resolved_store, as_group=False) version = _parse_version(f, expect_attrs_key=True) assert version is not None diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index c2128fe19..290360718 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -25,7 +25,7 @@ ShapesFormatV03, _parse_version, ) -from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_zarr_for_read from spatialdata.models import ShapesModel, get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -39,7 +39,7 @@ def _read_shapes( """Read shapes from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" zarr_store = store if isinstance(store, 
ZarrStore) else make_zarr_store(store) resolved_store = _resolve_zarr_store(zarr_store.path) - f = zarr.open(resolved_store, mode="r") + f = open_zarr_for_read(resolved_store, as_group=False) version = _parse_version(f, expect_attrs_key=True) assert version is not None shape_format = ShapesFormats[version] diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index a37e62a4e..0e8bef657 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -18,7 +18,7 @@ TablesFormatV02, _parse_version, ) -from spatialdata._store import ZarrStore, make_zarr_store +from spatialdata._store import ZarrStore, make_zarr_store, open_zarr_for_read from spatialdata.models import TableModel, get_table_keys @@ -27,7 +27,7 @@ def _read_table(store: str | Path | UPath | ZarrStore) -> AnnData: resolved_store = _resolve_zarr_store(zarr_store.path) table = read_anndata_zarr(resolved_store) - f = zarr.open(resolved_store, mode="r") + f = open_zarr_for_read(resolved_store, as_group=False) version = _parse_version(f, expect_attrs_key=False) assert version is not None table_format = TablesFormats[version] diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 534bd7d91..336ba1d93 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -26,7 +26,13 @@ from spatialdata._io.io_shapes import _read_shapes from spatialdata._io.io_table import _read_table from spatialdata._logging import logger -from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_read_store +from spatialdata._store import ( + ZarrStore, + make_zarr_store, + make_zarr_store_from_group, + open_read_store, + open_zarr_for_read, +) from spatialdata._types import Raster_T @@ -162,7 +168,10 @@ def read_zarr( tables: dict[str, AnnData] = {} with open_read_store(zarr_store) as resolved_store: - root_group = zarr.open_group(resolved_store, mode="r") + # Use the consolidated + zarr-v3-pinned fast 
path. See ``open_zarr_for_read`` for why + # pinning ``zarr_format=3`` matters over remote backends (avoids five small v2-metadata + # probes per open) and how the fallback keeps legacy / non-consolidated stores working. + root_group = open_zarr_for_read(resolved_store, as_group=True) # the following is the SpatialDataContainerFormat version if "spatialdata_attrs" not in root_group.metadata.attributes: # backward compatibility for pre-versioned SpatialData zarr stores @@ -331,5 +340,22 @@ def _write_consolidated_metadata(path: Path | UPath | str | None) -> None: # that pr would still work. with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=zarr.errors.ZarrUserWarning) + # Consolidate at the root, then at every element group + # (``/``). The per-element consolidation is what lets our readers + # -- which re-open each element via a child-rooted ``FsspecStore`` -- actually + # consume consolidated metadata at element open time. A root-only consolidation + # only benefits the first ``zarr.open_group`` call in ``read_zarr``; every + # subsequent ``zarr.open(elem_store, ...)`` rooted at the element path would + # still walk its own subtree one ``zarr.json`` at a time because the + # consolidated-metadata field lives on the *root* ``zarr.json``, not the + # child's. Consolidating per-element writes the field on every element's own + # ``zarr.json`` so a child-rooted open is a single GET regardless of depth. 
zarr.consolidate_metadata(f.store) + for group_name in ("images", "labels", "points", "shapes", "tables"): + if group_name not in f: + continue + for element_name in f[group_name]: + if element_name.startswith("."): + continue + zarr.consolidate_metadata(f.store, path=f"{group_name}/{element_name}") f.store.close() diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py index ab307e716..21f27273f 100644 --- a/src/spatialdata/_store.py +++ b/src/spatialdata/_store.py @@ -71,6 +71,12 @@ def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: # i.e. ``pafs.FSSpecHandler(fs)`` -- and pyarrow's handler is strictly sync. Feeding it # an async-wrapped fs raises ``RuntimeError: Loop is not running`` at read/write time. # The ``while`` loop tolerates (hypothetical) multi-layer wrapping across zarr versions. + # + # TODO(async-pyarrow-fs): drop this unwrap once either (a) pyarrow's FSSpecHandler learns + # to run an async fs under its own event loop, or (b) zarr exposes the original sync fs + # on FsspecStore without the AsyncFileSystemWrapper indirection (tracked at + # https://github.com/zarr-developers/zarr-python/issues/2073). At that point ``fs`` can be + # assigned directly from ``store.fs`` and the getattr probe can go. fs = store.fs while True: inner = getattr(fs, "sync_fs", None) @@ -100,6 +106,31 @@ def open_read_store(store: ZarrStore) -> Any: resolved_store.close() +def open_zarr_for_read(store: Any, *, as_group: bool = True) -> Any: + """Open a zarr group or node for reading with remote-friendly defaults. + + Prefers the fast path: pinned ``zarr_format=3`` (we only ever write v3 stores, so skipping + v2-metadata auto-probes saves up to five small GETs per open on remote backends) and + ``use_consolidated=True`` (requires the root / element ``zarr.json`` to carry the + ``consolidated_metadata`` field produced by ``_write_consolidated_metadata``). 
Falls back + to ``zarr.open*`` with no format/consolidation hints for legacy or third-party stores that + predate either convention. + + Parameters + ---------- + store + A ``zarr.storage.StoreLike`` -- typically the value yielded by ``open_read_store``. + as_group + If ``True`` (default) use ``zarr.open_group``; if ``False`` use ``zarr.open`` which + returns either a ``Group`` or an ``Array`` based on the metadata at the store root. + """ + fn = zarr.open_group if as_group else zarr.open + try: + return fn(store, mode="r", zarr_format=3, use_consolidated=True) + except (ValueError, FileNotFoundError): + return fn(store, mode="r") + + @contextmanager def open_write_store(store: ZarrStore) -> Any: """Open ``store`` as a writable backend store (``read_only=False``).""" diff --git a/tests/io/test_store_abstractions.py b/tests/io/test_store_abstractions.py index 996c75b17..315c22e4c 100644 --- a/tests/io/test_store_abstractions.py +++ b/tests/io/test_store_abstractions.py @@ -177,14 +177,21 @@ def test_roundtrip_full_sdata(self, full_sdata: SpatialData) -> None: class TestConsolidatedMetadataOnRead: - """Writing produces a consolidated-metadata artifact; the read path does not consume it yet. - - The second test in this class is intentionally left to fail (not xfail-ed): it pins - the invariant we want -- that reading a remote store uses the consolidated metadata - artifact so small-GET traffic stays bounded -- and leaves the implementation detail - (threading ``use_consolidated=True`` through ``read_zarr`` / ``open_read_store``) - open for reviewer discussion. Please comment on the right place to wire it in; we - would rather the gap be visible than hidden behind ``@pytest.mark.xfail``. + """Writing produces a consolidated-metadata artifact; the read path consumes it. 
+ + The invariant pinned here is: for an sdata built only of elements read by our own + code (shapes / points / tables), a single ``SpatialData.read`` over a remote-backed + ``UPath`` must issue very few metadata GETs. That is what consolidated metadata buys + us: one blob at the root (and one per element group, written by + ``_write_consolidated_metadata``) replaces an O(nodes) walk of small ``zarr.json`` + / ``.zattrs`` / ``.zarray`` / ``.zgroup`` files. + + Element types backed by ``ome-zarr-py`` (images / labels) still issue many small + GETs through ``ome_zarr``'s own ZarrLocation reader, which does a v2-style + ``.zattrs`` / ``.zmetadata`` walk regardless of the v3 consolidation we write at + the root. That is an upstream concern (``ome-zarr-py`` must learn to consume + ``consolidated_metadata`` on ``zarr.json``) and is intentionally *not* covered + here; it would wrongly make this test dependent on an external package's fix. """ def test_write_produces_root_metadata_on_memory_upath(self, images: SpatialData) -> None: @@ -198,19 +205,20 @@ def test_write_produces_root_metadata_on_memory_upath(self, images: SpatialData) root_keys = [p.rsplit("/", 1)[-1] for p in fs.find(upath.path)] assert "zarr.json" in root_keys or ".zmetadata" in root_keys, root_keys - def test_read_zarr_opens_via_consolidated_metadata(self, images: SpatialData) -> None: - # Left to fail intentionally: read_zarr currently opens the root group with - # zarr.open_group(store, mode="r") without use_consolidated=True, so a written - # consolidated-metadata artifact is ignored on read. The fix site (wiring - # use_consolidated through open_read_store / read_zarr) is left open for review - # discussion rather than hidden behind @pytest.mark.xfail. 
+ def test_read_zarr_opens_via_consolidated_metadata(self, shapes: SpatialData) -> None: + # Uses the ``shapes`` fixture specifically because images/labels are read through + # ``ome_zarr.reader.ZarrLocation`` which bypasses our ``open_zarr_for_read`` and + # performs a v2-style metadata walk upstream of our code. Shapes (and points / + # tables) are read by our own readers which go through ``open_zarr_for_read`` + # -- the function under test. upath = _fresh_memory_upath("consolidated-read") - images.write(upath, overwrite=True) + shapes.write(upath, overwrite=True) - # Count store GETs on the memory fs to detect that consolidated metadata is used: - # without consolidation, reading one image requires many small zarr.json / .zgroup GETs. - # We monkeypatch the public ``cat_file`` method (the one MemoryFileSystem actually - # exposes); targeting ``_cat_file`` would silently miss every call. + # Count store GETs on the memory fs. Without consolidation + zarr_format=3 pinning, + # reading this 3-shape sdata costs ~25 small GETs (v2-metadata auto-probes + a walk + # of per-element ``zarr.json``). With both it costs ~7. We monkeypatch the public + # ``cat_file`` (the one ``MemoryFileSystem`` exposes); targeting ``_cat_file`` would + # silently miss every call. fs = upath.fs original_cat_file = fs.cat_file call_count = {"n": 0} @@ -225,9 +233,9 @@ def counting_cat_file(path, *args, **kwargs): finally: fs.cat_file = original_cat_file - # With consolidated metadata, we expect very few small-metadata GETs for a - # trivial 1-image sdata. Without it, typical count is >> 10. The exact bound is - # a documented, loose sanity check, not a micro-benchmark. + # The exact bound is a documented, loose sanity check, not a micro-benchmark. + # 10 comfortably covers the observed 7 GETs for 3 shapes while staying well below + # the ~25 that an unconsolidated / v2-probing read would incur. 
assert call_count["n"] < 10, f"expected consolidated metadata to reduce GETs, saw {call_count['n']}"