diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 739b225fe..5c5230c90 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -30,6 +30,7 @@ validate_table_attr_keys, ) from spatialdata._logging import logger +from spatialdata._store import ZarrStore, make_zarr_store, open_read_store, open_write_store from spatialdata._types import ArrayLike, Raster_T from spatialdata._utils import _deprecation_alias from spatialdata.models import ( @@ -121,7 +122,8 @@ def __init__( tables: dict[str, AnnData] | Tables | None = None, attrs: Mapping[Any, Any] | None = None, ) -> None: - self._path: Path | None = None + self._path: Path | UPath | None = None + self._zarr_store: ZarrStore | None = None self._shared_keys: set[str | None] = set() self._images: Images = Images(shared_keys=self._shared_keys) @@ -548,16 +550,34 @@ def is_backed(self) -> bool: return self.path is not None @property - def path(self) -> Path | None: - """Path to the Zarr storage.""" + def path(self) -> Path | UPath | None: + """Path to the Zarr storage (always :class:`pathlib.Path` or :class:`upath.UPath` when set).""" return self._path @path.setter - def path(self, value: Path | None) -> None: - if value is None or isinstance(value, str | Path): - self._path = value + def path(self, value: str | Path | UPath | None) -> None: + if value is None: + self._set_zarr_store(None) else: - raise TypeError("Path must be `None`, a `str` or a `Path` object.") + self._set_zarr_store(make_zarr_store(value)) + + def _set_zarr_store(self, zarr_store: ZarrStore | None) -> None: + self._zarr_store = zarr_store + self._path = None if zarr_store is None else zarr_store.path + + def _get_zarr_store(self) -> ZarrStore | None: + if self._zarr_store is not None: + return self._zarr_store + if self.path is None: + return None + self._zarr_store = make_zarr_store(self.path) + return self._zarr_store + + def _require_zarr_store(self) -> ZarrStore: + 
zarr_store = self._get_zarr_store() + if zarr_store is None: + raise ValueError("The SpatialData object is not backed by a Zarr store.") + return zarr_store def locate_element(self, element: SpatialElement) -> list[str]: """ @@ -982,13 +1002,7 @@ def elements_paths_on_disk(self) -> list[str]: ------- A list of paths of the elements saved in the Zarr store. """ - from spatialdata._io._utils import _resolve_zarr_store - - if self.path is None: - raise ValueError("The SpatialData object is not backed by a Zarr store.") - - store = _resolve_zarr_store(self.path) - root = zarr.open_group(store=store, mode="r") + zarr_store = self._require_zarr_store() elements_in_zarr = [] def find_groups(obj: zarr.Group, path: str) -> None: @@ -997,13 +1011,14 @@ def find_groups(obj: zarr.Group, path: str) -> None: if isinstance(obj, zarr.Group) and path.count("/") == 1: elements_in_zarr.append(path) - for element_type in root: - if element_type in ["images", "labels", "points", "shapes", "tables"]: - for element_name in root[element_type]: - path = f"{element_type}/{element_name}" - elements_in_zarr.append(path) + with open_read_store(zarr_store) as store: + root = zarr.open_group(store=store, mode="r") + for element_type in root: + if element_type in ["images", "labels", "points", "shapes", "tables"]: + for element_name in root[element_type]: + path = f"{element_type}/{element_name}" + elements_in_zarr.append(path) # root.visit(lambda path: find_groups(root[path], path)) - store.close() return elements_in_zarr def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: @@ -1032,18 +1047,56 @@ def _symmetric_difference_with_zarr_store(self) -> tuple[list[str], list[str]]: def _validate_can_safely_write_to_path( self, - file_path: str | Path, + file_path: str | Path | UPath, overwrite: bool = False, saving_an_element: bool = False, ) -> None: - from spatialdata._io._utils import _backed_elements_contained_in_path, _is_subfolder, _resolve_zarr_store + """ + Guard 
against unsafe writes for **local** paths (zarr check, Dask backing, subfolders). + + For :class:`upath.UPath`, ``overwrite=False`` is rejected: we cannot reliably check + whether a remote store already exists (fsspec existence semantics vary by backend and + object stores have no directory concept), so the "fail if exists" contract cannot be + honored. Callers must pass ``overwrite=True`` to explicitly acknowledge that the write + may clobber pre-existing data at the target. + """ + from upath.implementations.local import PosixUPath, WindowsUPath - if isinstance(file_path, str): + from spatialdata._io._utils import ( + _backed_elements_contained_in_path, + _is_subfolder, + _resolve_zarr_store, + ) + + # Hierarchical URIs ("scheme://...") must become UPath: plain Path(str) breaks cloud URLs + # (S3-compatible stores, Azure abfs:// / az://, GCS gs://, https://, fsspec chains, etc.). + if isinstance(file_path, str) and "://" in file_path: + file_path = UPath(file_path) + elif isinstance(file_path, str): file_path = Path(file_path) - if not isinstance(file_path, Path): - raise ValueError(f"file_path must be a string or a Path object, type(file_path) = {type(file_path)}.") + if not isinstance(file_path, (Path, UPath)): + raise ValueError(f"file_path must be a string, Path or UPath object, type(file_path) = {type(file_path)}.") + + # Local UPath variants (PosixUPath / WindowsUPath) wrap a plain filesystem path; they + # have reliable existence semantics and must go through the same local validation as + # Path. Only *remote* UPath (cloud / http / memory / etc.) falls through the remote guard. + is_remote_upath = isinstance(file_path, UPath) and not isinstance(file_path, (PosixUPath, WindowsUPath)) + + if is_remote_upath: + # The overwrite opt-in only applies at the top-level store entry. 
Per-element writes + # issued internally by ``write()`` (and incremental ``write_element`` calls into an + # existing store) must not re-trigger the guard on every sub-key, or writing to a + # remote target would be impossible. + if not overwrite and not saving_an_element: + raise NotImplementedError( + "Writing to a remote (UPath) target requires overwrite=True. " + "We cannot reliably check whether the remote store already exists, so the write " + "may clobber existing data; pass overwrite=True to acknowledge this." + ) + return + # Local Path: existing logic # TODO: add test for this if os.path.exists(file_path): store = _resolve_zarr_store(file_path) @@ -1072,8 +1125,13 @@ def _validate_can_safely_write_to_path( ERROR_MSG + "\nDetails: the target path contains one or more files that Dask use for " "backing elements in the SpatialData object." + WORKAROUND ) - if self.path is not None and ( - _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + # Subfolder checks only for local paths (Path); skip when self.path is UPath + if ( + self.path is not None + and isinstance(self.path, Path) + and ( + _is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path) + ) ): if saving_an_element and _is_subfolder(parent=self.path, child=file_path): raise ValueError( @@ -1102,7 +1160,7 @@ def _validate_all_elements(self) -> None: @_deprecation_alias(format="sdata_formats", version="0.7.0") def write( self, - file_path: str | Path, + file_path: str | Path | UPath | None = None, overwrite: bool = False, consolidate_metadata: bool = True, update_sdata_path: bool = True, @@ -1115,10 +1173,12 @@ def write( Parameters ---------- file_path - The path to the Zarr store to write to. + The path to the Zarr store to write to. If ``None``, uses :attr:`path` (must be set). overwrite If `True`, overwrite the Zarr store if it already exists. 
If `False`, `write()` will fail if the Zarr store - already exists. + already exists. For remote paths (:class:`upath.UPath`), ``overwrite=True`` is required because we cannot + reliably check whether the remote target exists; passing ``overwrite=False`` raises ``NotImplementedError``. + Pass ``overwrite=True`` to explicitly acknowledge that the write may clobber pre-existing data. consolidate_metadata If `True`, triggers :func:`zarr.convenience.consolidate_metadata`, which writes all the metadata in a single file at the root directory of the store. This makes the data cloud accessible, which is required for certain @@ -1156,21 +1216,23 @@ def write( Whether to use the WKB or geoarrow encoding for GeoParquet. See :meth:`geopandas.GeoDataFrame.to_parquet` for details. If None, uses the value from :attr:`spatialdata.settings.shapes_geometry_encoding`. """ - from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import _parse_formats parsed = _parse_formats(sdata_formats) - if isinstance(file_path, str): - file_path = Path(file_path) + if file_path is None: + if self.path is None: + raise ValueError("file_path must be provided when SpatialData.path is not set.") + file_path = self.path + zarr_store = make_zarr_store(file_path) + file_path = zarr_store.path self._validate_can_safely_write_to_path(file_path, overwrite=overwrite) self._validate_all_elements() - store = _resolve_zarr_store(file_path) - zarr_format = parsed["SpatialData"].zarr_format - zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) - self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) - store.close() + with open_write_store(zarr_store) as store: + zarr_format = parsed["SpatialData"].zarr_format + zarr_group = zarr.create_group(store=store, overwrite=overwrite, zarr_format=zarr_format) + self.write_attrs(zarr_group=zarr_group, sdata_format=parsed["SpatialData"]) for element_type, element_name, element in 
self.gen_elements(): self._write_element( @@ -1184,7 +1246,7 @@ def write( ) if self.path != file_path and update_sdata_path: - self.path = file_path + self._set_zarr_store(zarr_store) if consolidate_metadata: self.write_consolidated_metadata() @@ -1192,7 +1254,7 @@ def write( def _write_element( self, element: SpatialElement | AnnData, - zarr_container_path: Path, + zarr_container_path: Path | UPath, element_type: str, element_name: str, overwrite: bool, @@ -1201,10 +1263,8 @@ def _write_element( ) -> None: from spatialdata._io.io_zarr import _get_groups_for_element - if not isinstance(zarr_container_path, Path): - raise ValueError( - f"zarr_container_path must be a Path object, type(zarr_container_path) = {type(zarr_container_path)}." - ) + if not isinstance(zarr_container_path, (Path, UPath)): + raise ValueError(f"zarr_container_path must be a Path or UPath, got {type(zarr_container_path).__name__}.") file_path_of_element = zarr_container_path / element_type / element_name self._validate_can_safely_write_to_path( file_path=file_path_of_element, overwrite=overwrite, saving_an_element=True @@ -1423,13 +1483,12 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None: "more elements in the SpatialData object. Deleting the data would corrupt the SpatialData object." 
) - from spatialdata._io._utils import _resolve_zarr_store + zarr_store = self._require_zarr_store() # delete the element - store = _resolve_zarr_store(self.path) - root = zarr.open_group(store=store, mode="r+", use_consolidated=False) - del root[element_type][element_name] - store.close() + with open_write_store(zarr_store) as store: + root = zarr.open_group(store=store, mode="r+", use_consolidated=False) + del root[element_type][element_name] if self.has_consolidated_metadata(): self.write_consolidated_metadata() @@ -1452,14 +1511,11 @@ def write_consolidated_metadata(self) -> None: _write_consolidated_metadata(self.path) def has_consolidated_metadata(self) -> bool: - from spatialdata._io._utils import _resolve_zarr_store - return_value = False - store = _resolve_zarr_store(self.path) - group = zarr.open_group(store, mode="r") - if getattr(group.metadata, "consolidated_metadata", None): - return_value = True - store.close() + with open_read_store(self._require_zarr_store()) as store: + group = zarr.open_group(store, mode="r") + if getattr(group.metadata, "consolidated_metadata", None): + return_value = True return return_value def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[str, SpatialElement | AnnData] | None: @@ -1489,7 +1545,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # check if the element exists in the Zarr storage if not _group_for_element_exists( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, ): @@ -1503,7 +1559,7 @@ def _validate_can_write_metadata_on_element(self, element_name: str) -> tuple[st # warn the users if the element is not self-contained, that is, it is Dask-backed by files outside the Zarr # group for the element - element_zarr_path = Path(self.path) / element_type / element_name + element_zarr_path = self.path / element_type / element_name if not _is_element_self_contained(element=element, 
element_path=element_zarr_path): logger.info( f"Element {element_type}/{element_name} is not self-contained. The metadata will be" @@ -1544,7 +1600,7 @@ def write_channel_names(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have the check in the conditional if element_type == "images" and self.path is not None: _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), element_type=element_type, element_name=element_name, use_consolidated=False + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False ) from spatialdata._io._utils import overwrite_channel_names @@ -1588,7 +1644,7 @@ def write_transformations(self, element_name: str | None = None) -> None: # Mypy does not understand that path is not None so we have a conditional assert self.path is not None _, _, element_group = _get_groups_for_element( - zarr_path=Path(self.path), + zarr_path=self.path, element_type=element_type, element_name=element_name, use_consolidated=False, @@ -1645,18 +1701,17 @@ def write_attrs( sdata_format: SpatialDataContainerFormatType | None = None, zarr_group: zarr.Group | None = None, ) -> None: - from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import CurrentSpatialDataContainerFormat, SpatialDataContainerFormatType sdata_format = sdata_format if sdata_format is not None else CurrentSpatialDataContainerFormat() assert isinstance(sdata_format, SpatialDataContainerFormatType) - store = None - if zarr_group is None: assert self.is_backed(), "The SpatialData object must be backed by a Zarr store to write attrs." 
- store = _resolve_zarr_store(self.path) - zarr_group = zarr.open_group(store=store, mode="r+") + with open_write_store(self._require_zarr_store()) as store: + zarr_group = zarr.open_group(store=store, mode="r+") + self.write_attrs(sdata_format=sdata_format, zarr_group=zarr_group) + return version = sdata_format.spatialdata_format_version version_specific_attrs = sdata_format.attrs_to_dict() @@ -1667,9 +1722,6 @@ def write_attrs( except TypeError as e: raise TypeError("Invalid attribute in SpatialData.attrs") from e - if store is not None: - store.close() - def write_metadata( self, element_name: str | None = None, @@ -1956,7 +2008,8 @@ def h(s: str) -> str: descr = "SpatialData object" if self.path is not None: - descr += f", with associated Zarr store: {self.path.resolve()}" + path_descr = str(self.path) if isinstance(self.path, UPath) else self.path.resolve() + descr += f", with associated Zarr store: {path_descr}" non_empty_elements = self._non_empty_elements() last_element_index = len(non_empty_elements) - 1 diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 6690d1118..57b2ec642 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import filecmp +import json import os.path import re import sys @@ -11,7 +12,7 @@ from contextlib import contextmanager from enum import Enum from functools import singledispatch -from pathlib import Path +from pathlib import Path, PurePosixPath from typing import Any, Literal import zarr @@ -38,6 +39,12 @@ from spatialdata.transformations.transformations import BaseTransformation, _get_current_output_axes +def join_fsspec_store_path(store_path: str, relative_path: str) -> str: + """Append a relative zarr-group path to an FsspecStore root, yielding a fsspec key.""" + rel = relative_path.lstrip("/") + return str(PurePosixPath(store_path) / rel) if rel else store_path + + def _get_transformations_from_ngff_dict( 
list_of_encoded_ngff_transformations: list[dict[str, Any]], ) -> MappingToCoordinateSystem_t: @@ -317,6 +324,82 @@ def _find_piece_dict(obj: dict[str, tuple[str | None]] | Task) -> dict[str, tupl return None +def _extract_parquet_paths_from_task(obj: Any) -> list[str]: + """Recursively extract parquet file paths from a dask ``read_parquet`` task. + + Dask's task-graph shape changed between the version pinned before scverse/spatialdata + PR #1006 (https://github.com/scverse/spatialdata/pull/1006 "unpinning dask", commit + 53b9438a https://github.com/scverse/spatialdata/commit/53b9438a328c5fc2a451d2c8afab439b945ba2b8) + and the current one; we tolerate both. + + - Legacy shape: a dict ``{"piece": (parquet_file, None, None)}`` somewhere in the args + (possibly wrapped in other dicts for mixed points+images element graphs). The trailing + elements of the ``piece`` tuple encode row-group / filter constraints; we only support + unfiltered reads (hence the validation on ``check0`` / ``check1``). + - Current shape: a ``dask.dataframe.dask_expr.io.parquet.FragmentWrapper`` whose + ``.fragment.path`` is the parquet file (from ``dask_expr.io.parquet.ReadParquetPyarrowFS``). + The wrapper may live in Task ``kwargs["fragment_wrapper"]`` for simple reads, but in fused + expressions (``readparquetpyarrowfs-fused-*``) it is nested inside lists and tuples + inside a subgraph dict, so we walk every container uniformly rather than targeting named + kwargs. + + ``FragmentWrapper`` is detected via the ``.fragment.path`` attribute chain instead of an + isinstance check to avoid importing private dask_expr internals; the ``endswith(".parquet")`` + guard keeps false positives from random objects out of the result. 
+ """ + found: list[str] = [] + + frag = getattr(obj, "fragment", None) + if frag is not None: + path = getattr(frag, "path", None) + if isinstance(path, str) and path.endswith(".parquet"): + found.append(path) + + if isinstance(obj, Mapping): + # TODO(legacy-dask): the ``"piece"`` branch targets the pre-PR-#1006 dask graph shape + # (``dask/dataframe/io/parquet/core.py`` produced ``{"piece": (file, rg, filters)}``). The + # current dask pin (``dask>=2025.12.0``) no longer emits this shape at runtime; the branch + # is kept only as a safety net for users forcing an older dask via pip. Remove once the + # lower pin is bumped past the PR-#1006 cut-off and CI covers only the new shape. + if "piece" in obj: + piece = obj["piece"] + if isinstance(piece, tuple) and len(piece) >= 1 and isinstance(piece[0], str): + parquet_file = piece[0] + check0 = piece[1] if len(piece) > 1 else None + check1 = piece[2] if len(piece) > 2 else None + if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: + raise ValueError( + f"Unable to parse the parquet file from the dask task {obj!r}. Please report this bug." + ) + found.append(parquet_file) + for v in obj.values(): + found.extend(_extract_parquet_paths_from_task(v)) + return found + + if isinstance(obj, (list, tuple)): + for item in obj: + found.extend(_extract_parquet_paths_from_task(item)) + return found + + # TODO(dask-task-api): the ``kwargs`` / ``args`` getattr probes here rely on the Task wrapper + # object introduced alongside PR #1006. The attribute contract is not documented as public + # (``dask.dataframe.dask_expr``), so we access it defensively via getattr and traverse every + # container uniformly. If dask stabilises a public accessor (e.g. ``task.iter_leaves()`` or an + # expr-level ``file_paths`` property) or if ``FragmentWrapper`` becomes importable from a + # stable namespace, replace the attribute-chain walk with a typed call and drop the getattrs. 
+ kwargs = getattr(obj, "kwargs", None) + if isinstance(kwargs, Mapping): + for v in kwargs.values(): + found.extend(_extract_parquet_paths_from_task(v)) + + args = getattr(obj, "args", None) + if isinstance(args, (list, tuple)): + for a in args: + found.extend(_extract_parquet_paths_from_task(a)) + + return found + + def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> None: # see the types allowed for the dask graph here: https://docs.dask.org/en/stable/spec.html @@ -339,38 +422,32 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No name = k if name is not None: if name.startswith("original-from-zarr"): - # LocalStore.store does not have an attribute path, but we keep it like this for backward compat. + # TODO(zarr-v3-store-path): the ``getattr(..., "path", None)`` fallback dates + # back to zarr v2, where ``DirectoryStore`` exposed ``.path`` and the v3 + # ``LocalStore`` exposes ``.root`` instead. With the current pin + # (``zarr>=3.0.0``) the getattr branch is never taken for local backends -- it + # only covers exotic third-party stores that still mimic the v2 attribute. + # Once we are confident no such shim stores are in use, collapse this to just + # ``v.store.root`` and drop the getattr probe. path = getattr(v.store, "path", None) if getattr(v.store, "path", None) else v.store.root files.append(str(UPath(path).resolve())) - elif name.startswith("read-parquet") or name.startswith("read_parquet"): - # Here v is a read_parquet task with arguments and the only value is a dictionary. - if "piece" in v.args[0]: - # https://github.com/dask/dask/blob/ff2488aec44d641696e0b7aa41ed9e995c710705/dask/dataframe/io/parquet/core.py#L870 - parquet_file, check0, check1 = v.args[0]["piece"] - if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: - raise ValueError( - f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " - f"report this bug." 
- ) + elif "parquet" in name.lower(): + # Matches every dask task-key that wraps a parquet read across versions: + # - legacy ``read-parquet-`` / ``read_parquet-`` (pre scverse/ + # spatialdata PR #1006, https://github.com/scverse/spatialdata/pull/1006), + # - current ``read_parquet-`` plus fused-expression forms such as + # ``readparquetpyarrowfs-fused-values-`` produced by + # ``dask_expr.io.parquet.ReadParquetPyarrowFS`` when a parquet column is + # combined with other arrays (see ``test_self_contained``). + # Any false-positive key that matches but carries no parquet payload is filtered + # inside ``_extract_parquet_paths_from_task`` (paths must ``endswith(".parquet")``). + for parquet_file in _extract_parquet_paths_from_task(v): files.append(os.path.realpath(parquet_file)) - else: - # This occurs when for example points and images are mixed, the main task still starts with - # read_parquet, but the execution happens through a subgraph which we iterate over to get the - # actual read_parquet task. - for task in v.args[0].values(): - # Recursively go through tasks, this is required because differences between dask versions. - piece_dict = _find_piece_dict(task) - if isinstance(piece_dict, dict) and "piece" in piece_dict: - parquet_file, check0, check1 = piece_dict["piece"] # type: ignore[misc] - if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: - raise ValueError( - f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " - f"report this bug." - ) - files.append(os.path.realpath(parquet_file)) - - -def _backed_elements_contained_in_path(path: Path, object: SpatialData | SpatialElement | AnnData) -> list[bool]: + + +def _backed_elements_contained_in_path( + path: Path | UPath, object: SpatialData | SpatialElement | AnnData +) -> list[bool]: """ Return the list of boolean values indicating if backing files for an object are child directory of a path. 
@@ -389,9 +466,16 @@ def _backed_elements_contained_in_path(path: Path, object: SpatialData | Spatial ----- If an object does not have a Dask computational graph, it will return an empty list. It is possible for a single SpatialElement to contain multiple files in their Dask computational graph. + + For a remote ``path`` (:class:`upath.UPath`), this always returns an empty list: Dask backing paths + are resolved as local filesystem paths, so they cannot be compared to object-store locations. + :meth:`spatialdata.SpatialData.write` therefore skips the local "backing files in target" guard + for remote targets; ``overwrite=True`` on a remote URL must be used only when overwriting is safe. """ + if isinstance(path, UPath): + return [] if not isinstance(path, Path): - raise TypeError(f"Expected a Path object, got {type(path)}") + raise TypeError(f"Expected a Path or UPath object, got {type(path)}") return [_is_subfolder(parent=path, child=Path(fp)) for fp in get_dask_backing_files(object)] @@ -420,16 +504,44 @@ def _is_subfolder(parent: Path, child: Path) -> bool: def _is_element_self_contained( - element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, element_path: Path + element: DataArray | DataTree | DaskDataFrame | GeoDataFrame | AnnData, + element_path: Path | UPath, ) -> bool: + """Whether element Dask graphs only reference files under ``element_path`` (local) or N/A (remote).""" + if isinstance(element_path, UPath): + # Backing-file paths are local; cannot relate them to remote keys—assume OK for this heuristic. + return True if isinstance(element, DaskDataFrame): pass # TODO when running test_save_transformations it seems that for the same element this is called multiple times return all(_backed_elements_contained_in_path(path=element_path, object=element)) +def _ensure_async_fs(fs: Any) -> Any: + """Return an async fsspec filesystem for use with zarr's FsspecStore. + + Zarr's FsspecStore expects an async filesystem. 
If the given fs is synchronous, + it is converted using fsspec's public API (async instance or AsyncFileSystemWrapper) + so that ZarrUserWarning is not raised. + """ + if getattr(fs, "asynchronous", False): + return fs + import fsspec + + if getattr(fs, "async_impl", False): + fs_dict = json.loads(fs.to_json()) + fs_dict["asynchronous"] = True + return fsspec.AbstractFileSystem.from_json(json.dumps(fs_dict)) + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + return AsyncFileSystemWrapper(fs, asynchronous=True) + + def _resolve_zarr_store( - path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, **kwargs: Any + path: str | Path | UPath | zarr.storage.StoreLike | zarr.Group, + *, + read_only: bool = False, + **kwargs: Any, ) -> zarr.storage.StoreLike: """ Normalize different Zarr store inputs into a usable store instance. @@ -445,9 +557,14 @@ def _resolve_zarr_store( path The input representing a Zarr store or group. Can be a filesystem path, remote path, existing store, or Zarr group. + read_only + If ``True``, constructed ``LocalStore`` / ``FsspecStore`` instances are built with + ``read_only=True``. Stores that already exist (when ``path`` is a ``StoreLike`` or + a ``zarr.Group`` whose wrapped store is not reconstructable) are returned as-is; + the caller is responsible for opening them at the right mode. **kwargs Additional keyword arguments forwarded to the underlying store - constructor (e.g. `mode`, `storage_options`). + constructor. Returns ------- @@ -457,37 +574,39 @@ def _resolve_zarr_store( ------ TypeError If the input type is unsupported. - ValueError + ValueError If a `zarr.Group` has an unsupported store type. 
""" - # TODO: ensure kwargs like mode are enforced everywhere and passed correctly to the store if isinstance(path, str | Path): - # if the input is str or Path, map it to UPath path = UPath(path) if isinstance(path, PosixUPath | WindowsUPath): # if the input is a local path, use LocalStore - return LocalStore(path.path) + return LocalStore(path.path, read_only=read_only) if isinstance(path, zarr.Group): - # if the input is a zarr.Group, wrap it with a store + # Re-wrap the group's store at the group's subpath. Note: zarr v3 no longer ships + # ``ConsolidatedMetadataStore`` (v2 wrapped the backend in a store; v3 surfaces + # consolidated metadata as a field on ``GroupMetadata`` instead), so we only need to + # handle the two concrete backends below. if isinstance(path.store, LocalStore): store_path = UPath(path.store.root) / path.path - return LocalStore(store_path.path) + return LocalStore(store_path.path, read_only=read_only) if isinstance(path.store, FsspecStore): - # if the store within the zarr.Group is an FSStore, return it - # but extend the path of the store with that of the zarr.Group - return FsspecStore(path.store.path + "/" + path.path, fs=path.store.fs, **kwargs) - if isinstance(path.store, zarr.storage.ConsolidatedMetadataStore): - # if the store is a ConsolidatedMetadataStore, just return the underlying FSSpec store - return path.store.store + return FsspecStore( + fs=_ensure_async_fs(path.store.fs), + path=join_fsspec_store_path(path.store.path, path.path), + read_only=read_only, + **kwargs, + ) raise ValueError(f"Unsupported store type or zarr.Group: {type(path.store)}") - if isinstance(path, zarr.storage.StoreLike): - # if the input already a store, wrap it in an FSStore - return FsspecStore(path, **kwargs) if isinstance(path, UPath): - # if input is a remote UPath, map it to an FSStore - return FsspecStore(path.path, fs=path.fs, **kwargs) + # Check before StoreLike to avoid UnionType isinstance. 
+ return FsspecStore(_ensure_async_fs(path.fs), path=path.path, read_only=read_only, **kwargs) + if isinstance(path, zarr.storage.StoreLike): + # Already a concrete store (LocalStore, FsspecStore, MemoryStore, ...). Do not pass it as ``fs=`` to + # FsspecStore -- that only accepts an async fsspec filesystem and raises on stores (e.g. ``async_impl``). + return path raise TypeError(f"Unsupported type: {type(path)}") diff --git a/src/spatialdata/_io/io_points.py b/src/spatialdata/_io/io_points.py index b47fc418c..32ef88478 100644 --- a/src/spatialdata/_io/io_points.py +++ b/src/spatialdata/_io/io_points.py @@ -6,13 +6,16 @@ from dask.dataframe import DataFrame as DaskDataFrame from dask.dataframe import read_parquet from ome_zarr.format import Format +from upath import UPath from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, _write_metadata, overwrite_coordinate_transformations_non_raster, ) from spatialdata._io.format import CurrentPointsFormat, PointsFormats, _parse_version +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_zarr_for_read from spatialdata.models import get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -21,21 +24,38 @@ def _read_points( - store: str | Path, + store: str | Path | UPath | ZarrStore, ) -> DaskDataFrame: - """Read points from a zarr store.""" - f = zarr.open(store, mode="r") + """Read points from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) + f = open_zarr_for_read(resolved_store, as_group=False) version = _parse_version(f, expect_attrs_key=True) assert version is not None points_format = PointsFormats[version] - store_root = f.store_path.store.root - path = store_root / f.path / "points.parquet" - # cache on remote file needed for parquet reader to work 
- # TODO: allow reading in the metadata without caching all the data - points = read_parquet("simplecache::" + str(path) if str(path).startswith("http") else path) + parquet_store = zarr_store.child("points.parquet") + # Passing filesystem= to read_parquet makes pyarrow convert dictionary columns into pandas + # categoricals eagerly per partition and marks them known=True with an empty category list. + # This happens for ANY pyarrow filesystem (both LocalFileSystem and PyFileSystem(FSSpecHandler(.)) + # return the same broken categorical), so it is a property of the filesystem= handoff itself, + # not of local-vs-remote. Left as is, it would make write_points' cat.as_known() a no-op and + # the next to_parquet(filesystem=.) would fail with a per-partition schema mismatch + # (dictionary vs dictionary). We demote the categoricals back to + # "unknown" right here so that write_points recomputes categories consistently across partitions. + # TODO: allow reading in the metadata without materializing the data. + points = read_parquet( + parquet_store.arrow_path(), + filesystem=parquet_store.arrow_filesystem(), + ) assert isinstance(points, DaskDataFrame) + for column_name in points.columns: + c = points[column_name] + if c.dtype == "category" and c.cat.known: + points[column_name] = c.cat.as_unknown() + if points.index.name == "__null_dask_index__": + points = points.rename_axis(None) transformations = _get_transformations_from_ngff_dict(f.attrs.asdict()["coordinateTransformations"]) _set_transformations(points, transformations) @@ -68,8 +88,7 @@ def write_points( axes = get_axes_names(points) transformations = _get_transformations(points) - store_root = group.store_path.store.root - path = store_root / group.path / "points.parquet" + parquet_store = make_zarr_store_from_group(group).child("points.parquet") # The following code iterates through all columns in the 'points' DataFrame. 
If the column's datatype is # 'category', it checks whether the categories of this column are known. If not, it explicitly converts the @@ -84,7 +103,10 @@ def write_points( points_without_transform = points.copy() del points_without_transform.attrs["transform"] - points_without_transform.to_parquet(path) + points_without_transform.to_parquet( + parquet_store.arrow_path(), + filesystem=parquet_store.arrow_filesystem(), + ) attrs = element_format.attrs_to_dict(points.attrs) attrs["version"] = element_format.spatialdata_format_version diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index a8b2ab2ce..7eaf04d57 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -16,10 +16,12 @@ from ome_zarr.writer import write_labels as write_labels_ngff from ome_zarr.writer import write_multiscale as write_multiscale_ngff from ome_zarr.writer import write_multiscale_labels as write_multiscale_labels_ngff +from upath import UPath from xarray import DataArray, DataTree from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, overwrite_coordinate_transformations_raster, ) from spatialdata._io.format import ( @@ -27,6 +29,7 @@ RasterFormatType, get_ome_zarr_format, ) +from spatialdata._store import ZarrStore, make_zarr_store from spatialdata._utils import get_pyramid_levels from spatialdata.models._utils import get_channel_names from spatialdata.models.models import ATTRS_KEY @@ -160,13 +163,14 @@ def _prepare_storage_options( def _read_multiscale( - store: str | Path, raster_type: Literal["image", "labels"], reader_format: Format + store: str | Path | UPath | ZarrStore, raster_type: Literal["image", "labels"], reader_format: Format ) -> DataArray | DataTree: - assert isinstance(store, str | Path) assert raster_type in ["image", "labels"] + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) nodes: 
list[Node] = [] - image_loc = ZarrLocation(store, fmt=reader_format) + image_loc = ZarrLocation(resolved_store, fmt=reader_format) if exists := image_loc.exists(): image_reader = Reader(image_loc)() image_nodes = list(image_reader) diff --git a/src/spatialdata/_io/io_shapes.py b/src/spatialdata/_io/io_shapes.py index b07256273..290360718 100644 --- a/src/spatialdata/_io/io_shapes.py +++ b/src/spatialdata/_io/io_shapes.py @@ -9,9 +9,11 @@ from natsort import natsorted from ome_zarr.format import Format from shapely import from_ragged_array, to_ragged_array +from upath import UPath from spatialdata._io._utils import ( _get_transformations_from_ngff_dict, + _resolve_zarr_store, _write_metadata, overwrite_coordinate_transformations_non_raster, ) @@ -23,6 +25,7 @@ ShapesFormatV03, _parse_version, ) +from spatialdata._store import ZarrStore, make_zarr_store, make_zarr_store_from_group, open_zarr_for_read from spatialdata.models import ShapesModel, get_axes_names from spatialdata.transformations._utils import ( _get_transformations, @@ -31,10 +34,12 @@ def _read_shapes( - store: str | Path, + store: str | Path | UPath | ZarrStore, ) -> GeoDataFrame: - """Read shapes from a zarr store.""" - f = zarr.open(store, mode="r") + """Read shapes from a zarr store (path, hierarchical URI string, or remote ``UPath``).""" + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = _resolve_zarr_store(zarr_store.path) + f = open_zarr_for_read(resolved_store, as_group=False) version = _parse_version(f, expect_attrs_key=True) assert version is not None shape_format = ShapesFormats[version] @@ -54,9 +59,9 @@ def _read_shapes( geometry = from_ragged_array(typ, coords, offsets) geo_df = GeoDataFrame({"geometry": geometry}, index=index) elif isinstance(shape_format, ShapesFormatV02 | ShapesFormatV03): - store_root = f.store_path.store.root - path = Path(store_root) / f.path / "shapes.parquet" - geo_df = read_parquet(path) + parquet_store = 
zarr_store.child("shapes.parquet") + with parquet_store.arrow_filesystem().open_input_file(parquet_store.arrow_path()) as src: + geo_df = read_parquet(src) else: raise ValueError( f"Unsupported shapes format {shape_format} from version {version}. Please update the spatialdata library." @@ -169,13 +174,13 @@ def _write_shapes_v02_v03( """ from spatialdata.models._utils import TRANSFORM_KEY - store_root = group.store_path.store.root - path = store_root / group.path / "shapes.parquet" + parquet_store = make_zarr_store_from_group(group).child("shapes.parquet") # Temporarily remove transformations from attrs to avoid serialization issues transforms = shapes.attrs[TRANSFORM_KEY] del shapes.attrs[TRANSFORM_KEY] - shapes.to_parquet(path, geometry_encoding=geometry_encoding) + with parquet_store.arrow_filesystem().open_output_stream(parquet_store.arrow_path()) as sink: + shapes.to_parquet(sink, geometry_encoding=geometry_encoding) shapes.attrs[TRANSFORM_KEY] = transforms attrs = element_format.attrs_to_dict(shapes.attrs) diff --git a/src/spatialdata/_io/io_table.py b/src/spatialdata/_io/io_table.py index 8cd7b8385..0e8bef657 100644 --- a/src/spatialdata/_io/io_table.py +++ b/src/spatialdata/_io/io_table.py @@ -8,7 +8,9 @@ from anndata import read_zarr as read_anndata_zarr from anndata._io.specs import write_elem as write_adata from ome_zarr.format import Format +from upath import UPath +from spatialdata._io._utils import _resolve_zarr_store from spatialdata._io.format import ( CurrentTablesFormat, TablesFormats, @@ -16,13 +18,16 @@ TablesFormatV02, _parse_version, ) +from spatialdata._store import ZarrStore, make_zarr_store, open_zarr_for_read from spatialdata.models import TableModel, get_table_keys -def _read_table(store: str | Path) -> AnnData: - table = read_anndata_zarr(str(store)) +def _read_table(store: str | Path | UPath | ZarrStore) -> AnnData: + zarr_store = store if isinstance(store, ZarrStore) else make_zarr_store(store) + resolved_store = 
_resolve_zarr_store(zarr_store.path) + table = read_anndata_zarr(resolved_store) - f = zarr.open(store, mode="r") + f = open_zarr_for_read(resolved_store, as_group=False) version = _parse_version(f, expect_attrs_key=False) assert version is not None table_format = TablesFormats[version] diff --git a/src/spatialdata/_io/io_zarr.py b/src/spatialdata/_io/io_zarr.py index 4c410fab0..336ba1d93 100644 --- a/src/spatialdata/_io/io_zarr.py +++ b/src/spatialdata/_io/io_zarr.py @@ -1,13 +1,12 @@ from __future__ import annotations -import os import warnings from collections.abc import Callable from json import JSONDecodeError from pathlib import Path from typing import Any, Literal, cast -import zarr.storage +import zarr from anndata import AnnData from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -27,12 +26,19 @@ from spatialdata._io.io_shapes import _read_shapes from spatialdata._io.io_table import _read_table from spatialdata._logging import logger +from spatialdata._store import ( + ZarrStore, + make_zarr_store, + make_zarr_store_from_group, + open_read_store, + open_zarr_for_read, +) from spatialdata._types import Raster_T def _read_zarr_group_spatialdata_element( root_group: zarr.Group, - root_store_path: str, + root_store: ZarrStore, sdata_version: Literal["0.1", "0.2"], selector: set[str], read_func: Callable[..., Any], @@ -54,7 +60,7 @@ def _read_zarr_group_spatialdata_element( # skip hidden files like .zgroup or .zmetadata continue elem_group = group[subgroup_name] - elem_group_path = os.path.join(root_store_path, elem_group.path) + elem_store = root_store.child(elem_group.path) with handle_read_errors( on_bad_files, location=f"{group.path}/{subgroup_name}", @@ -70,12 +76,12 @@ def _read_zarr_group_spatialdata_element( if element_type in ["image", "labels"]: reader_format = get_raster_format_for_read(elem_group, sdata_version) element = read_func( - elem_group_path, + elem_store, cast(Literal["image", "labels"], 
element_type), reader_format, ) elif element_type in ["shapes", "points", "tables"]: - element = read_func(elem_group_path) + element = read_func(elem_store) else: raise ValueError(f"Unknown element type {element_type}") element_container[subgroup_name] = element @@ -153,24 +159,7 @@ def read_zarr( ------- A SpatialData object. """ - from spatialdata._io._utils import _resolve_zarr_store - - resolved_store = _resolve_zarr_store(store) - root_group = zarr.open_group(resolved_store, mode="r") - # the following is the SpatialDataContainerFormat version - if "spatialdata_attrs" not in root_group.metadata.attributes: - # backward compatibility for pre-versioned SpatialData zarr stores - sdata_version: Literal["0.1", "0.2"] = "0.1" - else: - sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] - if sdata_version == "0.1": - warnings.warn( - "SpatialData is not stored in the most current format. If you want to use Zarr v3" - ", please write the store to a new location using `sdata.write()`.", - UserWarning, - stacklevel=2, - ) - root_store_path = root_group.store.root + zarr_store = make_zarr_store_from_group(store) if isinstance(store, zarr.Group) else make_zarr_store(store) images: dict[str, Raster_T] = {} labels: dict[str, Raster_T] = {} @@ -178,50 +167,69 @@ def read_zarr( shapes: dict[str, GeoDataFrame] = {} tables: dict[str, AnnData] = {} - selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or []) - logger.debug(f"Reading selection {selector}") - - # we could make this more readable. One can get lost when looking at this dict and iteration over the items - group_readers: dict[ - Literal["images", "labels", "shapes", "points", "tables"], - tuple[ - Callable[..., Any], - Literal["image", "labels", "shapes", "points", "tables"], - dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData], - ], - ] = { - # ome-zarr-py needs a kwargs that has "image" has key. 
So here we have "image" and not "images" - "images": (_read_multiscale, "image", images), - "labels": (_read_multiscale, "labels", labels), - "points": (_read_points, "points", points), - "shapes": (_read_shapes, "shapes", shapes), - "tables": (_read_table, "tables", tables), - } - for group_name, ( - read_func, - element_type, - element_container, - ) in group_readers.items(): - _read_zarr_group_spatialdata_element( - root_group=root_group, - root_store_path=root_store_path, - sdata_version=sdata_version, - selector=selector, - read_func=read_func, - group_name=group_name, - element_type=element_type, - element_container=element_container, - on_bad_files=on_bad_files, - ) - - # read attrs metadata - attrs = root_group.attrs.asdict() - if "spatialdata_attrs" in attrs: - # when refactoring the read_zarr function into reading componenets separately (and according to the version), - # we can move the code below (.pop()) into attrs_from_dict() - attrs.pop("spatialdata_attrs") - else: - attrs = None + with open_read_store(zarr_store) as resolved_store: + # Use the consolidated + zarr-v3-pinned fast path. See ``open_zarr_for_read`` for why + # pinning ``zarr_format=3`` matters over remote backends (avoids five small v2-metadata + # probes per open) and how the fallback keeps legacy / non-consolidated stores working. + root_group = open_zarr_for_read(resolved_store, as_group=True) + # the following is the SpatialDataContainerFormat version + if "spatialdata_attrs" not in root_group.metadata.attributes: + # backward compatibility for pre-versioned SpatialData zarr stores + sdata_version: Literal["0.1", "0.2"] = "0.1" + else: + sdata_version = root_group.metadata.attributes["spatialdata_attrs"]["version"] + if sdata_version == "0.1": + warnings.warn( + "SpatialData is not stored in the most current format. 
If you want to use Zarr v3"
+                ", please write the store to a new location using `sdata.write()`.",
+                UserWarning,
+                stacklevel=2,
+            )
+
+        selector = {"images", "labels", "points", "shapes", "tables"} if not selection else set(selection or [])
+        logger.debug(f"Reading selection {selector}")
+
+        # we could make this more readable. One can get lost when looking at this dict and iteration over the items
+        group_readers: dict[
+            Literal["images", "labels", "shapes", "points", "tables"],
+            tuple[
+                Callable[..., Any],
+                Literal["image", "labels", "shapes", "points", "tables"],
+                dict[str, Raster_T] | dict[str, DaskDataFrame] | dict[str, GeoDataFrame] | dict[str, AnnData],
+            ],
+        ] = {
+            # ome-zarr-py needs a kwargs that has "image" as key. So here we have "image" and not "images"
+            "images": (_read_multiscale, "image", images),
+            "labels": (_read_multiscale, "labels", labels),
+            "points": (_read_points, "points", points),
+            "shapes": (_read_shapes, "shapes", shapes),
+            "tables": (_read_table, "tables", tables),
+        }
+        for group_name, (
+            read_func,
+            element_type,
+            element_container,
+        ) in group_readers.items():
+            _read_zarr_group_spatialdata_element(
+                root_group=root_group,
+                root_store=zarr_store,
+                sdata_version=sdata_version,
+                selector=selector,
+                read_func=read_func,
+                group_name=group_name,
+                element_type=element_type,
+                element_container=element_container,
+                on_bad_files=on_bad_files,
+            )
+
+        # read attrs metadata
+        attrs = root_group.attrs.asdict()
+        if "spatialdata_attrs" in attrs:
+            # when refactoring the read_zarr function into reading components separately (and according to the version)
+            # we can move the code below (.pop()) into attrs_from_dict()
+            attrs.pop("spatialdata_attrs")
+        else:
+            attrs = None
 
     sdata = SpatialData(
         images=images,
@@ -231,12 +239,12 @@ def read_zarr(
         tables=tables,
         attrs=attrs,
     )
-    sdata.path = resolved_store.root
+    sdata._set_zarr_store(zarr_store)
    return sdata
 
 
 def _get_groups_for_element(
-    zarr_path: Path, element_type: str, 
element_name: str, use_consolidated: bool = True + zarr_path: Path | UPath, element_type: str, element_name: str, use_consolidated: bool = True ) -> tuple[zarr.Group, zarr.Group, zarr.Group]: """ Get the Zarr groups for the root, element_type and element for a specific element. @@ -265,8 +273,8 @@ def _get_groups_for_element( ------- The Zarr groups for the root, element_type and element for a specific element. """ - if not isinstance(zarr_path, Path): - raise ValueError("zarr_path should be a Path object") + if not isinstance(zarr_path, (Path, UPath)): + raise ValueError("zarr_path should be a Path or UPath object") if element_type not in [ "images", @@ -289,7 +297,7 @@ def _get_groups_for_element( return root_group, element_type_group, element_name_group -def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: str) -> bool: +def _group_for_element_exists(zarr_path: Path | UPath, element_type: str, element_name: str) -> bool: """ Check if the group for an element exists. @@ -319,14 +327,35 @@ def _group_for_element_exists(zarr_path: Path, element_type: str, element_name: return exists -def _write_consolidated_metadata(path: Path | str | None) -> None: +def _write_consolidated_metadata(path: Path | UPath | str | None) -> None: if path is not None: - f = zarr.open_group(path, mode="r+", use_consolidated=False) + if isinstance(path, UPath): + store = _resolve_zarr_store(path) + f = zarr.open_group(store, mode="r+", use_consolidated=False) + else: + f = zarr.open_group(path, mode="r+", use_consolidated=False) # .parquet files are not recognized as proper zarr and thus throw a warning. This does not affect SpatialData. # and therefore we silence it for our users as they can't do anything about this. # TODO check with remote PR whether we can prevent this warning at least for points data and whether with zarrv3 # that pr would still work. 
with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=zarr.errors.ZarrUserWarning)
+            # Consolidate at the root, then at every element group
+            # (``element_type/element_name``). The per-element consolidation is what lets our readers
+            # -- which re-open each element via a child-rooted ``FsspecStore`` -- actually
+            # consume consolidated metadata at element open time. A root-only consolidation
+            # only benefits the first ``zarr.open_group`` call in ``read_zarr``; every
+            # subsequent ``zarr.open(elem_store, ...)`` rooted at the element path would
+            # still walk its own subtree one ``zarr.json`` at a time because the
+            # consolidated-metadata field lives on the *root* ``zarr.json``, not the
+            # child's. Consolidating per-element writes the field on every element's own
+            # ``zarr.json`` so a child-rooted open is a single GET regardless of depth.
             zarr.consolidate_metadata(f.store)
+            for group_name in ("images", "labels", "points", "shapes", "tables"):
+                if group_name not in f:
+                    continue
+                for element_name in f[group_name]:
+                    if element_name.startswith("."):
+                        continue
+                    zarr.consolidate_metadata(f.store, path=f"{group_name}/{element_name}")
         f.store.close()
diff --git a/src/spatialdata/_store.py b/src/spatialdata/_store.py
new file mode 100644
index 000000000..21f27273f
--- /dev/null
+++ b/src/spatialdata/_store.py
@@ -0,0 +1,143 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from dataclasses import dataclass, replace
+from pathlib import Path
+from typing import Any, TypeAlias
+
+import pyarrow.fs as pafs
+import zarr
+from upath import UPath
+from zarr.storage import FsspecStore, LocalStore
+
+PathLike: TypeAlias = Path | UPath
+
+
+def normalize_path(path: str | PathLike, storage_options: dict[str, Any] | None = None) -> PathLike:
+    if isinstance(path, str):
+        return UPath(path, **(storage_options or {})) if "://" in path else Path(path)
+    if isinstance(path, (Path, UPath)):
+        return path
+    raise TypeError("Path must be `None`, a `str`, 
a `Path` or a `UPath` object.") + + +@dataclass(frozen=True) +class ZarrStore: + path: PathLike + + def with_path(self, path: str | PathLike) -> ZarrStore: + return replace(self, path=normalize_path(path)) + + def child(self, path: str | PathLike) -> ZarrStore: + return self.with_path(self.path / path) + + def arrow_path(self) -> str: + return self.path.path if isinstance(self.path, UPath) else str(self.path) + + def arrow_filesystem(self) -> pafs.FileSystem: + if isinstance(self.path, UPath): + return pafs.PyFileSystem(pafs.FSSpecHandler(self.path.fs)) + return pafs.LocalFileSystem() + + +def make_zarr_store( + path: str | PathLike, + *, + storage_options: dict[str, Any] | None = None, +) -> ZarrStore: + return ZarrStore(path=normalize_path(path, storage_options)) + + +def make_zarr_store_from_group(group: zarr.Group) -> ZarrStore: + from spatialdata._io._utils import join_fsspec_store_path + + # zarr v3 does not wrap stores with a ``ConsolidatedMetadataStore`` (that was a v2-only + # concept); consolidated metadata is now a field on ``GroupMetadata``. So the group's + # ``.store`` is already the concrete backend store -- no unwrapping required. + store = group.store + + if isinstance(store, LocalStore): + return make_zarr_store(Path(store.root) / group.path) + if isinstance(store, FsspecStore): + protocol = getattr(store.fs, "protocol", None) + if isinstance(protocol, (list, tuple)): + protocol = protocol[0] if protocol else "file" + elif protocol is None: + protocol = "file" + # Recover the original SYNC filesystem from ``store.fs``. zarr v3's FsspecStore requires + # an async fs, so when callers pass a sync fs (e.g. ``MemoryFileSystem``) we wrap it via + # ``AsyncFileSystemWrapper``, which preserves the original on ``.sync_fs``. We must + # unwrap here because the resulting UPath flows into ``ZarrStore.arrow_filesystem()``, + # i.e. ``pafs.FSSpecHandler(fs)`` -- and pyarrow's handler is strictly sync. 
Feeding it + # an async-wrapped fs raises ``RuntimeError: Loop is not running`` at read/write time. + # The ``while`` loop tolerates (hypothetical) multi-layer wrapping across zarr versions. + # + # TODO(async-pyarrow-fs): drop this unwrap once either (a) pyarrow's FSSpecHandler learns + # to run an async fs under its own event loop, or (b) zarr exposes the original sync fs + # on FsspecStore without the AsyncFileSystemWrapper indirection (tracked at + # https://github.com/zarr-developers/zarr-python/issues/2073). At that point ``fs`` can be + # assigned directly from ``store.fs`` and the getattr probe can go. + fs = store.fs + while True: + inner = getattr(fs, "sync_fs", None) + if inner is None or inner is fs: + break + fs = inner + path = join_fsspec_store_path(store.path, group.path) + return make_zarr_store(UPath(f"{protocol}://{path}", fs=fs)) + raise ValueError(f"Unsupported store type or zarr.Group: {type(group.store)}") + + +@contextmanager +def open_read_store(store: ZarrStore) -> Any: + """Open ``store`` as a read-only backend store. + + The resolved zarr store is constructed with ``read_only=True`` so that the underlying + ``LocalStore`` / ``FsspecStore`` refuses writes at the store layer (not just at the group's + ``mode="r"`` level). This also lets remote read-only backends (e.g. public HTTPS zarrs) + skip any write-capability probe that fsspec may otherwise perform. + """ + from spatialdata._io._utils import _resolve_zarr_store + + resolved_store = _resolve_zarr_store(store.path, read_only=True) + try: + yield resolved_store + finally: + resolved_store.close() + + +def open_zarr_for_read(store: Any, *, as_group: bool = True) -> Any: + """Open a zarr group or node for reading with remote-friendly defaults. 
+ + Prefers the fast path: pinned ``zarr_format=3`` (we only ever write v3 stores, so skipping + v2-metadata auto-probes saves up to five small GETs per open on remote backends) and + ``use_consolidated=True`` (requires the root / element ``zarr.json`` to carry the + ``consolidated_metadata`` field produced by ``_write_consolidated_metadata``). Falls back + to ``zarr.open*`` with no format/consolidation hints for legacy or third-party stores that + predate either convention. + + Parameters + ---------- + store + A ``zarr.storage.StoreLike`` -- typically the value yielded by ``open_read_store``. + as_group + If ``True`` (default) use ``zarr.open_group``; if ``False`` use ``zarr.open`` which + returns either a ``Group`` or an ``Array`` based on the metadata at the store root. + """ + fn = zarr.open_group if as_group else zarr.open + try: + return fn(store, mode="r", zarr_format=3, use_consolidated=True) + except (ValueError, FileNotFoundError): + return fn(store, mode="r") + + +@contextmanager +def open_write_store(store: ZarrStore) -> Any: + """Open ``store`` as a writable backend store (``read_only=False``).""" + from spatialdata._io._utils import _resolve_zarr_store + + resolved_store = _resolve_zarr_store(store.path, read_only=False) + try: + yield resolved_store + finally: + resolved_store.close() diff --git a/tests/io/test_multi_table.py b/tests/io/test_multi_table.py index abaaea8d2..5c6bcf6e2 100644 --- a/tests/io/test_multi_table.py +++ b/tests/io/test_multi_table.py @@ -113,6 +113,8 @@ def test_set_table_nonexisting_target(self, full_sdata): def test_set_table_annotates_spatialelement(self, full_sdata, tmp_path): tmpdir = Path(tmp_path) / "tmp.zarr" del full_sdata["table"].uns[TableModel.ATTRS_KEY] + # full_sdata table has region labels2d+poly; set to labels2d only so set_table_annotates_spatialelement succeeds + full_sdata["table"].obs["region"] = pd.Categorical(["labels2d"] * full_sdata["table"].n_obs) with pytest.raises( TypeError, match="No current 
annotation metadata found. Please specify both region_key and instance_key." ): diff --git a/tests/io/test_readwrite.py b/tests/io/test_readwrite.py index 209a43046..bc220c073 100644 --- a/tests/io/test_readwrite.py +++ b/tests/io/test_readwrite.py @@ -1190,6 +1190,17 @@ def test_read_sdata(tmp_path: Path, points: SpatialData) -> None: assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_zarr_group) +def test_path_setter_coerces_str_to_path_or_upath(tmp_path: Path) -> None: + """``SpatialData.path`` is stored as Path | UPath | None; strings are normalized like ``write()``.""" + sdata = SpatialData() + p = tmp_path / "store.zarr" + sdata.path = str(p) + assert isinstance(sdata.path, Path) + assert sdata.path == p + sdata.path = "s3://bucket/key.zarr" + assert isinstance(sdata.path, UPath) + + def test_sdata_with_nan_in_obs(tmp_path: Path) -> None: """Test writing SpatialData with mixed string/NaN values in obs works correctly. diff --git a/tests/io/test_store.py b/tests/io/test_store.py new file mode 100644 index 000000000..d9ef877e6 --- /dev/null +++ b/tests/io/test_store.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import tempfile +from pathlib import Path + +import zarr +from upath import UPath +from zarr.storage import FsspecStore, LocalStore, MemoryStore + +from spatialdata._io._utils import _resolve_zarr_store +from spatialdata._store import ( + make_zarr_store, + make_zarr_store_from_group, + open_read_store, + open_write_store, +) + + +def test_make_zarr_store_normalizes_local_and_remote_paths( + tmp_path: Path, +) -> None: + local_store = make_zarr_store(str(tmp_path / "store.zarr")) + assert isinstance(local_store.path, Path) + + remote_store = make_zarr_store("s3://bucket/store.zarr") + assert isinstance(remote_store.path, UPath) + + +def test_make_zarr_store_applies_storage_options_to_remote_strings() -> None: + zarr_store = make_zarr_store("s3://bucket/store.zarr", storage_options={"anon": True}) + assert 
isinstance(zarr_store.path, UPath) + assert getattr(zarr_store.path.fs, "anon", None) is True + + +def test_open_read_and_write_store_roundtrip(tmp_path: Path) -> None: + zarr_store = make_zarr_store(tmp_path / "store.zarr") + + with open_write_store(zarr_store) as store: + group = zarr.create_group(store=store, overwrite=True) + group.attrs["answer"] = 42 + + with open_read_store(zarr_store) as store: + group = zarr.open_group(store=store, mode="r") + assert group.attrs["answer"] == 42 + + +def test_make_zarr_store_from_local_group(tmp_path: Path) -> None: + zarr_store = make_zarr_store(tmp_path / "store.zarr") + + with open_write_store(zarr_store) as store: + root = zarr.create_group(store=store, overwrite=True) + group = root.require_group("images").require_group("image") + + child_store = make_zarr_store_from_group(group) + assert child_store.path == tmp_path / "store.zarr" / "images" / "image" + + +def test_resolve_zarr_store_returns_existing_zarr_stores_unchanged() -> None: + """StoreLike inputs must not be wrapped as FsspecStore(fs=store) -- that is only for async filesystems.""" + mem = MemoryStore() + assert _resolve_zarr_store(mem) is mem + loc = LocalStore(tempfile.mkdtemp()) + assert _resolve_zarr_store(loc) is loc + + +def test_resolve_zarr_store_forwards_read_only_local(tmp_path: Path) -> None: + """``_resolve_zarr_store(..., read_only=True)`` must reach the LocalStore constructor.""" + store = _resolve_zarr_store(tmp_path / "store.zarr", read_only=True) + assert isinstance(store, LocalStore) + assert store.read_only is True + + +def test_resolve_zarr_store_forwards_read_only_remote() -> None: + """``_resolve_zarr_store(..., read_only=True)`` must reach the FsspecStore constructor.""" + from fsspec.implementations.memory import MemoryFileSystem + + upath = UPath("memory://ro-remote.zarr", fs=MemoryFileSystem(skip_instance_cache=True)) + store = _resolve_zarr_store(upath, read_only=True) + assert isinstance(store, FsspecStore) + assert store.read_only 
is True + + +def test_make_zarr_store_from_remote_group() -> None: + """Remote zarr.Group inputs keep a usable UPath and reopen through the same protocol.""" + import fsspec + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + fs = fsspec.filesystem("memory") + async_fs = AsyncFileSystemWrapper(fs, asynchronous=True) + base = FsspecStore(async_fs, path="/") + root = zarr.open_group(store=base, mode="a") + group = root.require_group("points").require_group("points") + + zarr_store = make_zarr_store_from_group(group) + assert getattr(zarr_store.path.fs, "protocol", None) == "memory" + + with open_read_store(zarr_store) as store: + assert isinstance(store, FsspecStore) diff --git a/tests/io/test_store_abstractions.py b/tests/io/test_store_abstractions.py new file mode 100644 index 000000000..315c22e4c --- /dev/null +++ b/tests/io/test_store_abstractions.py @@ -0,0 +1,289 @@ +"""Abstraction stress tests for ``SpatialData`` io against a memory-backed ``UPath``. + +These tests exercise the same read/write code paths that would be hit by a real remote +backend (S3/Azure/GCS/HTTPS), using only ``fsspec.filesystem("memory")`` and a thin +no-listing wrapper to approximate HTTP-like semantics. No emulators, no network. + +The file is deliberately scoped to the **public interface** (``SpatialData.read`` / +``SpatialData.write``) plus tamper-evident inspection of the underlying fsspec backend; +the lower-level ``ZarrStore`` / ``_resolve_zarr_store`` plumbing is unit-tested separately +in ``tests/io/test_store.py``. + +Coverage goals (generic, not provider-specific): +- ``SpatialData.read`` does not mutate backend bytes (tamper-evident snapshot equality). +- Full write / write-read-write round-trip through a remote-backed ``UPath`` for images, + labels, shapes, points, and a full sdata. The write-read-write cycle specifically pins + the categorical-schema invariant that the arrow-filesystem migration (this PR) had to + re-establish in ``_read_points``. 
+- Writing to a ``UPath`` lands the root metadata artifact in the backend. Reading via
+  consolidated metadata is left as a failing test on purpose: the invariant is stated,
+  but the fix (threading ``use_consolidated=True`` through ``read_zarr`` / the store
+  opener) is intentionally open for review discussion rather than silently suppressed.
+- A ``MemoryFileSystem`` subclass that refuses listing proves that ``SpatialData.read``
+  does not depend on directory listing for basic elements (the precondition for serving
+  public HTTPS zarrs).
+
+These tests are strictly stronger than moto/s3 emulator coverage: they need no external
+process, no subprocess, no network, and they pin the exact abstraction boundary that the
+cloud-native follow-up must not regress.
+"""
+
+from __future__ import annotations
+
+from fsspec.implementations.memory import MemoryFileSystem
+from upath import UPath
+
+from spatialdata import SpatialData
+from spatialdata.testing import assert_spatial_data_objects_are_identical
+
+
+def _fresh_memory_upath(key: str) -> UPath:
+    """Build a UPath bound to a fresh (per-test) in-memory fsspec filesystem.
+
+    ``skip_instance_cache=True`` yields a fresh instance per test; NOTE(review): fsspec's
+    ``MemoryFileSystem.store`` is class-level, so isolation also relies on unique keys.
+    """
+    fs = MemoryFileSystem(skip_instance_cache=True)
+    return UPath(f"memory://{key}.zarr", fs=fs)
+
+
+# ---------------------------------------------------------------------------
+# SpatialData.read is side-effect-free against the backend.
+# ---------------------------------------------------------------------------
+
+
+class TestReadIsSideEffectFree:
+    """``SpatialData.read`` must not mutate a single byte of the backend store.
+
+    Using a memory filesystem as a tamper-evident substrate, we snapshot every key+bytes
+    before and after the read and assert full equality. This is strictly a public-interface
+    invariant: if ``read_zarr`` (or any element reader) ever silently wrote to a remote
+    backend, this test is the first to catch it. The lower-level guarantee that
+    ``_resolve_zarr_store`` forwards ``read_only=True`` to the backend store is unit-tested
+    separately in ``tests/io/test_store.py``.
+    """
+
+    def test_spatialdata_read_does_not_mutate_backend(self, images: SpatialData) -> None:
+        upath = _fresh_memory_upath("read-only-invariant")
+        images.write(upath, overwrite=True)
+
+        fs = upath.fs
+
+        def snapshot() -> dict[str, bytes]:
+            return {key: fs.cat_file(key) for key in fs.find(upath.path)}
+
+        before = snapshot()
+        SpatialData.read(upath)
+        after = snapshot()
+
+        assert before.keys() == after.keys(), (
+            f"read added/removed backend keys; added={after.keys() - before.keys()}, "
+            f"removed={before.keys() - after.keys()}"
+        )
+        # Equality on bytes (not just on keys) is what makes this tamper-evident: even a
+        # same-size rewrite of the same key would be caught.
+        assert before == after, "read mutated bytes in the backend store"
+
+
+# ---------------------------------------------------------------------------
+# Full SpatialData round-trip through a memory-backed UPath: the generic
+# remote-backend stress test.
+# ---------------------------------------------------------------------------
+
+
+class TestMemoryUPathRoundtrip:
+    """Round-trip ``SpatialData`` objects through a memory-backed ``UPath``.
+
+    Every code path from ``make_zarr_store`` -> ``_resolve_zarr_store`` ->
+    ``open_write_store`` / ``open_read_store`` -> ``zarr.open_group(FsspecStore)`` ->
+    ``io_raster`` / ``io_shapes`` / ``io_points`` / ``io_table`` is exercised identically
+    to how it would be against S3/Azure/GCS. If any of these regresses for remote backends,
+    one of these tests must break.
+
+    Note that ``overwrite=True`` is required on every ``write()`` call that targets a
+    ``UPath`` (per the guard in ``_validate_can_safely_write_to_path``): remote existence
+    checks are unreliable across fsspec backends, so the caller must explicitly opt in.
+    """
+
+    def test_roundtrip_images_only(self, images: SpatialData) -> None:
+        upath = _fresh_memory_upath("images")
+        images.write(upath, overwrite=True)
+        read = SpatialData.read(upath)
+        assert_spatial_data_objects_are_identical(images, read)
+
+    def test_roundtrip_labels_only(self, labels: SpatialData) -> None:
+        upath = _fresh_memory_upath("labels")
+        labels.write(upath, overwrite=True)
+        read = SpatialData.read(upath)
+        assert_spatial_data_objects_are_identical(labels, read)
+
+    def test_roundtrip_shapes_only(self, shapes: SpatialData) -> None:
+        upath = _fresh_memory_upath("shapes")
+        shapes.write(upath, overwrite=True)
+        read = SpatialData.read(upath)
+        assert_spatial_data_objects_are_identical(shapes, read)
+
+    def test_roundtrip_points_only(self, points: SpatialData) -> None:
+        upath = _fresh_memory_upath("points")
+        points.write(upath, overwrite=True)
+        read = SpatialData.read(upath)
+        assert_spatial_data_objects_are_identical(points, read)
+
+    def test_write_read_write_points_preserves_categorical_schema(self, points: SpatialData) -> None:
+        """Regression guard for the arrow-filesystem categorical round-trip.
+
+        This PR migrated points io to ``to_parquet`` / ``read_parquet`` with
+        ``filesystem=arrow_fs``. ``read_parquet(filesystem=arrow_fs)`` eagerly pandas-ifies
+        pyarrow dictionaries into ``CategoricalDtype`` marked ``known=True`` with an empty
+        category list -- that would defeat ``write_points``'s ``as_known()`` normalization
+        and a subsequent ``to_parquet(filesystem=arrow_fs)`` would fail with a per-partition
+        schema mismatch (``dictionary`` vs ``dictionary``). The
+        fix lives in ``_read_points`` (demote such categoricals to unknown so that
+        ``write_points`` recomputes categories across partitions); this test pins it.
+        """
+        upath1 = _fresh_memory_upath("points-rt1")
+        upath2 = _fresh_memory_upath("points-rt2")
+        points.write(upath1, overwrite=True)
+        read = SpatialData.read(upath1)
+        read.write(upath2, overwrite=True)
+        round_tripped = SpatialData.read(upath2)
+        assert_spatial_data_objects_are_identical(points, round_tripped)
+
+    def test_write_read_write_full_sdata(self, full_sdata: SpatialData) -> None:
+        """End-to-end guard: a full sdata round-trips write -> read -> write cleanly.
+
+        Pinned for the same reason as the points-only variant above: the arrow-filesystem
+        migration in this PR had to re-establish the categorical-schema invariant on the
+        read side so that write does not fail on the second pass.
+        """
+        upath1 = _fresh_memory_upath("full-rt1")
+        upath2 = _fresh_memory_upath("full-rt2")
+        full_sdata.write(upath1, overwrite=True)
+        read = SpatialData.read(upath1)
+        read.write(upath2, overwrite=True)
+        round_tripped = SpatialData.read(upath2)
+        assert_spatial_data_objects_are_identical(full_sdata, round_tripped)
+
+    def test_roundtrip_full_sdata(self, full_sdata: SpatialData) -> None:
+        upath = _fresh_memory_upath("full")
+        full_sdata.write(upath, overwrite=True)
+        read = SpatialData.read(upath)
+        assert_spatial_data_objects_are_identical(full_sdata, read)
+
+
+# ---------------------------------------------------------------------------
+# Consolidated metadata on read.
+# ---------------------------------------------------------------------------
+
+
+class TestConsolidatedMetadataOnRead:
+    """Writing produces a consolidated-metadata artifact; the read path consumes it.
+
+    The invariant pinned here is: for an sdata built only of elements read by our own
+    code (shapes / points / tables), a single ``SpatialData.read`` over a remote-backed
+    ``UPath`` must issue very few metadata GETs. That is what consolidated metadata buys
+    us: one blob at the root (and one per element group, written by
+    ``_write_consolidated_metadata``) replaces an O(nodes) walk of small ``zarr.json``
+    / ``.zattrs`` / ``.zarray`` / ``.zgroup`` files.
+
+    Element types backed by ``ome-zarr-py`` (images / labels) still issue many small
+    GETs through ``ome_zarr``'s own ZarrLocation reader, which does a v2-style
+    ``.zattrs`` / ``.zmetadata`` walk regardless of the v3 consolidation we write at
+    the root. That is an upstream concern (``ome-zarr-py`` must learn to consume
+    ``consolidated_metadata`` on ``zarr.json``) and is intentionally *not* covered
+    here; it would wrongly make this test dependent on an external package's fix.
+    """
+
+    def test_write_produces_root_metadata_on_memory_upath(self, images: SpatialData) -> None:
+        upath = _fresh_memory_upath("consolidated")
+        images.write(upath, overwrite=True)
+        fs = upath.fs
+        # The root metadata artifact differs by zarr version: zarr v3 writes ``zarr.json``
+        # at every group, zarr v2 writes ``.zmetadata`` at the consolidated root. Accepting
+        # either keeps the test valid across versions. NOTE(review): ``fs.find`` is recursive,
+        # so this accepts the artifact at any depth, not only at the root -- confirm intent.
+        root_keys = [p.rsplit("/", 1)[-1] for p in fs.find(upath.path)]
+        assert "zarr.json" in root_keys or ".zmetadata" in root_keys, root_keys
+
+    def test_read_zarr_opens_via_consolidated_metadata(self, shapes: SpatialData) -> None:
+        # Uses the ``shapes`` fixture specifically because images/labels are read through
+        # ``ome_zarr.reader.ZarrLocation`` which bypasses our ``open_zarr_for_read`` and
+        # performs a v2-style metadata walk upstream of our code. Shapes (and points /
+        # tables) are read by our own readers which go through ``open_zarr_for_read``
+        # -- the function under test.
+        upath = _fresh_memory_upath("consolidated-read")
+        shapes.write(upath, overwrite=True)
+
+        # Count store GETs on the memory fs. Without consolidation + zarr_format=3 pinning,
+        # reading this 3-shape sdata costs ~25 small GETs (v2-metadata auto-probes + a walk
+        # of per-element ``zarr.json``). With both it costs ~7. We monkeypatch the public
+        # ``cat_file`` (the one ``MemoryFileSystem`` exposes); targeting ``_cat_file`` would
+        # silently miss every call.
+        fs = upath.fs
+        original_cat_file = fs.cat_file
+        call_count = {"n": 0}
+
+        def counting_cat_file(path, *args, **kwargs):
+            call_count["n"] += 1
+            return original_cat_file(path, *args, **kwargs)
+
+        fs.cat_file = counting_cat_file
+        try:
+            SpatialData.read(upath)
+        finally:
+            fs.cat_file = original_cat_file
+
+        # The exact bound is a documented, loose sanity check, not a micro-benchmark.
+        # 10 comfortably covers the observed 7 GETs for 3 shapes while staying well below
+        # the ~25 that an unconsolidated / v2-probing read would incur.
+        assert call_count["n"] < 10, f"expected consolidated metadata to reduce GETs, saw {call_count['n']}"
+
+
+# ---------------------------------------------------------------------------
+# HTTP-like read-only filesystem: simulates a remote that does not support listing.
+# ---------------------------------------------------------------------------
+
+
+class _NoListMemoryFileSystem(MemoryFileSystem):
+    """MemoryFileSystem that refuses directory listing, approximating HTTPS zarr semantics.
+
+    Public HTTPS zarr reads cannot do ``ls`` / ``find`` on an arbitrary prefix; they can
+    only GET known keys. This wrapper fails any listing operation so we can prove that
+    our read path does not rely on listing -- the precondition for public HTTPS datasets
+    to be readable.
+    """
+
+    def _ls(self, path, detail=True, **kwargs):  # type: ignore[override]
+        raise NotImplementedError("listing disabled to simulate HTTP-like semantics")
+
+    def ls(self, path, detail=True, **kwargs):  # type: ignore[override]
+        raise NotImplementedError("listing disabled to simulate HTTP-like semantics")
+
+    def find(self, path, **kwargs):  # type: ignore[override]
+        raise NotImplementedError("listing disabled to simulate HTTP-like semantics")
+
+
+class TestHttpLikeReadOnlyStore:
+    """Approximate HTTPS zarr semantics: a read-only filesystem that refuses listing.
+
+    The point is not to re-test zarr's FsspecStore but to catch the case where our own
+    ``read_zarr`` implementation (or an element reader) assumes it can list a directory.
+    That is exactly the pattern that breaks when pointed at a real public HTTPS zarr.
+    """
+
+    def test_read_sdata_from_no_list_fs(self, images: SpatialData, tmp_path) -> None:
+        # Write locally, then copy bytes into a no-list memory fs so that the backend
+        # resembles a public HTTPS zarr: every known key is readable but listing is disabled.
+        local_path = tmp_path / "local.zarr"
+        images.write(local_path)
+
+        no_list_fs = _NoListMemoryFileSystem(skip_instance_cache=True)
+        remote_root = "no-list.zarr"
+        for p in local_path.rglob("*"):
+            if p.is_file():
+                rel = p.relative_to(local_path).as_posix()
+                no_list_fs.pipe_file(f"{remote_root}/{rel}", p.read_bytes())
+
+        upath = UPath(f"memory://{remote_root}", fs=no_list_fs)
+        read = SpatialData.read(upath)
+        assert_spatial_data_objects_are_identical(images, read)