class CancerDataNamespace:
    """Reusable namespace for attribute-style dataset access under `.data`.

    The namespace holds no data itself. Every lookup is delegated to the
    parent object passed to the constructor, which must expose:

    * ``_datasets`` -- a mutable mapping of dataset name -> loaded dataset;
    * ``_paths`` -- a container of the dataset names that are available
      (only membership tests and iteration are performed on it).
    """

    __slots__ = ("_parent",)

    def __init__(self, parent: Any) -> None:
        """Bind the namespace to *parent*, the owner of the dataset stores."""
        # Bypass our own __setattr__, which would otherwise treat the
        # assignment as a dataset registration.
        object.__setattr__(self, "_parent", parent)

    def __getattr__(self, name: str) -> Any:
        """Return the loaded dataset called *name*.

        Only invoked when normal attribute lookup fails.

        Raises:
            AttributeError: if the dataset is available but not loaded, if no
                such dataset is defined, or if the namespace has not been
                bound to a parent yet.
        """
        if name == "_parent":
            # The slot is unset (instance created via __new__, e.g. while
            # being copied or unpickled). Reading self._parent below would
            # re-enter __getattr__ and recurse until RecursionError, so fail
            # with a plain AttributeError that hasattr()/getattr() can handle.
            raise AttributeError(name)

        parent = self._parent
        if name in parent._datasets:
            return parent._datasets[name]
        if name in parent._paths:
            raise AttributeError(
                f"Dataset '{name}' is available but not loaded. Call `.load('{name}')` first."
            )
        raise AttributeError(f"No dataset named '{name}' defined.")

    def __setattr__(self, name: str, value: Any) -> None:
        """Register *value* as dataset *name* (``ns.name = value``).

        Assignment never overwrites: it goes through :meth:`add` with
        ``overwrite=False`` and raises ``ValueError`` on any conflict.
        """
        if name == "_parent":
            # The single real attribute of the namespace; store it in the slot.
            object.__setattr__(self, name, value)
            return
        self.add(name=name, dataset=value)

    def __dir__(self):
        """List regular attributes plus available and loaded dataset names."""
        parent = self._parent
        return sorted(
            set(super().__dir__())
            | set(parent._paths)
            | set(parent._datasets)
        )

    def add(self, name: str, dataset: Any, overwrite: bool = False) -> None:
        """Add a dataset to this namespace.

        Args:
            name: Attribute name under which the dataset becomes accessible.
            dataset: The object to store in the parent's ``_datasets``.
            overwrite: Allow replacing a dataset that is already loaded or
                that is defined in the parent's ``_paths``.

        Raises:
            ValueError: if *name* is empty, not a string, not a valid
                identifier, collides with a real attribute of the namespace,
                or is already taken while ``overwrite`` is false.
        """
        if not name or not isinstance(name, str):
            raise ValueError("Dataset name must be a non-empty string.")
        if not name.isidentifier():
            raise ValueError(
                f"Dataset name '{name}' is not a valid Python identifier for attribute access."
            )
        # Real attributes (methods, dunders, the slot) always win over
        # __getattr__, so a dataset stored under such a name would be
        # unreachable through attribute access.
        if name in object.__dir__(self):
            raise ValueError(
                f"Dataset name '{name}' conflicts with an existing namespace attribute."
            )

        parent = self._parent
        if name in parent._datasets and not overwrite:
            raise ValueError(
                f"Dataset '{name}' is already loaded. Pass overwrite=True to replace it."
            )
        if name in parent._paths and not overwrite:
            raise ValueError(
                f"Dataset '{name}' is already defined as an available built-in dataset. "
                f"Pass overwrite=True to replace it."
            )
        parent._datasets[name] = dataset
- ) - raise AttributeError(f"No dataset named '{name}' defined.") def __init__(self, data_dir, version=LATEST_VERSION): self.data_dir = data_dir @@ -145,9 +126,7 @@ def _get_dataset_paths(self): "OmicsCNSegmentsWGS": os.path.join(base, "OmicsCNSegmentsWGS.csv.gz"), "CRISPRGeneDependency": os.path.join(base, "CRISPRGeneDependency.csv.gz"), "CRISPRGeneEffect": os.path.join(base, "CRISPRGeneEffect.csv.gz"), - "CRISPRScreenMap": os.path.join(base, "CRISPRScreenMap.csv.gz"), "PRISMDrugSensitivity": os.path.join(self.data_dir, "PRISM_fold_change_viability.h5ad.gz"), - "OmicsProteinAbundance": os.path.join(self.data_dir, "CCLE_protein_quantitation.tab") } def _check_paths_exist(self): @@ -199,19 +178,6 @@ def load(self, name, inplace=True, engine='pandas', **kwargs): data = df.copy() - elif name in { - "OmicsProteinAbundance", - }: - if engine == 'polars': - # NotImplementedError - raise NotImplementedError("Polars engine is not yet implemented for loading datasets.") - elif engine == 'pandas': - df = pd.read_csv(path, sep='\t', index_col=1, header=0).drop(columns=['UniprotID','EntrezID']).T - df.index.name = "ModelID" - df.columns.name = None - - data = df.copy() - elif name in { "OmicsExpression","OmicsCNGeneWGS", "OmicsSomaticMutationsMatrixDamaging", @@ -266,7 +232,11 @@ def load_all(self): def list_available(self): """List all available datasets for this version.""" - return list(self._paths.keys()) + return list(dict.fromkeys([*self._paths.keys(), *self._datasets.keys()])) + + def add_dataset(self, name, dataset, overwrite=False): + """Add a user-provided dataset to the in-memory namespace.""" + self.data.add(name=name, dataset=dataset, overwrite=overwrite) def get(self, name): """Retrieve dataset if already loaded, otherwise prompt to load it."""