Skip to content
2 changes: 1 addition & 1 deletion candi/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .depmap import DepMapAPI, DepMapData
from .depmap import DepMapAPI, DepMapData
55 changes: 55 additions & 0 deletions candi/data/_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from typing import Any


class CancerDataNamespace:
    """Reusable namespace for attribute-style dataset access under `.data`.

    The namespace keeps no data of its own; it only holds a reference to a
    parent object and forwards to two mappings on it:

    * ``parent._datasets`` -- name -> dataset object already held in memory.
    * ``parent._paths`` -- names of datasets known to be available.

    Reading ``ns.<name>`` returns the in-memory dataset; assigning
    ``ns.<name> = obj`` is routed through :meth:`add`.
    """

    __slots__ = ("_parent",)

    def __init__(self, parent: Any) -> None:
        # Bypass our own __setattr__, which would route through add().
        object.__setattr__(self, "_parent", parent)

    def __getattr__(self, name: str) -> Any:
        """Resolve ``name`` against the parent's datasets.

        Raises:
            AttributeError: if the dataset is known but not loaded, if no such
                dataset is defined, or if the namespace has no parent yet.
        """
        # Read the slot without going through __getattr__ again. On a bare
        # instance (e.g. the one copy/pickle create via __new__ before state
        # is restored) `self._parent` would re-enter this method forever and
        # end in RecursionError, which hasattr() does not swallow.
        try:
            parent = object.__getattribute__(self, "_parent")
        except AttributeError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            ) from None

        if name in parent._datasets:
            return parent._datasets[name]
        if name in parent._paths:
            raise AttributeError(
                f"Dataset '{name}' is available but not loaded. Call `.load('{name}')` first."
            )
        raise AttributeError(f"No dataset named '{name}' defined.")

    def __setattr__(self, name: str, value: Any) -> None:
        """Store ``value`` as dataset ``name`` (never overwrites silently)."""
        if name == "_parent":
            object.__setattr__(self, name, value)
            return
        self.add(name=name, dataset=value)

    def __dir__(self):
        """List regular attributes plus every available/loaded dataset name."""
        return sorted(
            set(super().__dir__())
            | set(self._parent._paths)
            | set(self._parent._datasets)
        )

    def add(self, name: str, dataset: Any, overwrite: bool = False) -> None:
        """Add a dataset to this namespace.

        Args:
            name: Attribute name for the dataset; must be a non-empty string
                that is a valid Python identifier.
            dataset: Object to store under ``name``.
            overwrite: Allow replacing a name that is already loaded or that
                is listed among the parent's available datasets.

        Raises:
            ValueError: if ``name`` is invalid, clashes with an existing
                attribute of the namespace object, or is already taken and
                ``overwrite`` is false.
        """
        # Type check first so objects with unusual truthiness are rejected
        # with the intended ValueError rather than whatever bool() raises.
        if not isinstance(name, str) or not name:
            raise ValueError("Dataset name must be a non-empty string.")
        if not name.isidentifier():
            raise ValueError(
                f"Dataset name '{name}' is not a valid Python identifier for attribute access."
            )
        # Real attributes (methods, slots, dunders) win over __getattr__, so a
        # dataset stored under such a name could never be reached.
        if name in object.__dir__(self):
            raise ValueError(
                f"Dataset name '{name}' conflicts with an existing namespace attribute."
            )
        if name in self._parent._datasets and not overwrite:
            raise ValueError(
                f"Dataset '{name}' is already loaded. Pass overwrite=True to replace it."
            )
        if name in self._parent._paths and not overwrite:
            raise ValueError(
                f"Dataset '{name}' is already defined as an available built-in dataset. "
                f"Pass overwrite=True to replace it."
            )
        self._parent._datasets[name] = dataset
54 changes: 12 additions & 42 deletions candi/data/depmap.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
import subprocess
import pandas as pd

import anndata as ad
import pandas as pd
from tqdm import tqdm

from ._database import CancerDataNamespace

Comment thread
abearab marked this conversation as resolved.
LATEST_VERSION = "26Q1"
FILES_URL = 'https://depmap.org/portal/api/download/files'

Expand Down Expand Up @@ -65,7 +68,6 @@ def download_essential(self, gzip=True):
"OmicsSomaticMutationsMatrixDamaging.csv",
"OmicsCNGeneWGS.csv",
"CRISPRGeneDependency.csv",
"CRISPRScreenMap.csv",
"CRISPRGeneEffect.csv",
"OmicsCNSegmentsWGS.csv"
]
Expand All @@ -82,38 +84,17 @@ class DepMapData:
Provides attribute-style access to datasets (e.g., obj.data.Model).
"""

class DataNamespace:
"""Namespace object for accessing loaded datasets under .data with type hints for autocomplete."""
def __init__(self, parent):
self._parent = parent
class DataNamespace(CancerDataNamespace):
"""Namespace object for dataset access under `.data`."""

## DepMap main datasets ##

# meta data
# DepMap main datasets
Model: pd.DataFrame
# omics data
OmicsExpression: pd.DataFrame
OmicsSomaticMutations: pd.DataFrame
OmicsSomaticMutationsMatrixDamaging: pd.DataFrame
OmicsCNGeneWGS: pd.DataFrame
OmicsProteinAbundance: pd.DataFrame
# crispr data
CRISPRGeneDependency: pd.DataFrame
CRISPRGeneEffect: pd.DataFrame
# PRISM drug sensitivity data
CRISPRScreenMap: ad.AnnData
# Proteomic data



def __getattr__(self, name):
if name in self._parent._datasets:
return self._parent._datasets[name]
if name in self._parent._paths:
raise AttributeError(
f"Dataset '{name}' is available but not loaded. Call `.load('{name}')` first."
)
raise AttributeError(f"No dataset named '{name}' defined.")

def __init__(self, data_dir, version=LATEST_VERSION):
self.data_dir = data_dir
Expand Down Expand Up @@ -145,9 +126,7 @@ def _get_dataset_paths(self):
"OmicsCNSegmentsWGS": os.path.join(base, "OmicsCNSegmentsWGS.csv.gz"),
"CRISPRGeneDependency": os.path.join(base, "CRISPRGeneDependency.csv.gz"),
"CRISPRGeneEffect": os.path.join(base, "CRISPRGeneEffect.csv.gz"),
"CRISPRScreenMap": os.path.join(base, "CRISPRScreenMap.csv.gz"),
"PRISMDrugSensitivity": os.path.join(self.data_dir, "PRISM_fold_change_viability.h5ad.gz"),
"OmicsProteinAbundance": os.path.join(self.data_dir, "CCLE_protein_quantitation.tab")
}

def _check_paths_exist(self):
Expand Down Expand Up @@ -199,19 +178,6 @@ def load(self, name, inplace=True, engine='pandas', **kwargs):

data = df.copy()

elif name in {
"OmicsProteinAbundance",
}:
if engine == 'polars':
# NotImplementedError
raise NotImplementedError("Polars engine is not yet implemented for loading datasets.")
elif engine == 'pandas':
df = pd.read_csv(path, sep='\t', index_col=1, header=0).drop(columns=['UniprotID','EntrezID']).T
df.index.name = "ModelID"
df.columns.name = None

data = df.copy()

elif name in {
"OmicsExpression","OmicsCNGeneWGS",
"OmicsSomaticMutationsMatrixDamaging",
Expand Down Expand Up @@ -266,7 +232,11 @@ def load_all(self):

def list_available(self):
    """List every dataset name known to this object.

    Names from ``self._paths`` come first, followed by any names that exist
    only in ``self._datasets`` (e.g. datasets added at runtime). Each name
    appears once, in first-seen order.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys([*self._paths, *self._datasets]))

def add_dataset(self, name, dataset, overwrite=False):
    """Register a user-supplied dataset in the in-memory `.data` namespace.

    Delegates to ``self.data.add``; validation and conflict handling are
    whatever that method implements.
    """
    namespace = self.data
    namespace.add(
        name=name,
        dataset=dataset,
        overwrite=overwrite,
    )

def get(self, name):
"""Retrieve dataset if already loaded, otherwise prompt to load it."""
Expand Down