diff --git a/README.md b/README.md index da3ffe5..ccf6db4 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,564 @@ -# Task Template +# Spatial Segmentation -This repo is a template to create a new task for the OpenProblems v2. This repo contains several example files and components that can be used when updated with the task info. -> [!WARNING] -> This README will be overwritten when performing the `create_task_readme` script. + -## Create a repository from this template +A one sentence summary of purpose and methodology. Used for creating an +overview tables. -> [!IMPORTANT] -> Before creating a new repository, make sure you are part of the OpenProblems task team. This will be done when you create an issue for the task and you get the go ahead to create the task. -> For more information on how to create a new task, check out the [Create a new task](https://openproblems.bio/documentation/create_task/) documentation. +Repository: +[openproblems-bio/task_template](https://github.com/openproblems-bio/task_template) -The instructions below will guide you through creating a new repository from this template ([creating-a-repository-from-a-template](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template)). +## Description +Provide a clear and concise description of your task, detailing the +specific problem it aims to solve. Outline the input data types, the +expected output, and any assumptions or constraints. Be sure to explain +any terminology or concepts that are essential for understanding the +task. -* Click the "Use this template" button on the top right of the repository. -* Use the Owner dropdown menu to select the `openproblems-bio` account. -* Type a name for your repository (task_...), and a description. -* Set the repository visibility to public. -* Click "Create repository from template". +Explain the motivation behind your proposed task. 
Describe the +biological or computational problem you aim to address and why it’s +important. Discuss the current state of research in this area and any +gaps or challenges that your task could help address. This section +should convince readers of the significance and relevance of your task. -## Clone the repository +## Authors & contributors -To clone the repository with the submodule files, you can use the following command: +| Name | Roles | Linkedin | Twitter | Email | Github | Orcid | +|:---|:---|:---|:---|:---|:---|:---| +| John Doe | author, maintainer | johndoe | johndoe | john@doe.me | johndoe | 0000-0000-0000-0000 | -```bash -git clone --recursive git@github.com:openproblems-bio/.git +## API + +``` mermaid +flowchart TB + file_common_ist("Common iST Dataset") + comp_data_processor[/"Data processor"/] + file_spatial_dataset("Raw iST Dataset") + file_scrnaseq_reference("scRNA-seq Reference") + comp_control_method[/"Control Method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_prediction("Predicted data") + file_score("Score") + file_common_scrnaseq("Common SC Dataset") + file_common_ist---comp_data_processor + comp_data_processor-->file_spatial_dataset + comp_data_processor-->file_scrnaseq_reference + file_spatial_dataset---comp_control_method + file_spatial_dataset---comp_method + file_scrnaseq_reference---comp_control_method + file_scrnaseq_reference---comp_metric + comp_control_method-->file_prediction + comp_method-->file_prediction + comp_metric-->file_score + file_prediction---comp_metric + file_common_scrnaseq---comp_data_processor ``` ->[!NOTE] -> If somehow there are no files visible in the submodule after cloning using the above command. Check the instructions [here](common/README.md). -## What to do next +## File format: Common iST Dataset + +An unprocessed spatial imaging dataset stored as a zarr file. 
+ +Example file: +`resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr` + +Description: + +This dataset contains raw images, labels, points, shapes, and tables as +output by a dataset loader. + +Format: + +
+ + SpatialData object + images: 'image', 'image_3D', 'he_image' + labels: 'cell_labels', 'nucleus_labels' + points: 'transcripts' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'metadata' + coordinate_systems: 'global' + +
+ +Data structure: + +
+ +*images* + +| Name | Description | +|:-----------|:------------------------------------| +| `image` | The raw image data. | +| `image_3D` | (*Optional*) The raw 3D image data. | +| `he_image` | (*Optional*) H&E image data. | + +*labels* + +| Name | Description | +|:-----------------|:---------------------------------------| +| `cell_labels` | (*Optional*) Cell segmentation labels. | +| `nucleus_labels` | (*Optional*) Cell segmentation labels. | + +*points* + +`transcripts`: Point cloud data of transcripts. + +| Column | Type | Description | +|:---|:---|:---| +| `x` | `float` | x-coordinate of the point. | +| `y` | `float` | y-coordinate of the point. | +| `z` | `float` | (*Optional*) z-coordinate of the point. | +| `feature_name` | `categorical` | Name of the feature. | +| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | +| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | +| `cell_type` | `string` | (*Optional*) Cell type of the cell. | +| `qv` | `float` | (*Optional*) Quality value of the point. | +| `transcript_id` | `long` | Unique identifier of the transcript. | +| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | + +*shapes* + +`cell_boundaries`: Cell boundaries. + +| Column | Type | Description | +|:-----------|:---------|:-------------------------------| +| `geometry` | `object` | Geometry of the cell boundary. | + +`nucleus_boundaries`: Nucleus boundaries. + +| Column | Type | Description | +|:-----------|:---------|:----------------------------------| +| `geometry` | `object` | Geometry of the nucleus boundary. | + +*tables* + +`metadata`: Metadata of spatial dataset. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `string` | A unique identifier for the cell. | +| `var["gene_ids"]` | `string` | Unique identifier for the gene. | +| `var["feature_types"]` | `string` | Type of the feature. 
| +| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | +| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | + +*coordinate_systems* + +| Name | Description | +|:---------|:------------------------------------| +| `global` | Coordinate system of the replicate. | + +
+ +## Component type: Data processor + +A component that preprocesses a common iST dataset and a common single-cell dataset into the task-specific spatial dataset and scRNA-seq reference. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | +| `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | +| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | + +
+ +## File format: Raw iST Dataset + +A spatial transcriptomics dataset, preprocessed for this benchmark. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` + +Description: + +This dataset contains preprocessed images, labels, points, shapes, and +tables for spatial transcriptomics data. + +Format: + +
+ + SpatialData object + images: 'image', 'image_3D', 'he_image' + labels: 'cell_labels', 'nucleus_labels' + points: 'transcripts' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'metadata' + coordinate_systems: 'global' + +
+ +Data structure: + +
+ +*images* + +| Name | Description | +|:-----------|:------------------------------------| +| `image` | The raw image data. | +| `image_3D` | (*Optional*) The raw 3D image data. | +| `he_image` | (*Optional*) H&E image data. | + +*labels* + +| Name | Description | +|:-----------------|:---------------------------------------| +| `cell_labels` | (*Optional*) Cell segmentation labels. | +| `nucleus_labels` | (*Optional*) Cell segmentation labels. | + +*points* + +`transcripts`: Point cloud data of transcripts. + +| Column | Type | Description | +|:---|:---|:---| +| `x` | `float` | x-coordinate of the point. | +| `y` | `float` | y-coordinate of the point. | +| `z` | `float` | (*Optional*) z-coordinate of the point. | +| `feature_name` | `categorical` | Name of the feature. | +| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | +| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | +| `cell_type` | `string` | (*Optional*) Cell type of the cell. | +| `qv` | `float` | (*Optional*) Quality value of the point. | +| `transcript_id` | `long` | Unique identifier of the transcript. | +| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | + +*shapes* + +`cell_boundaries`: Cell boundaries. + +| Column | Type | Description | +|:-----------|:---------|:-------------------------------| +| `geometry` | `object` | Geometry of the cell boundary. | + +`nucleus_boundaries`: Nucleus boundaries. + +| Column | Type | Description | +|:-----------|:---------|:----------------------------------| +| `geometry` | `object` | Geometry of the nucleus boundary. | + +*tables* + +`metadata`: Metadata of spatial dataset. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `string` | A unique identifier for the cell. | +| `var["gene_ids"]` | `string` | Unique identifier for the gene. | +| `var["feature_types"]` | `string` | Type of the feature. 
| +| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | +| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | + +*coordinate_systems* + +| Name | Description | +|:---------|:------------------------------------| +| `global` | Coordinate system of the replicate. | + +
+ +## File format: scRNA-seq Reference + +A single-cell reference dataset, preprocessed for this benchmark. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad` + +Description: + +This dataset contains preprocessed counts and metadata for single-cell +RNA-seq data. + +Format: + +
+ + AnnData object + obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid' + var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | +| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | +| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | +| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | +| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | +| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. 
If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | +| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | +| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | +| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | +| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | +| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | +| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | +| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. | +| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | +| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
| +| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | +| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | +| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | +| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | +| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. 
| +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `integer` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | + +
+ +## Component type: Control Method + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | + +
+ +## Component type: Method + +A spatial segmentation method. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | + +
+ +## Component type: Metric + +A metric for scoring a predicted segmentation against the scRNA-seq reference. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) File indicating the score of a metric. | + +
+ +## File format: Predicted data + +A predicted dataset as output by a method. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad` + +Format: + +
+ + SpatialData object + labels: 'segmentation' + tables: 'table' + +
+ +Data structure: + +
+ +*labels* + +| Name | Description | +|:---------------|:--------------------------| +| `segmentation` | Segmentation of the data. | + +*tables* + +`table`: AnnData table. + +| Slot | Type | Description | +|:-----------------|:---------|:------------| +| `obs["cell_id"]` | `string` | Cell ID. | +| `obs["region"]` | `string` | Region. | + +
+ +## File format: Score + +File indicating the score of a metric. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ +## File format: Common SC Dataset + +An unprocessed dataset as output by a dataset loader. + +Example file: +`resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad` + +Description: + +This dataset contains raw counts and metadata as output by a dataset +loader. + +The format of this file is mainly derived from the [CELLxGENE schema +v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). + +Format: + +
+ + AnnData object + obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid' + var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Data structure: + +
-Check out the [instructions](https://github.com/openproblems-bio/common_resources/blob/main/INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands. +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | +| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | +| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | +| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | +| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. 
| +| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | +| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | +| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | +| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | +| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | +| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | +| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | +| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. 
| +| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | +| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. | +| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | +| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | +| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | +| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | +| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). 
For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `integer` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. 
| +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -For more information on the OpenProblems v2, check out the [documentation](https://openproblems.bio/documentation/). \ No newline at end of file +
diff --git a/_viash.yaml b/_viash.yaml index 17b6093..31ad320 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -18,7 +18,7 @@ links: # Step 4: Update the label, summary and description. # A unique, human-readable, short label. Used for creating summary tables and visualisations. -label: Template +label: Spatial Segmentation summary: A one sentence summary of purpose and methodology. Used for creating an overview tables. description: | Provide a clear and concise description of your task, detailing the specific problem it aims @@ -48,6 +48,12 @@ references: info: image: The name of the image file to use for the component on the website. test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium_rep1/ + dest: resources_test/common/2023_10x_mouse_brain_xenium_rep1/ + - type: s3 + path: s3://openproblems-data/resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/ + dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/ - type: s3 path: s3://openproblems-data/resources_test/task_spatial_segmentation/ dest: resources_test/task_spatial_segmentation diff --git a/scripts/create_test_resources/mouse_brain_combined.sh b/scripts/create_test_resources/mouse_brain_combined.sh index fdc4e37..05e4de9 100755 --- a/scripts/create_test_resources/mouse_brain_combined.sh +++ b/scripts/create_test_resources/mouse_brain_combined.sh @@ -16,12 +16,12 @@ fi # we can just copy them for now aws s3 sync --profile op \ - s3://openproblems-data/resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr \ - resources_test/task_spatial_segmentation/mouse_brain_combined/raw_ist.zarr + s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \ + resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr aws s3 cp --profile op \ - s3://openproblems-data/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad \ - 
resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad + s3://openproblems-data/resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ + resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad # ...additional preprocessing if needed ... diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index ca86340..54d8e3d 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -29,8 +29,8 @@ nextflow run . \ -resume \ -c common/nextflow_helpers/labels_ci.config \ --id cxg_mouse_pancreas_atlas \ - --input_train resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad \ - --input_test resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad \ - --input_solution resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad \ + --input_train resources_test/task_spatial_segmentation/mouse_brain_combined/train.h5ad \ + --input_test resources_test/task_spatial_segmentation/mouse_brain_combined/test.h5ad \ + --input_solution resources_test/task_spatial_segmentation/mouse_brain_combined/solution.h5ad \ --output_state state.yaml \ --publish_dir "$publish_dir" diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index f637aed..3f4fa2e 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -12,16 +12,12 @@ info: the task, and also as a quality control for the metrics defined in the task. 
arguments: - - name: --input_train - __merge__: file_train.yaml + - name: --input + __merge__: file_spatial_dataset.yaml required: true direction: input - - name: --input_test - __merge__: file_test.yaml - required: true - direction: input - - name: "--input_solution" - __merge__: file_solution.yaml + - name: "--input_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml direction: input required: true - name: --output @@ -33,5 +29,5 @@ test_resources: path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/cxg_mouse_pancreas_atlas - dest: resources_test/task_template/cxg_mouse_pancreas_atlas \ No newline at end of file + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined \ No newline at end of file diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 1ed53bd..22c77aa 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -6,26 +6,32 @@ info: summary: A data processor. description: | A component for processing a Common Dataset into a task-specific dataset. 
-arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - direction: input - required: true - - name: "--output_train" - __merge__: file_train.yaml - direction: output - required: true - - name: "--output_test" - __merge__: file_test.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true +argument_groups: + - name: Inputs + arguments: + - name: "--input_sp" + __merge__: file_common_ist.yaml + required: true + direction: input + - name: "--input_sc" + __merge__: file_common_scrnaseq.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_spatial_dataset" + __merge__: file_spatial_dataset.yaml + direction: output + required: true + - name: "--output_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml + direction: output + required: true test_resources: - - path: /resources_test/common/cxg_mouse_pancreas_atlas - dest: resources_test/common/cxg_mouse_pancreas_atlas + - path: /resources_test/common/2023_10x_mouse_brain_xenium_rep1 + dest: resources_test/common/2023_10x_mouse_brain_xenium_rep1 + - path: /resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2 + dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2 - type: python_script path: /common/component_tests/run_and_check_output.py diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 3a93846..633a8a1 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -7,14 +7,10 @@ info: description: | A method to predict the task effects. 
arguments: - - name: --input_train - __merge__: file_train.yaml + - name: --input + __merge__: file_spatial_dataset.yaml required: true direction: input - - name: "--input_test" - __merge__: file_test.yaml - direction: input - required: true - name: --output __merge__: file_prediction.yaml required: true @@ -24,5 +20,5 @@ test_resources: path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/cxg_mouse_pancreas_atlas - dest: resources_test/task_template/cxg_mouse_pancreas_atlas \ No newline at end of file + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 1c76a3d..a7470e9 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -7,14 +7,14 @@ info: description: | A metric for evaluating method predictions. 
arguments: - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - name: "--input_prediction" __merge__: file_prediction.yaml direction: input required: true + - name: "--input_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml + direction: input + required: true - name: "--output" __merge__: file_score.yaml direction: output @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/cxg_mouse_pancreas_atlas - dest: resources_test/task_template/cxg_mouse_pancreas_atlas + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml deleted file mode 100644 index e8a74a0..0000000 --- a/src/api/file_common_dataset.yaml +++ /dev/null @@ -1,72 +0,0 @@ -type: file -example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" -label: "Common Dataset" -summary: A subset of the common dataset. -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: cell_type - description: Cell type information - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/api/file_common_ist.yaml b/src/api/file_common_ist.yaml new file mode 100644 index 0000000..ec45900 --- /dev/null +++ b/src/api/file_common_ist.yaml @@ -0,0 +1,171 @@ +type: file +example: "resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr" +label: "Common iST Dataset" +summary: An unprocessed spatial imaging dataset stored as a zarr file. +description: | + This dataset contains raw images, labels, points, shapes, and tables as output by a dataset loader. 
+info: + format: + type: spatialdata_zarr + images: + - type: object + name: image + description: The raw image data + required: true + - type: object + name: image_3D + description: The raw 3D image data + required: false + - type: object + name: he_image + description: H&E image data + required: false + labels: + - type: object + name: "cell_labels" + description: Cell segmentation labels + required: false + - type: object + name: "nucleus_labels" + description: Nucleus segmentation labels + required: false + # - type: datatree + # name: "{segm}_3D" + # description: Custom segmentation of the 3D data + # required: false + # - type: datatree + # name: "expert_segm_{patch}" + # description: Expert segmentation of a patch of the data + # required: false + # - type: DataTree[zyx] + # name: "expert_segm_{patch}_3D" + # description: Expert segmentation of a 3D patch of the data + # required: false + points: + - type: dataframe + name: transcripts + description: Point cloud data of transcripts + required: true + columns: + - type: float + name: "x" + required: true + description: x-coordinate of the point + - type: float + name: "y" + required: true + description: y-coordinate of the point + - type: float + name: "z" + required: false + description: z-coordinate of the point + - type: categorical + name: feature_name + required: true + description: Name of the feature + - type: integer + name: "cell_id" + required: false + description: Unique identifier of the cell + - type: integer + name: "nucleus_id" + required: false + description: Unique identifier of the nucleus + - type: string + name: "cell_type" + required: false + description: Cell type of the cell + - type: float + name: qv + required: false + description: Quality value of the point + - type: long + name: transcript_id + required: true + description: Unique identifier of the transcript + - type: boolean + name: overlaps_nucleus + required: false + description: Whether the point overlaps with a nucleus + shapes: 
+ - type: dataframe + name: "cell_boundaries" + description: Cell boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the cell boundary + - type: dataframe + name: "nucleus_boundaries" + description: Nucleus boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the nucleus boundary + tables: + - type: anndata + name: "metadata" + description: Metadata of spatial dataset + required: true + uns: + - type: string + name: dataset_id + required: true + description: A unique identifier for the dataset + - type: string + name: dataset_name + required: true + description: A human-readable name for the dataset + - type: string + name: dataset_url + required: true + description: Link to the original source of the dataset + - type: string + name: dataset_reference + required: true + description: Bibtex reference of the paper in which the dataset was published + - type: string + name: dataset_summary + required: true + description: Short description of the dataset + - type: string + name: dataset_description + required: true + description: Long description of the dataset + - type: string + name: dataset_organism + required: true + description: The organism of the sample in the dataset + - type: string + name: segmentation_id + required: true + multiple: true + description: A unique identifier for the segmentation + obs: + - type: string + name: cell_id + required: true + description: A unique identifier for the cell + var: + - type: string + name: gene_ids + required: true + description: Unique identifier for the gene + - type: string + name: feature_types + required: true + description: Type of the feature + obsm: + - type: double + name: spatial + required: true + description: Spatial coordinates of the cell + coordinate_systems: + - type: object + name: global + description: Coordinate system of the replicate + required: true diff --git 
a/src/api/file_common_scrnaseq.yaml b/src/api/file_common_scrnaseq.yaml new file mode 100644 index 0000000..6986efb --- /dev/null +++ b/src/api/file_common_scrnaseq.yaml @@ -0,0 +1,259 @@ +type: file +example: "resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad" +label: "Common SC Dataset" +summary: An unprocessed dataset as output by a dataset loader. +description: | + This dataset contains raw counts and metadata as output by a dataset loader. + + The format of this file is mainly derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: integer + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: true + + - type: string + name: cell_type_level2 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_level3 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_level4 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. 
+ required: false + + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. + required: false + + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. + required: false + + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. 
+ required: false + + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. + required: false + + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. + required: false + + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. 
+ required: false + + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: false + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. 
+ # TODO: make this required once the dataloader supports it + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true + + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true + + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + + varm: + - type: double + name: pca_loadings + description: The PCA loadings matrix. + required: true + + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + multiple: true diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml index 26068ab..23850bb 100644 --- a/src/api/file_prediction.yaml +++ b/src/api/file_prediction.yaml @@ -1,26 +1,27 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr" label: "Predicted data" summary: A predicted dataset as output by a method. info: format: - type: h5ad - obs: - - type: string - name: label_pred - description: Predicted labels for the test cells. + type: spatialdata_zarr + labels: + - type: object + name: "segmentation" + description: Segmentation of the data required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" + tables: + - type: anndata + name: table + description: AnnData table required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true \ No newline at end of file + obs: + - type: string + name: cell_id + description: Cell ID + required: true + - type: string + name: region + description: Region + required: true diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml index 8bdad65..7ad49c8 100644 --- a/src/api/file_score.yaml +++ b/src/api/file_score.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/score.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad" label: Score summary: "File indicating the score of a metric." 
info: diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml new file mode 100644 index 0000000..06d8491 --- /dev/null +++ b/src/api/file_scrnaseq_reference.yaml @@ -0,0 +1,9 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad" +# TODO: revert to the original example once file exists +# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" +label: "scRNA-seq Reference" +summary: A single-cell reference dataset, preprocessed for this benchmark. +description: | + This dataset contains preprocessed counts and metadata for single-cell RNA-seq data. +__merge__: file_common_scrnaseq.yaml \ No newline at end of file diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml deleted file mode 100644 index d2f6200..0000000 --- a/src/api/file_solution.yaml +++ /dev/null @@ -1,73 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad" -label: "Solution" -summary: "The solution for the test data" -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_dataset.yaml new file mode 100644 index 0000000..5668a3f --- /dev/null +++ b/src/api/file_spatial_dataset.yaml @@ -0,0 +1,9 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr" +# TODO: revert to the original example once file exists +# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" +label: "Raw iST Dataset" +summary: A spatial transcriptomics dataset, preprocessed for this benchmark. +description: | + This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data. 
+__merge__: file_common_ist.yaml diff --git a/src/api/file_test.yaml b/src/api/file_test.yaml deleted file mode 100644 index cb9d9a6..0000000 --- a/src/api/file_test.yaml +++ /dev/null @@ -1,45 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad" -label: "Test data" -summary: The subset of molecules used for the test dataset -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true \ No newline at end of file diff --git a/src/api/file_train.yaml b/src/api/file_train.yaml deleted file mode 100644 index c01eda5..0000000 --- a/src/api/file_train.yaml +++ /dev/null @@ -1,49 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad" -label: "Training data" -summary: "The training data in h5ad format" -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 3eb56c2..7cca2bd 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -6,14 +6,10 @@ ## VIASH START par = { - 'input': 'resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'method': 'batch', - 'seed': None, - 'obs_batch': 'batch', - 'obs_label': 'cell_type', - 'output_train': 'train.h5ad', - 'output_test': 'test.h5ad', - 'output_solution': 'solution.h5ad' + 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', + 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', + 'output_spatial_dataset': 'output_spatial_dataset.zarr', + 'output_scrnaseq_reference': 'output_scrnaseq_reference.h5ad', } meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', diff --git a/src/methods/cellpose/config.vsh.yaml b/src/methods/cellpose/config.vsh.yaml new file mode 100644 index 0000000..46be884 --- /dev/null +++ b/src/methods/cellpose/config.vsh.yaml @@ -0,0 +1,67 @@ +name: cellpose +label: "Cellpose" +# TODO: update the summary, description and links +summary: "Output of the segmentation method cellpose" +description: "Output of the segmentation method cellpose" +links: # these should point to the documentation of the method + documentation: "https://github.com/openproblems-bio/task_ist_preprocessing" + repository: "https://github.com/openproblems-bio/task_ist_preprocessing" +references: + doi: "10.1038/s41592-020-01018-x" + + +__merge__: /src/api/comp_method.yaml + +arguments: + - name: --diameter + type: double + description: "Cell 
diameter in pixels. If not set, cellpose runs a size model to estimate it (slower)." + info: + test_default: 30 + + - name: --flow_threshold + type: double + description: "Flow error threshold. Set to 0 to skip flow quality check for faster execution." + info: + test_default: 0 + + - name: --niter + type: integer + description: "Number of iterations for dynamics. Lower values are faster but less accurate." + info: + test_default: 10 + + - name: --min_size + type: integer + description: "Minimum number of pixels per mask. Set to -1 to skip small mask removal." + info: + test_default: -1 + + - name: --resample + type: boolean + description: "Whether to run dynamics at original image size. Disabling is faster." + info: + test_default: false + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + pypi: cellpose + - type: python + script: from cellpose.models import CellposeModel; model = CellposeModel() + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [ midtime, midcpu, veryhighmem, gpu ] diff --git a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py new file mode 100644 index 0000000..4949b8d --- /dev/null +++ b/src/methods/cellpose/script.py @@ -0,0 +1,51 @@ +import dask.array as da +import numpy as np +import os +import shutil +import spatialdata as sd +import xarray as xr +from cellpose.models import CellposeModel +from spatialdata.models import Labels2DModel + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr' +} +meta = { + 'name': 'cellpose' +} +## VIASH END + +print('Reading input', flush=True) +sdata = 
sd.read_zarr(par["input"]) +image = sdata['morphology_mip']['scale0'].image.compute().to_numpy() +transformation = sdata['morphology_mip']['scale0'].image.transform.copy() + +print('Initializing Cellpose model', flush=True) +model = CellposeModel() + +eval_params = {k: par[k] for k in ("diameter", "flow_threshold", "niter", "min_size", "resample") if par.get(k) is not None} +print(f"Running Cellpose segmentation with parameters: {eval_params}") +masks, _, _ = model.eval(image[0], progress=True, **eval_params) + +print('Cellpose segmentation finished, post-processing results', flush=True) +# Convert to smallest sufficient unsigned int dtype +max_val = masks.max() +for dtype in (np.uint8, np.uint16, np.uint32, np.uint64): + if max_val <= np.iinfo(dtype).max: + masks = masks.astype(dtype) + break + +print('Segmentation done, preparing output', flush=True) +sd_output = sd.SpatialData() +# Wrap masks as a single-chunk dask array with flat chunk shape for zarr v3 compat +dask_masks = da.from_array(masks, chunks=masks.shape) +data_array = xr.DataArray(dask_masks, name='segmentation', dims=('y', 'x')) +parsed = Labels2DModel.parse(data_array, transformations=transformation) +sd_output.labels['segmentation'] = parsed + +print('Saving output', flush=True) +if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) +sd_output.write(par["output"]) diff --git a/src/methods/logistic_regression/config.vsh.yaml b/src/methods/logistic_regression/config.vsh.yaml deleted file mode 100644 index e570c03..0000000 --- a/src/methods/logistic_regression/config.vsh.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_method.yaml - - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. 
-name: logistic_regression -# A relatively short label, used when rendering visualisations (required) -label: Logistic Regression -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. -description: | - Logistic Regression estimates parameters of a logistic function for - multivariate classification tasks. Here, we use 100-dimensional whitened PCA - coordinates as independent variables, and the model minimises the cross - entropy loss over all cell type classes. -# Metadata for your component -# A reference key from the bibtex library at src/common/library.bib (required). -references: - bibtex: - - | - @book{hosmer2013applied, - title = {Applied logistic regression}, - author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, - year = {2013}, - publisher = {John Wiley \& Sons}, - volume = {398} - } - -links: - # URL to the code repository for this method (required). - repository: https://github.com/scikit-learn/scikit-learn - # URL to the documentation for this method (required). - documentation: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" - -info: - # Which normalisation method this component prefers to use (required). - preferred_normalization: log_cp10k - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. 
- -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py deleted file mode 100644 index 6ab5782..0000000 --- a/src/methods/logistic_regression/script.py +++ /dev/null @@ -1,46 +0,0 @@ -import anndata as ad -import sklearn.linear_model - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad', - 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'logistic_regression' -} -## VIASH END - -print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... -classifier = sklearn.linear_model.LogisticRegression() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print('Generate predictions', flush=True) -# ... generate predictions ... 
-obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] - }, - obs={ - 'label_pred': obs_label_pred - } -) -output.obs_names = input_test.obs_names - -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index d2e4915..c71286a 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -4,24 +4,24 @@ namespace: workflows argument_groups: - name: Inputs arguments: - - name: "--input" - __merge__: /src/api/file_common_dataset.yaml + - name: "--input_sp" + __merge__: /src/api/file_common_ist.yaml + required: true + direction: input + - name: "--input_sc" + __merge__: /src/api/file_common_scrnaseq.yaml required: true direction: input - name: Outputs arguments: - - name: "--output_train" - __merge__: /src/api/file_train.yaml - required: true + - name: "--output_spatial_dataset" + __merge__: /src/api/file_spatial_dataset.yaml direction: output - - name: "--output_test" - __merge__: /src/api/file_test.yaml required: true + - name: "--output_scrnaseq_reference" + __merge__: /src/api/file_scrnaseq_reference.yaml direction: output - - name: "--output_solution" - __merge__: /src/api/file_solution.yaml required: true - direction: output resources: - type: nextflow_script diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 2732475..947a8f1 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -14,41 +14,43 @@ workflow run_wf { main: output_ch = input_ch - | check_dataset_with_schema.run( - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input") - def schemaYaml = 
tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset": checks["exit_code"] == 0 ? state.input : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset != null - } + // | check_dataset_with_schema.run( + // fromState: { id, state -> + // def schema = findArgumentSchema(meta.config, "input") + // def schemaYaml = tempFile("schema.yaml") + // writeYaml(schema, schemaYaml) + // [ + // "input": state.input, + // "schema": schemaYaml + // ] + // }, + // toState: { id, output, state -> + // // read the output to see if dataset passed the qc + // def checks = readYaml(output.output) + // state + [ + // "dataset": checks["exit_code"] == 0 ? state.input : null, + // ] + // } + // ) + + // // remove datasets which didn't pass the schema check + // | filter { id, state -> + // state.dataset != null + // } | process_dataset.run( - fromState: [ input: "dataset" ], + fromState: [ + input_sp: "input_sp", + "input_sc": "input_sc" + ], toState: [ - output_train: "output_train", - output_test: "output_test", - output_solution: "output_solution" + output_spatial_dataset: "output_spatial_dataset", + output_scrnaseq_reference: "output_scrnaseq_reference" ] ) // only output the files for which an output file was specified - | setState(["output_train", "output_test", "output_solution"]) + | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) emit: output_ch diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4c1602d..4ab5f83 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -4,20 +4,13 @@ namespace: workflows argument_groups: - name: Inputs arguments: - - name: "--input_train" - __merge__: 
/src/api/file_train.yaml - type: file - direction: input - required: true - - name: "--input_test" - __merge__: /src/api/file_test.yaml - type: file - direction: input + - name: "--input_spatial_dataset" + __merge__: /src/api/file_spatial_dataset.yaml + direction: input + required: true - - name: "--input_solution" - __merge__: /src/api/file_solution.yaml - type: file - direction: input + - name: "--input_scrnaseq_reference" + __merge__: /src/api/file_scrnaseq_reference.yaml + direction: input + required: true - name: Outputs arguments: @@ -65,7 +58,7 @@ dependencies: - name: utils/extract_uns_metadata repository: openproblems - name: control_methods/true_labels - - name: methods/logistic_regression + - name: methods/cellpose - name: metrics/accuracy runners: diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 826dec4..fe25140 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -8,7 +8,7 @@ workflow auto { // construct list of methods and control methods methods = [ true_labels, - logistic_regression + cellpose ] // construct list of metrics