From a31d84a67379861193193ff99492a2c6a197fa14 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:19:54 +0200 Subject: [PATCH 01/17] update test resource --- scripts/create_test_resources/mouse_brain_combined.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/create_test_resources/mouse_brain_combined.sh b/scripts/create_test_resources/mouse_brain_combined.sh index fdc4e37..05e4de9 100755 --- a/scripts/create_test_resources/mouse_brain_combined.sh +++ b/scripts/create_test_resources/mouse_brain_combined.sh @@ -16,12 +16,12 @@ fi # we can just copy them for now aws s3 sync --profile op \ - s3://openproblems-data/resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr \ - resources_test/task_spatial_segmentation/mouse_brain_combined/raw_ist.zarr + s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \ + resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr aws s3 cp --profile op \ - s3://openproblems-data/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad \ - resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad + s3://openproblems-data/resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ + resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad # ...additional preprocessing if needed ... 
From 270e500352627b0c28d46afaf1b4f09defb1003e Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:20:13 +0200 Subject: [PATCH 02/17] update api files --- src/api/comp_data_processor.yaml | 42 +++-- src/api/file_common_dataset.yaml | 72 --------- src/api/file_common_ist.yaml | 171 ++++++++++++++++++++ src/api/file_common_scrnaseq.yaml | 259 ++++++++++++++++++++++++++++++ 4 files changed, 455 insertions(+), 89 deletions(-) delete mode 100644 src/api/file_common_dataset.yaml create mode 100644 src/api/file_common_ist.yaml create mode 100644 src/api/file_common_scrnaseq.yaml diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 1ed53bd..333680e 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -6,23 +6,31 @@ info: summary: A data processor. description: | A component for processing a Common Dataset into a task-specific dataset. -arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - direction: input - required: true - - name: "--output_train" - __merge__: file_train.yaml - direction: output - required: true - - name: "--output_test" - __merge__: file_test.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true +argument_groups: + - name: Inputs + arguments: + - name: "--input_sp" + __merge__: file_common_ist.yaml + required: true + direction: input + - name: "--input_sc" + __merge__: file_common_scrnaseq.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_train" + __merge__: file_train.yaml + direction: output + required: true + - name: "--output_test" + __merge__: file_test.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true test_resources: - path: /resources_test/common/cxg_mouse_pancreas_atlas dest: resources_test/common/cxg_mouse_pancreas_atlas 
diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml deleted file mode 100644 index e8a74a0..0000000 --- a/src/api/file_common_dataset.yaml +++ /dev/null @@ -1,72 +0,0 @@ -type: file -example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" -label: "Common Dataset" -summary: A subset of the common dataset. -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: cell_type - description: Cell type information - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. 
- required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/api/file_common_ist.yaml b/src/api/file_common_ist.yaml new file mode 100644 index 0000000..529b5d2 --- /dev/null +++ b/src/api/file_common_ist.yaml @@ -0,0 +1,171 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr" +label: "Common iST Dataset" +summary: An unprocessed spatial imaging dataset stored as a zarr file. +description: | + This dataset contains raw images, labels, points, shapes, and tables as output by a dataset loader. +info: + format: + type: spatialdata_zarr + images: + - type: object + name: image + description: The raw image data + required: true + - type: object + name: image_3D + description: The raw 3D image data + required: false + - type: object + name: he_image + description: H&E image data + required: false + labels: + - type: object + name: "cell_labels" + description: Cell segmentation labels + required: false + - type: object + name: "nucleus_labels" + description: Cell segmentation labels + required: false + # - type: datatree + # name: "{segm}_3D" + # description: Custom segmentation of the 3D data + # required: false + # - type: datatree + # name: "expert_segm_{patch}" + # description: Expert segmentation of a patch of the data + # required: false + # - type: DataTree[zyx] + # name: "expert_segm_{patch}_3D" + # description: Expert segmentation of a 3D patch of the data + # required: false + points: + - type: dataframe + name: transcripts + description: Point cloud data of transcripts + required: true + columns: + - type: float + name: "x" + required: true + description: x-coordinate of the point + - type: float + name: "y" + required: true + description: y-coordinate of the point + - type: float + name: "z" + required: false + description: z-coordinate of the point + - type: categorical + name: feature_name + required: true + description: Name of 
the feature + - type: integer + name: "cell_id" + required: false + description: Unique identifier of the cell + - type: integer + name: "nucleus_id" + required: false + description: Unique identifier of the nucleus + - type: string + name: "cell_type" + required: false + description: Cell type of the cell + - type: float + name: qv + required: false + description: Quality value of the point + - type: long + name: transcript_id + required: true + description: Unique identifier of the transcript + - type: boolean + name: overlaps_nucleus + required: false + description: Whether the point overlaps with a nucleus + shapes: + - type: dataframe + name: "cell_boundaries" + description: Cell boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the cell boundary + - type: dataframe + name: "nucleus_boundaries" + description: Nucleus boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the nucleus boundary + tables: + - type: anndata + name: "metadata" + description: Metadata of spatial dataset + required: true + uns: + - type: string + name: dataset_id + required: true + description: A unique identifier for the dataset + - type: string + name: dataset_name + required: true + description: A human-readable name for the dataset + - type: string + name: dataset_url + required: true + description: Link to the original source of the dataset + - type: string + name: dataset_reference + required: true + description: Bibtex reference of the paper in which the dataset was published + - type: string + name: dataset_summary + required: true + description: Short description of the dataset + - type: string + name: dataset_description + required: true + description: Long description of the dataset + - type: string + name: dataset_organism + required: true + description: The organism of the sample in the dataset + - type: string + name: segmentation_id + 
required: true + multiple: true + description: A unique identifier for the segmentation + obs: + - type: string + name: cell_id + required: true + description: A unique identifier for the cell + var: + - type: string + name: gene_ids + required: true + description: Unique identifier for the gene + - type: string + name: feature_types + required: true + description: Type of the feature + obsm: + - type: double + name: spatial + required: true + description: Spatial coordinates of the cell + coordinate_systems: + - type: object + name: global + description: Coordinate system of the replicate + required: true diff --git a/src/api/file_common_scrnaseq.yaml b/src/api/file_common_scrnaseq.yaml new file mode 100644 index 0000000..0a15596 --- /dev/null +++ b/src/api/file_common_scrnaseq.yaml @@ -0,0 +1,259 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad" +label: "Common SC Dataset" +summary: An unprocessed dataset as output by a dataset loader. +description: | + This dataset contains raw counts and metadata as output by a dataset loader. + + The format of this file is mainly derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). +info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: integer + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: true + + - type: string + name: cell_type_level2 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. 
+ required: false + + - type: string + name: cell_type_level3 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_level4 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: false + + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. + required: false + + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. 
+ required: false + + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. + required: false + + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
+ required: false + + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. 
+ required: false + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true + + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true + + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + + varm: + - type: double + name: pca_loadings + description: The PCA loadings matrix. + required: true + + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. 
+ required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true From 034c10c7c8e6aee002052e8cc8f2832d0f9b6c29 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:25:38 +0200 Subject: [PATCH 03/17] update api files --- _viash.yaml | 2 +- scripts/run_benchmark/run_test_local.sh | 6 +- src/api/comp_control_method.yaml | 16 ++---- src/api/comp_data_processor.yaml | 12 ++-- src/api/comp_method.yaml | 12 ++-- src/api/comp_metric.yaml | 12 ++-- src/api/file_prediction.yaml | 2 +- src/api/file_score.yaml | 2 +- src/api/file_scrnaseq_reference.yaml | 7 +++ src/api/file_solution.yaml | 73 ------------------------- src/api/file_spatial_dataset.yaml | 7 +++ src/api/file_test.yaml | 45 --------------- src/api/file_train.yaml | 49 ----------------- 13 files changed, 40 insertions(+), 205 deletions(-) create mode 100644 src/api/file_scrnaseq_reference.yaml delete mode 100644 src/api/file_solution.yaml create mode 100644 src/api/file_spatial_dataset.yaml delete mode 100644 src/api/file_test.yaml delete mode 100644 src/api/file_train.yaml diff --git a/_viash.yaml b/_viash.yaml index 17b6093..38c3830 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -18,7 +18,7 @@ links: # Step 4: Update the label, summary and description. # A unique, human-readable, short label. Used for creating summary tables and visualisations. -label: Template +label: Spatial Segmentation summary: A one sentence summary of purpose and methodology. Used for creating an overview tables. 
description: | Provide a clear and concise description of your task, detailing the specific problem it aims diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index ca86340..54d8e3d 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -29,8 +29,8 @@ nextflow run . \ -resume \ -c common/nextflow_helpers/labels_ci.config \ --id cxg_mouse_pancreas_atlas \ - --input_train resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad \ - --input_test resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad \ - --input_solution resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad \ + --input_train resources_test/task_spatial_segmentation/mouse_brain_combined/train.h5ad \ + --input_test resources_test/task_spatial_segmentation/mouse_brain_combined/test.h5ad \ + --input_solution resources_test/task_spatial_segmentation/mouse_brain_combined/solution.h5ad \ --output_state state.yaml \ --publish_dir "$publish_dir" diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index f637aed..3f4fa2e 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -12,16 +12,12 @@ info: the task, and also as a quality control for the metrics defined in the task. 
arguments: - - name: --input_train - __merge__: file_train.yaml + - name: --input + __merge__: file_spatial_dataset.yaml required: true direction: input - - name: --input_test - __merge__: file_test.yaml - required: true - direction: input - - name: "--input_solution" - __merge__: file_solution.yaml + - name: "--input_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml direction: input required: true - name: --output @@ -33,5 +29,5 @@ test_resources: path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/cxg_mouse_pancreas_atlas - dest: resources_test/task_template/cxg_mouse_pancreas_atlas \ No newline at end of file + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined \ No newline at end of file diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 333680e..8c02b1d 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,16 +19,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_train" - __merge__: file_train.yaml + - name: "--output_spatial_dataset" + __merge__: file_spatial_dataset.yaml direction: output required: true - - name: "--output_test" - __merge__: file_test.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml + - name: "--output_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml direction: output required: true test_resources: diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 3a93846..633a8a1 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -7,14 +7,10 @@ info: description: | A method to predict the task effects. 
arguments: - - name: --input_train - __merge__: file_train.yaml + - name: --input + __merge__: file_spatial_dataset.yaml required: true direction: input - - name: "--input_test" - __merge__: file_test.yaml - direction: input - required: true - name: --output __merge__: file_prediction.yaml required: true @@ -24,5 +20,5 @@ test_resources: path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/cxg_mouse_pancreas_atlas - dest: resources_test/task_template/cxg_mouse_pancreas_atlas \ No newline at end of file + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 1c76a3d..a7470e9 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -7,14 +7,14 @@ info: description: | A metric for evaluating method predictions. 
arguments: - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - name: "--input_prediction" __merge__: file_prediction.yaml direction: input required: true + - name: "--input_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml + direction: input + required: true - name: "--output" __merge__: file_score.yaml direction: output @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/cxg_mouse_pancreas_atlas - dest: resources_test/task_template/cxg_mouse_pancreas_atlas + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml index 26068ab..d8d042b 100644 --- a/src/api/file_prediction.yaml +++ b/src/api/file_prediction.yaml @@ -1,6 +1,6 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad" label: "Predicted data" summary: A predicted dataset as output by a method. info: diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml index 8bdad65..7ad49c8 100644 --- a/src/api/file_score.yaml +++ b/src/api/file_score.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/score.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad" label: Score summary: "File indicating the score of a metric." 
info: diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml new file mode 100644 index 0000000..52db7e9 --- /dev/null +++ b/src/api/file_scrnaseq_reference.yaml @@ -0,0 +1,7 @@ +type: file +example: "resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad" +label: "scRNA-seq Reference" +summary: A single-cell reference dataset, preprocessed for this benchmark. +description: | + This dataset contains preprocessed counts and metadata for single-cell RNA-seq data. +__merge__: file_common_scrnaseq.yaml \ No newline at end of file diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml deleted file mode 100644 index d2f6200..0000000 --- a/src/api/file_solution.yaml +++ /dev/null @@ -1,73 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad" -label: "Solution" -summary: "The solution for the test data" -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. 
- required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_dataset.yaml new file mode 100644 index 0000000..0b86628 --- /dev/null +++ b/src/api/file_spatial_dataset.yaml @@ -0,0 +1,7 @@ +type: file +example: "resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr" +label: "Raw iST Dataset" +summary: A spatial transcriptomics dataset, preprocessed for this benchmark. +description: | + This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data. 
+__merge__: file_common_ist.yaml diff --git a/src/api/file_test.yaml b/src/api/file_test.yaml deleted file mode 100644 index cb9d9a6..0000000 --- a/src/api/file_test.yaml +++ /dev/null @@ -1,45 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad" -label: "Test data" -summary: The subset of molecules used for the test dataset -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true \ No newline at end of file diff --git a/src/api/file_train.yaml b/src/api/file_train.yaml deleted file mode 100644 index c01eda5..0000000 --- a/src/api/file_train.yaml +++ /dev/null @@ -1,49 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad" -label: "Training data" -summary: "The training data in h5ad format" -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true \ No newline at end of file From 85e16db4866ff20b18a1f7b1a60d1a5949725afb Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:26:31 +0200 Subject: [PATCH 04/17] update par in process_dataset --- src/data_processors/process_dataset/script.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 3eb56c2..7cca2bd 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -6,14 +6,10 @@ ## VIASH START par = { - 'input': 'resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'method': 'batch', - 'seed': None, - 'obs_batch': 'batch', - 'obs_label': 'cell_type', - 'output_train': 'train.h5ad', - 'output_test': 'test.h5ad', - 'output_solution': 'solution.h5ad' + 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', + 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', + 'output_spatial_dataset': 'output_spatial_dataset.zarr', + 'output_scrnaseq_reference': 'output_scrnaseq_reference.h5ad', } meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', From 57f9642cd3791fa54dac1fa1c596a69095c8e98f Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:26:38 +0200 Subject: [PATCH 05/17] update readme --- README.md | 408 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 385 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index da3ffe5..f505bb8 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,399 @@ -# Task Template +# Spatial Segmentation -This repo is a template to create a 
new task for the OpenProblems v2. This repo contains several example files and components that can be used when updated with the task info. -> [!WARNING] -> This README will be overwritten when performing the `create_task_readme` script. + -## Create a repository from this template +A one sentence summary of purpose and methodology. Used for creating an +overview tables. -> [!IMPORTANT] -> Before creating a new repository, make sure you are part of the OpenProblems task team. This will be done when you create an issue for the task and you get the go ahead to create the task. -> For more information on how to create a new task, check out the [Create a new task](https://openproblems.bio/documentation/create_task/) documentation. +Repository: +[openproblems-bio/task_template](https://github.com/openproblems-bio/task_template) -The instructions below will guide you through creating a new repository from this template ([creating-a-repository-from-a-template](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template)). +## Description +Provide a clear and concise description of your task, detailing the +specific problem it aims to solve. Outline the input data types, the +expected output, and any assumptions or constraints. Be sure to explain +any terminology or concepts that are essential for understanding the +task. -* Click the "Use this template" button on the top right of the repository. -* Use the Owner dropdown menu to select the `openproblems-bio` account. -* Type a name for your repository (task_...), and a description. -* Set the repository visibility to public. -* Click "Create repository from template". +Explain the motivation behind your proposed task. Describe the +biological or computational problem you aim to address and why it’s +important. Discuss the current state of research in this area and any +gaps or challenges that your task could help address. 
This section +should convince readers of the significance and relevance of your task. -## Clone the repository +## Authors & contributors -To clone the repository with the submodule files, you can use the following command: +| name | roles | +|:---------|:-------------------| +| John Doe | author, maintainer | -```bash -git clone --recursive git@github.com:openproblems-bio/.git +## API + +``` mermaid +flowchart TB + file_common_ist("Common iST Dataset") + comp_data_processor[/"Data processor"/] + file_scrnaseq_reference("scRNA-seq Reference") + file_spatial_dataset("Raw iST Dataset") + comp_control_method[/"Control Method"/] + comp_metric[/"Metric"/] + comp_method[/"Method"/] + file_prediction("Predicted data") + file_score("Score") + file_common_scrnaseq("Common SC Dataset") + file_common_ist---comp_data_processor + comp_data_processor-->file_scrnaseq_reference + comp_data_processor-->file_spatial_dataset + file_scrnaseq_reference---comp_control_method + file_scrnaseq_reference---comp_metric + file_spatial_dataset---comp_control_method + file_spatial_dataset---comp_method + comp_control_method-->file_prediction + comp_metric-->file_score + comp_method-->file_prediction + file_prediction---comp_metric + file_common_scrnaseq---comp_data_processor ``` ->[!NOTE] -> If somehow there are no files visible in the submodule after cloning using the above command. Check the instructions [here](common/README.md). -## What to do next +## File format: Common iST Dataset + +An unprocessed spatial imaging dataset stored as a zarr file. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` + +Description: + +This dataset contains raw images, labels, points, shapes, and tables as +output by a dataset loader. + +Format: + +
+ +
+ +Data structure: + +
+ +
+ +## Component type: Data processor + +A data processor. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | +| `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | +| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | + +
+ +## File format: scRNA-seq Reference + +A single-cell reference dataset, preprocessed for this benchmark. + +Example file: +`resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad` + +Description: + +This dataset contains preprocessed counts and metadata for single-cell +RNA-seq data. + +Format: + +
+ + AnnData object + obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid' + var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | +| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | +| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | +| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | +| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | +| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. 
If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | +| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | +| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | +| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | +| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | +| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | +| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | +| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. | +| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | +| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
| +| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | +| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | +| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | +| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | +| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. 
| +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `integer` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | + +
+ +## File format: Raw iST Dataset + +A spatial transcriptomics dataset, preprocessed for this benchmark. + +Example file: +`resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr` + +Description: + +This dataset contains preprocessed images, labels, points, shapes, and +tables for spatial transcriptomics data. + +Format: + +
+ +
+ +Data structure: + +
+ +
+ +## Component type: Control Method + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | + +
+ +## Component type: Metric + +A task template metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) File indicating the score of a metric. | + +
+ +## Component type: Method + +A method. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | + +
+ +## File format: Predicted data + +A predicted dataset as output by a method. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad` + +Format: + +
+ + AnnData object + obs: 'label_pred' + uns: 'dataset_id', 'normalization_id', 'method_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:-------------------------------------| +| `obs["label_pred"]` | `string` | Predicted labels for the test cells. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +File indicating the score of a metric. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ +## File format: Common SC Dataset + +An unprocessed dataset as output by a dataset loader. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad` + +Description: + +This dataset contains raw counts and metadata as output by a dataset +loader. + +The format of this file is mainly derived from the [CELLxGENE schema +v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). + +Format: + +
+ + AnnData object + obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid' + var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | +| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | +| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | +| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | +| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | +| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. 
If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | +| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | +| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | +| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | +| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | +| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | +| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | +| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. | +| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | +| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
| +| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | +| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | +| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | +| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | +| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. 
| +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `integer` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -Check out the [instructions](https://github.com/openproblems-bio/common_resources/blob/main/INSTRUCTIONS.md) for more information on how to update the example files and components. 
These instructions also contain information on how to build out the task and basic commands. +
-For more information on the OpenProblems v2, check out the [documentation](https://openproblems.bio/documentation/). \ No newline at end of file From 59cd82740d98c15ae8ff3ca1e8204082610be388 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:31:02 +0200 Subject: [PATCH 06/17] update prediction api --- README.md | 11 ----------- src/api/file_prediction.yaml | 35 ++++++++++++++++++----------------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index f505bb8..8d4beee 100644 --- a/README.md +++ b/README.md @@ -264,23 +264,12 @@ Format:
- AnnData object - obs: 'label_pred' - uns: 'dataset_id', 'normalization_id', 'method_id' -
Data structure:
-| Slot | Type | Description | -|:--------------------------|:---------|:-------------------------------------| -| `obs["label_pred"]` | `string` | Predicted labels for the test cells. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -
## File format: Score diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml index d8d042b..23850bb 100644 --- a/src/api/file_prediction.yaml +++ b/src/api/file_prediction.yaml @@ -5,22 +5,23 @@ label: "Predicted data" summary: A predicted dataset as output by a method. info: format: - type: h5ad - obs: - - type: string - name: label_pred - description: Predicted labels for the test cells. + type: spatialdata_zarr + labels: + - type: object + name: "segmentation" + description: Segmentation of the data required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" + tables: + - type: anndata + name: table + description: AnnData table required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true \ No newline at end of file + obs: + - type: string + name: cell_id + description: Cell ID + required: true + - type: string + name: region + description: Region + required: true From 4815ce7110c3d65debbe33be6fc0035fc35f5afd Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:36:06 +0200 Subject: [PATCH 07/17] use workaround examples for now --- src/api/file_scrnaseq_reference.yaml | 4 +++- src/api/file_spatial_dataset.yaml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml index 52db7e9..06d8491 100644 --- a/src/api/file_scrnaseq_reference.yaml +++ b/src/api/file_scrnaseq_reference.yaml @@ -1,5 +1,7 @@ type: file -example: "resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad" +# TODO: revert to the original example once file exists +# example: 
"resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad" label: "scRNA-seq Reference" summary: A single-cell reference dataset, preprocessed for this benchmark. description: | diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_dataset.yaml index 0b86628..5668a3f 100644 --- a/src/api/file_spatial_dataset.yaml +++ b/src/api/file_spatial_dataset.yaml @@ -1,5 +1,7 @@ type: file -example: "resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr" +# TODO: revert to the original example once file exists +# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" label: "Raw iST Dataset" summary: A spatial transcriptomics dataset, preprocessed for this benchmark. description: | From d29b0eb5d4222f09cf26e52da33c08bdc5d3207c Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 10:40:10 +0200 Subject: [PATCH 08/17] copy cellpose from task_ist_preprocessing --- src/methods/cellpose/config.vsh.yaml | 97 +++++++++++++++++++ src/methods/cellpose/script.py | 57 +++++++++++ .../logistic_regression/config.vsh.yaml | 79 --------------- src/methods/logistic_regression/script.py | 46 --------- 4 files changed, 154 insertions(+), 125 deletions(-) create mode 100644 src/methods/cellpose/config.vsh.yaml create mode 100644 src/methods/cellpose/script.py delete mode 100644 src/methods/logistic_regression/config.vsh.yaml delete mode 100644 src/methods/logistic_regression/script.py diff --git a/src/methods/cellpose/config.vsh.yaml b/src/methods/cellpose/config.vsh.yaml new file mode 100644 index 0000000..d5fd3f0 --- /dev/null +++ b/src/methods/cellpose/config.vsh.yaml @@ -0,0 +1,97 @@ +name: cellpose +label: "Cellpose" +# TODO: update the summary, description and links +summary: "Output of the segmentation method cellpose" +description: "Output of the segmentation method cellpose"
+links: # these should point to the documentation of the method + documentation: "https://github.com/openproblems-bio/task_ist_preprocessing" + repository: "https://github.com/openproblems-bio/task_ist_preprocessing" +references: + doi: "10.1038/s41592-020-01018-x" + + +__merge__: /src/api/comp_method.yaml + +arguments: + - name: --batch_size + type: integer + default: 8 + - name: --model_type + type: string + default: "cyto" + - name: --resample + type: boolean + default: True + - name: --channel_axis + type: string + default: "None" + - name: --z_axis + type: string + default: "None" + - name: --normalize + type: boolean + default: True + - name: --invert + type: boolean + default: False + - name: --rescale + type: string + default: "None" + - name: --diameter + type: double + default: 30.0 #default should be None with cellpose v4 + - name: --do_3D + type: boolean + default: False + - name: --anisotropy + type: string + default: "None" +# - name: --net_avg +# type: boolean +# default: False + - name: --augment + type: boolean + default: False + #- name: --tile + # type: boolean + # default: True + - name: --tile_overlap + type: double + default: 0.1 + #- name: --interp #Seems to be removed in v4 + # type: boolean + # default: True + - name: --flow_threshold + type: double + default: 0.4 + - name: --cellprob_threshold + type: double + default: 0.0 + - name: --min_size + type: integer + default: 15 + - name: --stitch_threshold + type: double + default: 0.0 + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + pypi: cellpose<4.0.0 + __merge__: + - /src/base/setup_txsim_partial.yaml + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [ midtime, midcpu, veryhighmem, gpu ] diff --git 
a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py new file mode 100644 index 0000000..15f8590 --- /dev/null +++ b/src/methods/cellpose/script.py @@ -0,0 +1,57 @@ +import txsim as tx +import numpy as np +import os +import yaml +import spatialdata as sd +import anndata as ad +import shutil +import numpy as np +from spatialdata.models import Labels2DModel +import xarray as xr + + +def convert_to_lower_dtype(arr): + max_val = arr.max() + if max_val <= np.iinfo(np.uint8).max: + new_dtype = np.uint8 + elif max_val <= np.iinfo(np.uint16).max: + new_dtype = np.uint16 + elif max_val <= np.iinfo(np.uint32).max: + new_dtype = np.uint32 + else: + new_dtype = np.uint64 + + return arr.astype(new_dtype) + +## VIASH START +par = { + "input": "../task_ist_preprocessing/resources_test/common/2023_10x_mouse_brain_xenium/dataset.zarr", + "output": "segmentation.zarr" +} + +## VIASH END + +hyperparameters = par.copy() + +hyperparameters = {k:(v if v != "None" else None) for k,v in hyperparameters.items()} +del hyperparameters['input'] +del hyperparameters['output'] + +sdata = sd.read_zarr(par["input"]) +image = sdata['morphology_mip']['scale0'].image.compute().to_numpy() +transformation = sdata['morphology_mip']['scale0'].image.transform.copy() + +sd_output = sd.SpatialData() +image = sdata['morphology_mip']['scale0'].image.compute().to_numpy() +transformation = sdata['morphology_mip']['scale0'].image.transform.copy() +img_arr = tx.preprocessing.segment_cellpose(image[0], hyperparameters) +image = convert_to_lower_dtype(img_arr) +data_array = xr.DataArray(image, name=f'segmentation', dims=('y', 'x')) +parsed_data = Labels2DModel.parse(data_array, transformations=transformation) +sd_output.labels['segmentation'] = parsed_data + +print("Writing output", flush=True) +if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) +sd_output.write(par["output"]) + diff --git a/src/methods/logistic_regression/config.vsh.yaml b/src/methods/logistic_regression/config.vsh.yaml 
deleted file mode 100644 index e570c03..0000000 --- a/src/methods/logistic_regression/config.vsh.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_method.yaml - - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: logistic_regression -# A relatively short label, used when rendering visualisations (required) -label: Logistic Regression -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. -description: | - Logistic Regression estimates parameters of a logistic function for - multivariate classification tasks. Here, we use 100-dimensional whitened PCA - coordinates as independent variables, and the model minimises the cross - entropy loss over all cell type classes. -# Metadata for your component -# A reference key from the bibtex library at src/common/library.bib (required). -references: - bibtex: - - | - @book{hosmer2013applied, - title = {Applied logistic regression}, - author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, - year = {2013}, - publisher = {John Wiley \& Sons}, - volume = {398} - } - -links: - # URL to the code repository for this method (required). - repository: https://github.com/scikit-learn/scikit-learn - # URL to the documentation for this method (required). - documentation: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" - -info: - # Which normalisation method this component prefers to use (required). 
- preferred_normalization: log_cp10k - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py deleted file mode 100644 index 6ab5782..0000000 --- a/src/methods/logistic_regression/script.py +++ /dev/null @@ -1,46 +0,0 @@ -import anndata as ad -import sklearn.linear_model - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad', - 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'logistic_regression' -} -## VIASH END - -print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... 
-classifier = sklearn.linear_model.LogisticRegression() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print('Generate predictions', flush=True) -# ... generate predictions ... -obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] - }, - obs={ - 'label_pred': obs_label_pred - } -) -output.obs_names = input_test.obs_names - -output.write_h5ad(par['output'], compression='gzip') From 3ac0903ca481889ece196b58723a4a9deb21f5c3 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 12:06:55 +0200 Subject: [PATCH 09/17] update cellpose component --- src/methods/cellpose/config.vsh.yaml | 82 +++++++++------------------- src/methods/cellpose/script.py | 71 +++++++++++------------- 2 files changed, 58 insertions(+), 95 deletions(-) diff --git a/src/methods/cellpose/config.vsh.yaml b/src/methods/cellpose/config.vsh.yaml index d5fd3f0..46be884 100644 --- a/src/methods/cellpose/config.vsh.yaml +++ b/src/methods/cellpose/config.vsh.yaml @@ -13,66 +13,35 @@ references: __merge__: /src/api/comp_method.yaml arguments: - - name: --batch_size - type: integer - default: 8 - - name: --model_type - type: string - default: "cyto" - - name: --resample - type: boolean - default: True - - name: --channel_axis - type: string - default: "None" - - name: --z_axis - type: string - default: "None" - - name: --normalize - type: boolean - default: True - - name: --invert - type: boolean - default: False - - name: --rescale - type: string - default: "None" - name: --diameter type: double - default: 30.0 #default should be None with cellpose v4 - - name: --do_3D - type: boolean - default: False - - name: --anisotropy - type: string - default: "None" -# - name: --net_avg -# type: boolean -# default: False - - name: --augment - 
type: boolean - default: False - #- name: --tile - # type: boolean - # default: True - - name: --tile_overlap - type: double - default: 0.1 - #- name: --interp #Seems to be removed in v4 - # type: boolean - # default: True + description: "Cell diameter in pixels. If not set, cellpose runs a size model to estimate it (slower)." + info: + test_default: 30 + - name: --flow_threshold type: double - default: 0.4 - - name: --cellprob_threshold - type: double - default: 0.0 + description: "Flow error threshold. Set to 0 to skip flow quality check for faster execution." + info: + test_default: 0 + + - name: --niter + type: integer + description: "Number of iterations for dynamics. Lower values are faster but less accurate." + info: + test_default: 10 + - name: --min_size type: integer - default: 15 - - name: --stitch_threshold - type: double - default: 0.0 + description: "Minimum number of pixels per mask. Set to -1 to skip small mask removal." + info: + test_default: -1 + + - name: --resample + type: boolean + description: "Whether to run dynamics at original image size. Disabling is faster." 
+ info: + test_default: false resources: - type: python_script @@ -84,9 +53,10 @@ engines: image: openproblems/base_python:1 setup: - type: python - pypi: cellpose<4.0.0 + pypi: cellpose + - type: python + script: from cellpose.models import CellposeModel; model = CellposeModel() __merge__: - - /src/base/setup_txsim_partial.yaml - /src/base/setup_spatialdata_partial.yaml - type: native diff --git a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py index 15f8590..f0d3706 100644 --- a/src/methods/cellpose/script.py +++ b/src/methods/cellpose/script.py @@ -1,57 +1,50 @@ -import txsim as tx import numpy as np import os -import yaml -import spatialdata as sd -import anndata as ad import shutil -import numpy as np -from spatialdata.models import Labels2DModel +import spatialdata as sd import xarray as xr - - -def convert_to_lower_dtype(arr): - max_val = arr.max() - if max_val <= np.iinfo(np.uint8).max: - new_dtype = np.uint8 - elif max_val <= np.iinfo(np.uint16).max: - new_dtype = np.uint16 - elif max_val <= np.iinfo(np.uint32).max: - new_dtype = np.uint32 - else: - new_dtype = np.uint64 - - return arr.astype(new_dtype) +from cellpose.models import CellposeModel +from spatialdata.models import Labels2DModel ## VIASH START par = { - "input": "../task_ist_preprocessing/resources_test/common/2023_10x_mouse_brain_xenium/dataset.zarr", - "output": "segmentation.zarr" + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad' +} +meta = { + 'name': 'cellpose' } - ## VIASH END -hyperparameters = par.copy() - -hyperparameters = {k:(v if v != "None" else None) for k,v in hyperparameters.items()} -del hyperparameters['input'] -del hyperparameters['output'] - +print('Reading input', flush=True) sdata = sd.read_zarr(par["input"]) image = sdata['morphology_mip']['scale0'].image.compute().to_numpy() transformation = 
sdata['morphology_mip']['scale0'].image.transform.copy() +print('Initializing Cellpose model', flush=True) +model = CellposeModel() + +eval_params = {k: par[k] for k in ("diameter", "flow_threshold", "niter", "min_size", "resample") if par.get(k) is not None} +print(f"Running Cellpose segmentation with parameters: {eval_params}") +masks, _, _ = model.eval(image[0], progress=True, **eval_params) + +print('Cellpose segmentation finished, post-processing results', flush=True) +# Convert to smallest sufficient unsigned int dtype +max_val = masks.max() +for dtype in (np.uint8, np.uint16, np.uint32, np.uint64): + if max_val <= np.iinfo(dtype).max: + masks = masks.astype(dtype) + break + +print('Segmentation done, preparing output', flush=True) sd_output = sd.SpatialData() -image = sdata['morphology_mip']['scale0'].image.compute().to_numpy() -transformation = sdata['morphology_mip']['scale0'].image.transform.copy() -img_arr = tx.preprocessing.segment_cellpose(image[0], hyperparameters) -image = convert_to_lower_dtype(img_arr) -data_array = xr.DataArray(image, name=f'segmentation', dims=('y', 'x')) -parsed_data = Labels2DModel.parse(data_array, transformations=transformation) -sd_output.labels['segmentation'] = parsed_data +data_array = xr.DataArray(masks, name='segmentation', dims=('y', 'x')) +parsed = Labels2DModel.parse(data_array, transformations=transformation) +# Rechunk to flat integer tuples (zarr v3 rejects nested dask chunk tuples) +parsed = parsed.chunk({'y': masks.shape[0], 'x': masks.shape[1]}) +sd_output.labels['segmentation'] = parsed -print("Writing output", flush=True) +print('Saving output', flush=True) if os.path.exists(par["output"]): - shutil.rmtree(par["output"]) + shutil.rmtree(par["output"]) sd_output.write(par["output"]) - From a43034ab97af38b2c9cc16bb23cbd2526b1574f1 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 12:07:05 +0200 Subject: [PATCH 10/17] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/README.md b/README.md index 8d4beee..806a5c1 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ Arguments: A single-cell reference dataset, preprocessed for this benchmark. Example file: -`resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad` +`resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad` Description: @@ -187,7 +187,7 @@ Data structure: A spatial transcriptomics dataset, preprocessed for this benchmark. Example file: -`resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr` +`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` Description: From 7b38f99cb1699afbfaf6edb3ebe88588a633be90 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 12:11:25 +0200 Subject: [PATCH 11/17] update wf --- .../process_datasets/config.vsh.yaml | 20 +++---- src/workflows/process_datasets/main.nf | 58 ++++++++++--------- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index d2e4915..c71286a 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -4,24 +4,24 @@ namespace: workflows argument_groups: - name: Inputs arguments: - - name: "--input" - __merge__: /src/api/file_common_dataset.yaml + - name: "--input_sp" + __merge__: /src/api/file_common_ist.yaml + required: true + direction: input + - name: "--input_sc" + __merge__: /src/api/file_common_scrnaseq.yaml required: true direction: input - name: Outputs arguments: - - name: "--output_train" - __merge__: /src/api/file_train.yaml - required: true + - name: "--output_spatial_dataset" + __merge__: /src/api/file_spatial_dataset.yaml direction: output - - name: "--output_test" - __merge__: /src/api/file_test.yaml required: true + - name: "--output_scrnaseq_reference" + __merge__: 
/src/api/file_scrnaseq_reference.yaml direction: output - - name: "--output_solution" - __merge__: /src/api/file_solution.yaml required: true - direction: output resources: - type: nextflow_script diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 2732475..947a8f1 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -14,41 +14,43 @@ workflow run_wf { main: output_ch = input_ch - | check_dataset_with_schema.run( - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input") - def schemaYaml = tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset": checks["exit_code"] == 0 ? state.input : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset != null - } + // | check_dataset_with_schema.run( + // fromState: { id, state -> + // def schema = findArgumentSchema(meta.config, "input") + // def schemaYaml = tempFile("schema.yaml") + // writeYaml(schema, schemaYaml) + // [ + // "input": state.input, + // "schema": schemaYaml + // ] + // }, + // toState: { id, output, state -> + // // read the output to see if dataset passed the qc + // def checks = readYaml(output.output) + // state + [ + // "dataset": checks["exit_code"] == 0 ? 
state.input : null, + // ] + // } + // ) + + // // remove datasets which didn't pass the schema check + // | filter { id, state -> + // state.dataset != null + // } | process_dataset.run( - fromState: [ input: "dataset" ], + fromState: [ + input_sp: "input_sp", + "input_sc": "input_sc" + ], toState: [ - output_train: "output_train", - output_test: "output_test", - output_solution: "output_solution" + output_spatial_dataset: "output_spatial_dataset", + output_scrnaseq_reference: "output_scrnaseq_reference" ] ) // only output the files for which an output file was specified - | setState(["output_train", "output_test", "output_solution"]) + | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) emit: output_ch From d4b3742f2ae6026f74b0dfa3500af9bdfd452917 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 12:19:07 +0200 Subject: [PATCH 12/17] wip update benchmark --- src/workflows/run_benchmark/config.vsh.yaml | 21 +++++++-------------- src/workflows/run_benchmark/main.nf | 2 +- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4c1602d..4ab5f83 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -4,20 +4,13 @@ namespace: workflows argument_groups: - name: Inputs arguments: - - name: "--input_train" - __merge__: /src/api/file_train.yaml - type: file - direction: input - required: true - - name: "--input_test" - __merge__: /src/api/file_test.yaml - type: file - direction: input + - name: "--input_spatial_dataset" + __merge__: /src/api/file_spatial_dataset.yaml + direction: output required: true - - name: "--input_solution" - __merge__: /src/api/file_solution.yaml - type: file - direction: input + - name: "--input_scrnaseq_reference" + __merge__: /src/api/file_scrnaseq_reference.yaml + direction: output required: true - name: Outputs arguments: @@ -65,7 +58,7 @@ 
dependencies: - name: utils/extract_uns_metadata repository: openproblems - name: control_methods/true_labels - - name: methods/logistic_regression + - name: methods/cellpose - name: metrics/accuracy runners: diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 826dec4..fe25140 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -8,7 +8,7 @@ workflow auto { // construct list of methods and control methods methods = [ true_labels, - logistic_regression + cellpose ] // construct list of metrics From bd2618d3712b6dd55c375df2c9fd56680bde4e21 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 12:27:47 +0200 Subject: [PATCH 13/17] fix test resource path --- src/api/comp_data_processor.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 8c02b1d..2f3fbb3 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -28,8 +28,8 @@ argument_groups: direction: output required: true test_resources: - - path: /resources_test/common/cxg_mouse_pancreas_atlas - dest: resources_test/common/cxg_mouse_pancreas_atlas + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined - type: python_script path: /common/component_tests/run_and_check_output.py From 6fa44e75cf4499726b522675b3e7079a886aa4f0 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 12:30:36 +0200 Subject: [PATCH 14/17] revert some api example path changes --- README.md | 4 ++-- _viash.yaml | 6 ++++++ src/api/comp_data_processor.yaml | 4 ++-- src/api/file_common_ist.yaml | 2 +- src/api/file_common_scrnaseq.yaml | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 806a5c1..3e67d7a 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ flowchart TB An 
unprocessed spatial imaging dataset stored as a zarr file. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` +`resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr` Description: @@ -307,7 +307,7 @@ Data structure: An unprocessed dataset as output by a dataset loader. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad` +`resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad` Description: diff --git a/_viash.yaml b/_viash.yaml index 38c3830..31ad320 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -48,6 +48,12 @@ references: info: image: The name of the image file to use for the component on the website. test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium_rep1/ + dest: resources_test/common/2023_10x_mouse_brain_xenium_rep1/ + - type: s3 + path: s3://openproblems-data/resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/ + dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/ - type: s3 path: s3://openproblems-data/resources_test/task_spatial_segmentation/ dest: resources_test/task_spatial_segmentation diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 2f3fbb3..8c02b1d 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -28,8 +28,8 @@ argument_groups: direction: output required: true test_resources: - - path: /resources_test/task_spatial_segmentation/mouse_brain_combined - dest: resources_test/task_spatial_segmentation/mouse_brain_combined + - path: /resources_test/common/cxg_mouse_pancreas_atlas + dest: resources_test/common/cxg_mouse_pancreas_atlas - type: python_script path: /common/component_tests/run_and_check_output.py diff --git a/src/api/file_common_ist.yaml b/src/api/file_common_ist.yaml index 529b5d2..ec45900 100644 --- a/src/api/file_common_ist.yaml +++ b/src/api/file_common_ist.yaml @@ -1,5 
+1,5 @@ type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr" +example: "resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr" label: "Common iST Dataset" summary: An unprocessed spatial imaging dataset stored as a zarr file. description: | diff --git a/src/api/file_common_scrnaseq.yaml b/src/api/file_common_scrnaseq.yaml index 0a15596..6986efb 100644 --- a/src/api/file_common_scrnaseq.yaml +++ b/src/api/file_common_scrnaseq.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad" +example: "resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad" label: "Common SC Dataset" summary: An unprocessed dataset as output by a dataset loader. description: | From 8263a21339911bcbc48a7b78ca1883541dd86c25 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 12:32:51 +0200 Subject: [PATCH 15/17] fix test resources --- src/api/comp_data_processor.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 8c02b1d..22c77aa 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -28,8 +28,10 @@ argument_groups: direction: output required: true test_resources: - - path: /resources_test/common/cxg_mouse_pancreas_atlas - dest: resources_test/common/cxg_mouse_pancreas_atlas + - path: /resources_test/common/2023_10x_mouse_brain_xenium_rep1 + dest: resources_test/common/2023_10x_mouse_brain_xenium_rep1 + - path: /resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2 + dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2 - type: python_script path: /common/component_tests/run_and_check_output.py From 8bf5e0fc0b7ca17dcf4d7af065fbc8eb19756399 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 13:32:27 +0200 Subject: [PATCH 16/17] update script --- 
src/methods/cellpose/script.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py index f0d3706..4949b8d 100644 --- a/src/methods/cellpose/script.py +++ b/src/methods/cellpose/script.py @@ -1,3 +1,4 @@ +import dask.array as da import numpy as np import os import shutil @@ -38,10 +39,10 @@ print('Segmentation done, preparing output', flush=True) sd_output = sd.SpatialData() -data_array = xr.DataArray(masks, name='segmentation', dims=('y', 'x')) +# Wrap masks as a single-chunk dask array with flat chunk shape for zarr v3 compat +dask_masks = da.from_array(masks, chunks=masks.shape) +data_array = xr.DataArray(dask_masks, name='segmentation', dims=('y', 'x')) parsed = Labels2DModel.parse(data_array, transformations=transformation) -# Rechunk to flat integer tuples (zarr v3 rejects nested dask chunk tuples) -parsed = parsed.chunk({'y': masks.shape[0], 'x': masks.shape[1]}) sd_output.labels['segmentation'] = parsed print('Saving output', flush=True) From 1fd420b86f96cbfdae2afd94781d823fa97aa1a8 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 20 Apr 2026 13:40:17 +0200 Subject: [PATCH 17/17] update readme --- README.md | 262 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 219 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 3e67d7a..ccf6db4 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ should convince readers of the significance and relevance of your task. ## Authors & contributors -| name | roles | -|:---------|:-------------------| -| John Doe | author, maintainer | +| Name | Roles | Linkedin | Twitter | Email | Github | Orcid | +|:---|:---|:---|:---|:---|:---|:---| +| John Doe | author, maintainer | johndoe | johndoe | john@doe.me | johndoe | 0000-0000-0000-0000 | ## API @@ -38,24 +38,24 @@ should convince readers of the significance and relevance of your task. 
flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] - file_scrnaseq_reference("scRNA-seq Reference") file_spatial_dataset("Raw iST Dataset") + file_scrnaseq_reference("scRNA-seq Reference") comp_control_method[/"Control Method"/] - comp_metric[/"Metric"/] comp_method[/"Method"/] + comp_metric[/"Metric"/] file_prediction("Predicted data") file_score("Score") file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor - comp_data_processor-->file_scrnaseq_reference comp_data_processor-->file_spatial_dataset - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + comp_data_processor-->file_scrnaseq_reference file_spatial_dataset---comp_control_method file_spatial_dataset---comp_method + file_scrnaseq_reference---comp_control_method + file_scrnaseq_reference---comp_metric comp_control_method-->file_prediction - comp_metric-->file_score comp_method-->file_prediction + comp_metric-->file_score file_prediction---comp_metric file_common_scrnaseq---comp_data_processor ``` @@ -76,12 +76,91 @@ Format:
+ SpatialData object + images: 'image', 'image_3D', 'he_image' + labels: 'cell_labels', 'nucleus_labels' + points: 'transcripts' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'metadata' + coordinate_systems: 'global' +
Data structure:
+*images* + +| Name | Description | +|:-----------|:------------------------------------| +| `image` | The raw image data. | +| `image_3D` | (*Optional*) The raw 3D image data. | +| `he_image` | (*Optional*) H&E image data. | + +*labels* + +| Name | Description | +|:-----------------|:---------------------------------------| +| `cell_labels` | (*Optional*) Cell segmentation labels. | +| `nucleus_labels` | (*Optional*) Cell segmentation labels. | + +*points* + +`transcripts`: Point cloud data of transcripts. + +| Column | Type | Description | +|:---|:---|:---| +| `x` | `float` | x-coordinate of the point. | +| `y` | `float` | y-coordinate of the point. | +| `z` | `float` | (*Optional*) z-coordinate of the point. | +| `feature_name` | `categorical` | Name of the feature. | +| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | +| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | +| `cell_type` | `string` | (*Optional*) Cell type of the cell. | +| `qv` | `float` | (*Optional*) Quality value of the point. | +| `transcript_id` | `long` | Unique identifier of the transcript. | +| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | + +*shapes* + +`cell_boundaries`: Cell boundaries. + +| Column | Type | Description | +|:-----------|:---------|:-------------------------------| +| `geometry` | `object` | Geometry of the cell boundary. | + +`nucleus_boundaries`: Nucleus boundaries. + +| Column | Type | Description | +|:-----------|:---------|:----------------------------------| +| `geometry` | `object` | Geometry of the nucleus boundary. | + +*tables* + +`metadata`: Metadata of spatial dataset. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `string` | A unique identifier for the cell. | +| `var["gene_ids"]` | `string` | Unique identifier for the gene. | +| `var["feature_types"]` | `string` | Type of the feature. 
| +| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | +| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | + +*coordinate_systems* + +| Name | Description | +|:---------|:------------------------------------| +| `global` | Coordinate system of the replicate. | +
## Component type: Data processor @@ -101,6 +180,109 @@ Arguments: +## File format: Raw iST Dataset + +A spatial transcriptomics dataset, preprocessed for this benchmark. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` + +Description: + +This dataset contains preprocessed images, labels, points, shapes, and +tables for spatial transcriptomics data. + +Format: + +
+ + SpatialData object + images: 'image', 'image_3D', 'he_image' + labels: 'cell_labels', 'nucleus_labels' + points: 'transcripts' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'metadata' + coordinate_systems: 'global' + +
+ +Data structure: + +
+
+*images*
+
+| Name | Description |
+|:-----------|:------------------------------------|
+| `image` | The raw image data. |
+| `image_3D` | (*Optional*) The raw 3D image data. |
+| `he_image` | (*Optional*) H&E image data. |
+
+*labels*
+
+| Name | Description |
+|:-----------------|:------------------------------------------|
+| `cell_labels` | (*Optional*) Cell segmentation labels. |
+| `nucleus_labels` | (*Optional*) Nucleus segmentation labels. |
+
+*points*
+
+`transcripts`: Point cloud data of transcripts.
+
+| Column | Type | Description |
+|:---|:---|:---|
+| `x` | `float` | x-coordinate of the point. |
+| `y` | `float` | y-coordinate of the point. |
+| `z` | `float` | (*Optional*) z-coordinate of the point. |
+| `feature_name` | `categorical` | Name of the feature. |
+| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. |
+| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. |
+| `cell_type` | `string` | (*Optional*) Cell type of the cell. |
+| `qv` | `float` | (*Optional*) Quality value of the point. |
+| `transcript_id` | `long` | Unique identifier of the transcript. |
+| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. |
+
+*shapes*
+
+`cell_boundaries`: Cell boundaries.
+
+| Column | Type | Description |
+|:-----------|:---------|:-------------------------------|
+| `geometry` | `object` | Geometry of the cell boundary. |
+
+`nucleus_boundaries`: Nucleus boundaries.
+
+| Column | Type | Description |
+|:-----------|:---------|:----------------------------------|
+| `geometry` | `object` | Geometry of the nucleus boundary. |
+
+*tables*
+
+`metadata`: Metadata of spatial dataset.
+
+| Slot | Type | Description |
+|:---|:---|:---|
+| `obs["cell_id"]` | `string` | A unique identifier for the cell. |
+| `var["gene_ids"]` | `string` | Unique identifier for the gene. |
+| `var["feature_types"]` | `string` | Type of the feature. 
| +| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | +| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | + +*coordinate_systems* + +| Name | Description | +|:---------|:------------------------------------| +| `global` | Coordinate system of the replicate. | + +
+ ## File format: scRNA-seq Reference A single-cell reference dataset, preprocessed for this benchmark. @@ -182,30 +364,6 @@ Data structure: -## File format: Raw iST Dataset - -A spatial transcriptomics dataset, preprocessed for this benchmark. - -Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` - -Description: - -This dataset contains preprocessed images, labels, points, shapes, and -tables for spatial transcriptomics data. - -Format: - -
- -
- -Data structure: - -
- -
- ## Component type: Control Method Quality control methods for verifying the pipeline. @@ -222,9 +380,9 @@ Arguments: -## Component type: Metric +## Component type: Method -A task template metric. +A method. Arguments: @@ -232,15 +390,14 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_prediction` | `file` | A predicted dataset as output by a method. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | -| `--output` | `file` | (*Output*) File indicating the score of a metric. | +| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | -## Component type: Method +## Component type: Metric -A method. +A task template metric. Arguments: @@ -248,8 +405,9 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) File indicating the score of a metric. | @@ -264,12 +422,31 @@ Format:
+ SpatialData object + labels: 'segmentation' + tables: 'table' +
Data structure:
+*labels* + +| Name | Description | +|:---------------|:--------------------------| +| `segmentation` | Segmentation of the data. | + +*tables* + +`table`: AnnData table. + +| Slot | Type | Description | +|:-----------------|:---------|:------------| +| `obs["cell_id"]` | `string` | Cell ID. | +| `obs["region"]` | `string` | Region. | +
## File format: Score @@ -385,4 +562,3 @@ Data structure: | `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -