From 8c1bfa64eb4babc9d41c1439d479e4a3e9cf78d3 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Wed, 6 May 2026 16:32:41 +1000 Subject: [PATCH 1/4] adding schema changes and reporting --- docs/assay-schema.md | 138 + docs/assay-schema.yaml | 66 + docs/panel-schema.md | 69 +- docs/panel-schema.yaml | 23 + docs/variant-schema.md | 87 + docs/variant-schema.yaml | 28 +- rust/Cargo.lock | 1 + rust/bioscript-cli/Cargo.toml | 1 + rust/bioscript-cli/src/main.rs | 2351 ++++++++++++++++- rust/bioscript-cli/tests/cli.rs | 69 + rust/bioscript-formats/src/genotype.rs | 36 +- rust/bioscript-schema/src/lib.rs | 9 +- rust/bioscript-schema/src/validator.rs | 744 +++++- .../tests/validate_variants.rs | 56 +- 14 files changed, 3532 insertions(+), 146 deletions(-) create mode 100644 docs/assay-schema.md create mode 100644 docs/assay-schema.yaml diff --git a/docs/assay-schema.md b/docs/assay-schema.md new file mode 100644 index 0000000..d087f4e --- /dev/null +++ b/docs/assay-schema.md @@ -0,0 +1,138 @@ +# Assay Schema + +Use an assay when a named test observes one or more variants and emits custom derived report fields. + +An assay is different from a panel: a panel is a collection of mostly independent observations, while an assay has its own interpretation logic. APOL1 is an assay because it observes G1/G2 sites and reports one derived APOL1 status. 
+ +## Schema Identity + +```yaml +schema: "bioscript:assay:1.0" +version: "1.0" +``` + +## Minimal Shape + +```yaml +schema: "bioscript:assay:1.0" +version: "1.0" +name: "APOL1" +label: "APOL1 Risk Assay" +tags: + - "type:risk" + - "gene:APOL1" + +members: + - kind: "variant" + path: "g1-site-1.yaml" + version: "1.0" + - kind: "variant" + path: "g1-site-2.yaml" + version: "1.0" + - kind: "variant" + path: "g2-site.yaml" + version: "1.0" + +analyses: + - id: "apol1_status" + kind: "bioscript" + path: "apol1.py" + output_format: "tsv" + label: "APOL1 risk genotype" + derived_from: + - "g1-site-1.yaml" + - "g1-site-2.yaml" + - "g2-site.yaml" + emits: + - key: "apol1_status" + label: "APOL1 status" + value_type: "string" + format: "badge" + logic: + source: + name: "Example derivation source" + url: "https://example.org/assay-logic" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. +``` + +## Members + +Assay members are currently local variant YAML files: + +```yaml +- kind: "variant" + path: "g1-site-1.yaml" + version: "1.0" +``` + +Rules: + +- `kind` is required and currently must be `variant` +- `path` is required +- `version` is recommended +- keep variant identity, coordinates, alleles, findings, and provenance in the variant YAML files + +## Analyses + +Use `analyses` for custom output derived from the member variants. The older `interpretations` key is accepted for compatibility, but new manifests should use `analyses`. 
+ +Rules: + +- `id`, `kind`, `path`, and `derived_from` are required +- `kind` is currently `bioscript` +- `path` points to a BioScript-compatible Python file +- `output_format` is optional and defaults to `tsv`; use `json` or `jsonl` when the script writes structured JSON output +- `derived_from` lists the variant YAML files used by the interpretation +- `emits` is optional but recommended so report generators know which output columns to display and how to label them +- `logic` is optional; use `logic.description` and `logic.source.url` to document where the script's derivation rules came from + +## Findings + +Use `findings` for evidence that binds either to a variant observation or an emitted analysis value. Keep the executable logic in `analyses`; keep PGx evidence and reporting semantics in YAML. + +```yaml +findings: + - schema: "bioscript:pgx-label:1.0" + id: "clinpgx_PA166313401" + label: "ClinPGx drug label annotation PA166313401" + authority_type: "regulatory_label" + binding: + source: "analysis" + analysis_id: "apoe_epsilon" + key: "apoe_status" + operator: "equals" + value: "e4/e4" + drugs: + - name: "lecanemab" + aliases: + - "LEQEMBI" + evidence: + source: "ClinPGx" + kind: "label_annotation" + id: "PA166313401" + url: "https://www.clinpgx.org/labelAnnotation/PA166313401" + notes: "Drug label annotation applies when APOE status is e4/e4." 
+``` + +Binding rules: + +- `source` is `analysis` or `variant` +- `analysis` bindings require `analysis_id`, `key`, and either `operator: equals` with `value` or `operator: in` with `values` +- `variant` bindings require `variant` or `path`, `key`, and either `equals`/`value` or `in`/`values` +- PGx label findings use `schema: "bioscript:pgx-label:1.0"` and should include `regulatory_sources`, `pgx_action_level` or `prescribing_actions` when known +- PGx summary findings use `schema: "bioscript:pgx-summary:1.0"` and should include `evidence_level`, `phenotype_categories`, and genotype-specific `effects` when known +- PGx findings should include `drugs` and should link to the exact ClinPGx/PharmGKB/ClinVar evidence page + +## Inclusion In Panels + +A larger panel may include an assay as a member: + +```yaml +members: + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" +``` + +When a panel includes an assay, the assay's variant observations can be expanded into the panel output, while report tooling can also run the assay's interpretation and include its emitted fields. diff --git a/docs/assay-schema.yaml b/docs/assay-schema.yaml new file mode 100644 index 0000000..7019e2f --- /dev/null +++ b/docs/assay-schema.yaml @@ -0,0 +1,66 @@ +schema: "bioscript:assay:1.0" +version: "1.0" +name: "APOL1" +label: "APOL1 Risk Assay" +summary: "APOL1 assay that observes G1 and G2 sites and emits the derived APOL1 risk genotype." 
+tags: + - "type:risk" + - "gene:APOL1" + +members: + - kind: "variant" + path: "g1-site-1.yaml" + version: "1.0" + - kind: "variant" + path: "g1-site-2.yaml" + version: "1.0" + - kind: "variant" + path: "g2-site.yaml" + version: "1.0" + +analyses: + - id: "apol1_status" + kind: "bioscript" + path: "apol1.py" + output_format: "tsv" + label: "APOL1 risk genotype" + derived_from: + - "g1-site-1.yaml" + - "g1-site-2.yaml" + - "g2-site.yaml" + emits: + - key: "apol1_status" + label: "APOL1 status" + value_type: "string" + format: "badge" + logic: + source: + name: "Example derivation source" + url: "https://example.org/assay-logic" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. + +findings: + - schema: "bioscript:pgx-label:1.0" + id: "example_analysis_bound_pgx_finding" + label: "Example analysis-bound PGx finding" + authority_type: "regulatory_label" + binding: + source: "analysis" + analysis_id: "apol1_status" + key: "apol1_status" + operator: "equals" + value: "G2/G2" + drugs: + - name: "example drug" + aliases: + - "example brand" + regulatory_sources: + - "FDA" + pgx_action_level: "Actionable PGx" + evidence: + source: "ClinPGx" + kind: "label_annotation" + id: "PA..." + url: "https://www.clinpgx.org/labelAnnotation/PA..." + notes: "Findings can bind to emitted analysis keys using equals or in." diff --git a/docs/panel-schema.md b/docs/panel-schema.md index 708daee..ac155a1 100644 --- a/docs/panel-schema.md +++ b/docs/panel-schema.md @@ -1,8 +1,8 @@ # Panel Schema -Use a panel when you want one manifest that points to a curated set of runnable variant records. +Use a panel when you want one manifest that points to a curated set of runnable variant records, assay manifests, and optional interpretation scripts derived from those records. -Right now the Rust runner supports variant members directly. Keep the shape simple. +The Rust runner supports variant members directly. 
Test tooling can also run declared interpretation scripts and add their emitted fields to the generated report. ## Schema Identity @@ -25,9 +25,26 @@ members: - kind: "variant" path: "variants/rs671.yaml" version: "1.0" + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" - kind: "variant" path: "variants/rs713598.yaml" version: "1.0" + +analyses: + - id: "taste_status" + kind: "bioscript" + path: "interpretations/taste.py" + output_format: "tsv" + label: "Taste status" + derived_from: + - "variants/rs713598.yaml" + emits: + - key: "taste_status" + label: "Taste status" + value_type: "string" + format: "badge" ``` ## Purpose @@ -37,30 +54,74 @@ A panel is: - a selection manifest - a stable name for a bundle of variants - something the Rust `bioscript` command can run directly +- a way to include smaller assay manifests in a broader bundle +- a place to declare interpretation chunks that derive custom report fields from member variants It is not: - a full remote package manager - a replacement for richer assay manifests +- a place to hide variant metadata inside Python when YAML can describe it ## Members -Each member must currently be: +Each member must currently be a local variant or assay: ```yaml - kind: "variant" path: "variants/rs671.yaml" version: "1.0" +- kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" ``` Rules: - `kind` is required - exactly one of `path` or `download` is required -- current runner support is `variant` members only +- current runner support is local `variant` and `assay` members - `version` is recommended for local members - `sha256` is optional for local members +## Analyses + +Use `analyses` when a panel needs custom derived output that is not the same thing as a single variant observation. Examples include APOE epsilon genotype from rs429358/rs7412 or APOL1 G0/G1/G2 status from three sites. The older `interpretations` key is accepted for compatibility, but new manifests should use `analyses`. 
+ +```yaml +analyses: + - id: "apoe_epsilon" + kind: "bioscript" + path: "variants/APOE/apoe.py" + output_format: "tsv" + label: "APOE epsilon genotype" + derived_from: + - "variants/APOE/rs429358.yaml" + - "variants/APOE/rs7412.yaml" + emits: + - key: "apoe_status" + label: "APOE status" + value_type: "string" + format: "badge" + logic: + source: + name: "ClinPGx / PharmGKB" + url: "https://www.clinpgx.org/variant/PA166155341/overview" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. +``` + +Rules: + +- `id`, `kind`, `path`, and `derived_from` are required +- `kind` is currently `bioscript` +- `path` points to a BioScript-compatible Python file +- `output_format` is optional and defaults to `tsv`; use `json` or `jsonl` when the script writes structured JSON output +- `derived_from` lists the variant YAML files used by the interpretation +- `emits` is optional but recommended so report generators know which output columns to display and how to label them +- `logic` is optional; use `logic.description` and `logic.source.url` to document where the script's derivation rules came from +- keep variant identity, coordinates, alleles, findings, and provenance in YAML; keep cross-variant logic in the interpretation script + ## Permissions And Downloads Panels may declare remote downloads up front even if the current runner only executes local members. 
diff --git a/docs/panel-schema.yaml b/docs/panel-schema.yaml index a35dcd3..d9d7fa1 100644 --- a/docs/panel-schema.yaml +++ b/docs/panel-schema.yaml @@ -9,6 +9,29 @@ members: - kind: "variant" path: "variants/rs671.yaml" version: "1.0" + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" - kind: "variant" path: "variants/rs713598.yaml" version: "1.0" + +analyses: + - id: "taste_status" + kind: "bioscript" + path: "interpretations/taste.py" + output_format: "tsv" + label: "Taste status" + derived_from: + - "variants/rs713598.yaml" + emits: + - key: "taste_status" + label: "Taste status" + value_type: "string" + format: "badge" + logic: + source: + name: "Example derivation source" + url: "https://example.org/panel-analysis-logic" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. diff --git a/docs/variant-schema.md b/docs/variant-schema.md index 84e5ac8..f6c9de6 100644 --- a/docs/variant-schema.md +++ b/docs/variant-schema.md @@ -166,6 +166,7 @@ Fields: - `kind`: `snv | deletion | insertion | indel` - `ref` - `alts` +- `observed_alts` optional - `deletion_length` optional - `insertion_sequence` optional - `motifs` optional @@ -174,6 +175,10 @@ Fields: Stored YAML should describe the biological allele. Do not use symbolic `I` / `D` allele values in this schema. +`alts` is the curated set of alternate alleles that matter for this catalogue entry and should drive app flagging. `observed_alts` is the full set of source-reported alternate alleles observed at the same locus, usually from dbSNP. If `observed_alts` is omitted, tools treat `alts` as both curated and observed. + +When a source such as dbSNP reports multiple alternates but only one has the clinical or PGx evidence being catalogued, keep that allele in `alts` and put the full dbSNP set in `observed_alts`. 
+ Example SNV: ```yaml @@ -181,6 +186,8 @@ alleles: kind: "snv" ref: "G" alts: + - "T" + observed_alts: - "A" - "C" - "T" @@ -205,10 +212,90 @@ Envelope fields: - `schema` required - `alt` optional, but required for allele-specific findings; use `"*"` when the finding applies to any alternate allele at a multiallelic locus +- `binding` optional; use it when report logic should match a specific variant observation field instead of relying on `alt` - `label` optional - `summary` optional - `notes` optional +Variant-bound PGx sidecar include: + +```yaml +findings: + - schema: "bioscript:pgx-summary:1.0" + id: "rs123_pgx_sidecar" + include: "rs123-pgx.yaml" + notes: "Detailed PGx findings are stored in the sidecar file." +``` + +Sidecar files use `schema: "bioscript:pgx-findings:1.0"` and contain dense PGx evidence for one variant. This keeps large ClinPGx/PharmGKB annotation tables out of the core variant identity file. A sidecar may contain both summary annotations and drug label annotations. + +Summary annotations are variant/drug evidence interpretations. They use `schema: "bioscript:pgx-summary:1.0"` and normally include evidence levels, phenotype categories, and genotype-specific effects. + +```yaml +findings: + - schema: "bioscript:pgx-summary:1.0" + id: "example_summary_annotation" + authority_type: "evidence_summary" + drugs: + - name: "example drug" + phenotype_categories: + - "Toxicity" + evidence_level: "3" + evidence: + source: "ClinPGx" + kind: "summary_annotation" + id: "1448427005" + url: "https://www.clinpgx.org/variant/PA.../summaryAnnotation" + effects: + - id: "example_variant_bound_pgx_finding_alt_carrier" + label: "C carrier" + binding: + source: "variant" + variant: "rs123.yaml" + allele: "C" + operator: "dosage_in" + values: [1, 2] + description: "Applies when the participant carries one or two copies of C." + text: "Reportable text for this allele dosage." +``` + +Drug label annotations are regulatory label statements. 
They use `schema: "bioscript:pgx-label:1.0"` and should carry regulatory/action fields instead of summary evidence levels. + +```yaml +findings: + - schema: "bioscript:pgx-label:1.0" + id: "example_label_annotation" + authority_type: "regulatory_label" + genes: + - "ABCG2" + drugs: + - name: "rosuvastatin" + aliases: + - "Crestor" + regulatory_sources: + - "FDA" + pgx_action_level: "testing_recommended" + prescribing_actions: + - "dose_adjustment" + evidence: + source: "ClinPGx" + kind: "label_annotation" + id: "PA..." + url: "https://www.clinpgx.org/variant/PA.../labelAnnotation" + notes: "Regulatory drug label annotation." +``` + +Supported binding operators are: + +- `equals` and `in` for matching literal analysis outputs or observation fields +- `dosage_equals` and `dosage_in` for variant allele dosage, where `allele` is the reference allele or one of the alternate alleles and dosage values are `0`, `1`, or `2` + +Known PGx finding schemas are: + +- `bioscript:pgx-summary:1.0` for ClinPGx/PharmGKB summary annotations +- `bioscript:pgx-label:1.0` for ClinPGx/PharmGKB drug label annotations +- `bioscript:pgx:1.0` is legacy; prefer one of the two specific schemas above + Unknown finding schemas are allowed. ## Optional Metadata diff --git a/docs/variant-schema.yaml b/docs/variant-schema.yaml index e99f8a6..3c2a26d 100644 --- a/docs/variant-schema.yaml +++ b/docs/variant-schema.yaml @@ -28,6 +28,9 @@ alleles: ref: "A" alts: - "G" + observed_alts: + - "G" + - "T" deletion_length: 1 insertion_sequence: "AT" motifs: @@ -46,9 +49,28 @@ findings: label: "string" summary: "string" notes: "string" - - schema: "bioscript:pgx:1.0" - alt: "*" - notes: "Finding applies to any alternate allele at this multiallelic locus." 
+ - schema: "bioscript:pgx-summary:1.0" + id: "example_pgx_summary" + label: "Example summary annotation" + authority_type: "evidence_summary" + binding: + source: "variant" + variant: "rs123.yaml" + allele: "G" + operator: "dosage_in" + values: + - 1 + - 2 + description: "Applies when the participant carries at least one G allele." + drugs: + - name: "example drug" + evidence_level: "3" + evidence: + source: "ClinPGx" + kind: "summary_annotation" + id: "144..." + url: "https://www.clinpgx.org/variant/PA.../summaryAnnotation" + notes: "Summary annotations capture variant-drug evidence and can be stored directly or through a pgx-findings sidecar." provenance: sources: diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 2e747e4..3e734fe 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -109,6 +109,7 @@ dependencies = [ "bioscript-runtime", "bioscript-schema", "monty", + "serde_json", "serde_yaml", "zip", ] diff --git a/rust/bioscript-cli/Cargo.toml b/rust/bioscript-cli/Cargo.toml index 374329d..cb78301 100644 --- a/rust/bioscript-cli/Cargo.toml +++ b/rust/bioscript-cli/Cargo.toml @@ -13,6 +13,7 @@ bioscript-formats = { path = "../bioscript-formats" } bioscript-runtime = { path = "../bioscript-runtime" } bioscript-schema = { path = "../bioscript-schema" } monty = { path = "../../monty/crates/monty" } +serde_json = "1.0.133" serde_yaml = "0.9.34" [dev-dependencies] diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index 37a34b2..e987509 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -14,8 +14,9 @@ use bioscript_formats::{ }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; use bioscript_schema::{ - PanelManifest, VariantManifest, load_panel_manifest, load_variant_manifest, - validate_panels_path, validate_variants_path, + AssayManifest, PanelInterpretation, PanelManifest, VariantManifest, load_assay_manifest, + load_panel_manifest, load_variant_manifest, validate_assays_path, 
validate_panels_path, + validate_variants_path, }; use monty::ResourceLimits; @@ -33,12 +34,18 @@ fn main() -> ExitCode { fn run_cli() -> Result<(), String> { let mut args = env::args().skip(1); if let Some(first) = args.next() { + if first == "report" { + return run_app_report(args.collect()); + } if first == "validate-variants" { return run_validate_variants(args.collect()); } if first == "validate-panels" { return run_validate_panels(args.collect()); } + if first == "validate-assays" { + return run_validate_assays(args.collect()); + } if first == "prepare" { return run_prepare(args.collect()); } @@ -177,7 +184,7 @@ fn run_cli() -> Result<(), String> { let Some(script_path) = script_path else { return Err( - "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] 
--output-dir [--html] [--root ] [--input-format auto|text|zip|vcf|cram]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" .to_owned(), ); }; @@ -459,164 +466,2272 @@ fn run_validate_variants(args: Vec) -> Result<(), String> { Ok(()) } -fn run_validate_panels(args: Vec) -> Result<(), String> { - let mut path: Option = None; - let mut report_path: Option = None; +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AppOutputFormat { + Tsv, + Json, + Jsonl, + Both, +} + +struct AppReportOptions { + manifest_path: PathBuf, + input_files: Vec, + output_dir: PathBuf, + root: PathBuf, + html: bool, + observations_format: AppOutputFormat, + reports_format: AppOutputFormat, + loader: GenotypeLoadOptions, + filters: Vec, +} + +#[allow(clippy::too_many_lines)] +fn run_app_report(args: Vec) -> Result<(), String> { + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let mut manifest_path: Option = None; + let mut input_files: Vec = Vec::new(); + let mut output_dir: Option = None; + let mut root: Option = None; + let mut html = false; + let mut observations_format = AppOutputFormat::Tsv; + let mut reports_format = AppOutputFormat::Jsonl; + let mut filters = Vec::new(); + let mut loader = GenotypeLoadOptions::default(); let mut iter = args.into_iter(); while let Some(arg) = iter.next() { - if arg == "--report" { - let Some(value) = iter.next() else { - return Err("--report requires a path".to_owned()); - }; - report_path = Some(PathBuf::from(value)); - } else if path.is_none() { - path = Some(PathBuf::from(arg)); - } else { - return Err(format!("unexpected argument: {arg}")); + match arg.as_str() { + "--input-file" => input_files.push(PathBuf::from( + 
iter.next().ok_or("--input-file requires a path")?, + )), + "--output-dir" => { + output_dir = Some(PathBuf::from( + iter.next().ok_or("--output-dir requires a path")?, + )); + } + "--root" => { + root = Some(PathBuf::from( + iter.next().ok_or("--root requires a directory")?, + )); + } + "--html" => html = true, + "--filter" => filters.push(iter.next().ok_or("--filter requires key=value")?), + "--observations-format" => { + observations_format = parse_app_output_format( + &iter + .next() + .ok_or("--observations-format requires a value")?, + )?; + } + "--reports-format" => { + reports_format = parse_app_output_format( + &iter.next().ok_or("--reports-format requires a value")?, + )?; + } + "--input-format" => { + let value = iter.next().ok_or("--input-format requires a value")?; + if value.eq_ignore_ascii_case("auto") { + loader.format = None; + } else { + loader.format = + Some(value.parse::().map_err(|err| { + format!("invalid --input-format value {value}: {err}") + })?); + } + } + "--input-index" => { + loader.input_index = Some(PathBuf::from( + iter.next().ok_or("--input-index requires a path")?, + )); + } + "--reference-file" => { + loader.reference_file = Some(PathBuf::from( + iter.next().ok_or("--reference-file requires a path")?, + )); + } + "--reference-index" => { + loader.reference_index = Some(PathBuf::from( + iter.next().ok_or("--reference-index requires a path")?, + )); + } + value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), + value => { + if manifest_path.is_none() { + manifest_path = Some(PathBuf::from(value)); + } else { + input_files.push(PathBuf::from(value)); + } + } } } - let Some(path) = path else { - return Err("usage: bioscript validate-panels [--report ]".to_owned()); + let Some(manifest_path) = manifest_path else { + return Err("usage: bioscript report --input-file [--input-file ...] 
--output-dir [--html]".to_owned()); }; + if input_files.is_empty() { + return Err("bioscript report requires at least one --input-file".to_owned()); + } + let output_dir = output_dir.ok_or("bioscript report requires --output-dir")?; + let root = root.unwrap_or(cwd); + normalize_loader_paths(&root, &mut loader); - let report = validate_panels_path(&path)?; - let text = report.render_text(); - print!("{text}"); + let options = AppReportOptions { + manifest_path: absolutize(&root, &manifest_path), + input_files: input_files + .iter() + .map(|path| absolutize(&root, path)) + .collect(), + output_dir: absolutize(&root, &output_dir), + root, + html, + observations_format, + reports_format, + loader, + filters, + }; + generate_app_report(&options) +} - if let Some(report_path) = report_path { - if let Some(parent) = report_path.parent() { - std::fs::create_dir_all(parent).map_err(|err| { - format!("failed to create report dir {}: {err}", parent.display()) - })?; - } - std::fs::write(&report_path, text) - .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; +fn parse_app_output_format(value: &str) -> Result { + match value { + "tsv" => Ok(AppOutputFormat::Tsv), + "json" => Ok(AppOutputFormat::Json), + "jsonl" => Ok(AppOutputFormat::Jsonl), + "both" => Ok(AppOutputFormat::Both), + other => Err(format!( + "unsupported output format '{other}'; expected tsv, json, jsonl, or both" + )), } +} - if report.has_errors() { - return Err(format!( - "validation found {} errors and {} warnings", - report.total_errors(), - report.total_warnings() - )); +fn absolutize(root: &Path, path: &Path) -> PathBuf { + if path.is_absolute() { + path.to_path_buf() + } else { + root.join(path) } - - Ok(()) } -fn is_yaml_manifest(path: &Path) -> bool { - path.extension() - .and_then(|ext| ext.to_str()) - .is_some_and(|ext| matches!(ext, "yaml" | "yml")) -} +fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { + 
fs::create_dir_all(&options.output_dir).map_err(|err| { + format!( + "failed to create output dir {}: {err}", + options.output_dir.display() + ) + })?; -struct ManifestRunOptions<'a> { - input_file: Option<&'a str>, - output_file: Option<&'a str>, - participant_id: Option<&'a str>, - trace_report: Option<&'a Path>, - loader: &'a GenotypeLoadOptions, - filters: &'a [String], + let assay_id = app_assay_id(&options.manifest_path)?; + let findings = load_manifest_findings(&options.root, &options.manifest_path)?; + let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; + let mut observations = Vec::new(); + let mut analyses = Vec::new(); + let mut reports = Vec::new(); + + for input_file in &options.input_files { + let participant_id = participant_id_from_path(input_file); + let rows = run_manifest_rows_for_report( + &options.root, + &options.manifest_path, + input_file, + &participant_id, + &options.loader, + &options.filters, + )?; + let input_observations = rows + .iter() + .map(|row| app_observation_from_manifest_row(&options.root, row, &assay_id)) + .collect::, _>>()?; + observations.extend(input_observations.clone()); + let input_analyses = run_manifest_analyses_for_report( + &options.root, + &options.manifest_path, + input_file, + &participant_id, + &options.loader, + &options.output_dir, + )?; + analyses.extend(input_analyses.clone()); + let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); + reports.push(app_report_json( + &assay_id, + &participant_id, + input_file, + &input_observations, + &input_analyses, + &matched_findings, + &provenance, + )); + } + + write_app_observations( + &options.output_dir, + &observations, + options.observations_format, + )?; + write_app_analyses(&options.output_dir, &analyses)?; + write_app_reports(&options.output_dir, &reports, options.reports_format)?; + if options.html { + write_app_html(&options.output_dir, &observations, &reports)?; + } + + println!( + 
"observations: {}", + options.output_dir.join("observations.tsv").display() + ); + println!( + "analysis: {}", + options.output_dir.join("analysis.jsonl").display() + ); + println!( + "reports: {}", + options.output_dir.join("reports.jsonl").display() + ); + if options.html { + println!("html: {}", options.output_dir.join("index.html").display()); + } + Ok(()) } -fn run_manifest( +fn run_manifest_rows_for_report( runtime_root: &Path, manifest_path: &Path, - options: &ManifestRunOptions<'_>, -) -> Result<(), String> { - let schema = manifest_schema(manifest_path)?; - let resolved_input = options - .input_file - .map(|value| resolve_cli_path(runtime_root, value)); - let resolved_output = options - .output_file - .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); - let resolved_trace = options - .trace_report - .map(|value| resolve_cli_path_buf(runtime_root, value)); - match schema.as_str() { + input_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_text = input_file.display().to_string(); + match manifest_schema(manifest_path)?.as_str() { "bioscript:variant:1.0" | "bioscript:variant" => { let manifest = load_variant_manifest(manifest_path)?; - let row = run_variant_manifest( + Ok(vec![run_variant_manifest( runtime_root, &manifest, - resolved_input.as_deref(), - options.participant_id, - options.loader, - )?; - write_manifest_outputs( - std::slice::from_ref(&row), - resolved_output.as_deref(), - resolved_trace.as_deref(), - )?; - Ok(()) + Some(&input_text), + Some(participant_id), + loader, + )?]) } "bioscript:panel:1.0" => { let manifest = load_panel_manifest(manifest_path)?; - let rows = run_panel_manifest( + run_panel_manifest( runtime_root, &manifest, - resolved_input.as_deref(), - options.participant_id, - options.loader, - options.filters, - )?; - write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; - Ok(()) + Some(&input_text), + 
Some(participant_id), + loader, + filters, + ) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + run_assay_manifest( + runtime_root, + &manifest, + Some(&input_text), + Some(participant_id), + loader, + filters, + ) } other => Err(format!("unsupported manifest schema '{other}'")), } } -fn run_variant_manifest( +fn run_manifest_analyses_for_report( runtime_root: &Path, - manifest: &VariantManifest, - input_file: Option<&str>, - participant_id: Option<&str>, + manifest_path: &Path, + input_file: &Path, + participant_id: &str, loader: &GenotypeLoadOptions, -) -> Result, String> { - let input_file = input_file.ok_or("manifest execution requires --input-file")?; - let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) - .map_err(|err| err.to_string())?; - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| err.to_string())?; - Ok(variant_row( - runtime_root, - &manifest.path, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - )) + output_dir: &Path, +) -> Result, String> { + match manifest_schema(manifest_path)?.as_str() { + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let mut analyses = Vec::new(); + analyses.extend(run_interpretations_for_report( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.interpretations, + input_file, + participant_id, + loader, + output_dir, + )?); + for member in &manifest.members { + if member.kind != "assay" { + continue; + } + let Some(path) = &member.path else { + continue; + }; + let resolved = resolve_manifest_path(runtime_root, &manifest.path, path)?; + analyses.extend(run_manifest_analyses_for_report( + runtime_root, + &resolved, + input_file, + participant_id, + loader, + output_dir, + )?); + } + Ok(analyses) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + run_interpretations_for_report( + runtime_root, + &manifest.path, + 
&manifest.name, + &manifest.interpretations, + input_file, + participant_id, + loader, + output_dir, + ) + } + "bioscript:variant:1.0" | "bioscript:variant" => Ok(Vec::new()), + other => Err(format!("unsupported manifest schema '{other}'")), + } } -fn run_panel_manifest( +#[allow(clippy::too_many_arguments)] +fn run_interpretations_for_report( runtime_root: &Path, - panel: &PanelManifest, - input_file: Option<&str>, - participant_id: Option<&str>, + manifest_path: &Path, + manifest_name: &str, + interpretations: &[PanelInterpretation], + input_file: &Path, + participant_id: &str, loader: &GenotypeLoadOptions, - filters: &[String], -) -> Result>, String> { - let input_file = input_file.ok_or("manifest execution requires --input-file")?; - let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) - .map_err(|err| err.to_string())?; - let mut rows = Vec::new(); - - for member in &panel.members { - if member.kind != "variant" { + output_dir: &Path, +) -> Result, String> { + let mut outputs = Vec::new(); + for interpretation in interpretations { + if interpretation.kind != "bioscript" { return Err(format!( - "panel member kind '{}' is not executable yet; panel execution is currently variant-only", - member.kind + "analysis '{}' uses unsupported kind '{}'", + interpretation.id, interpretation.kind )); } - let Some(path) = &member.path else { - return Err("remote panel members are not executable yet".to_owned()); + let script_path = resolve_manifest_path(runtime_root, manifest_path, &interpretation.path)?; + let format = interpretation + .output_format + .as_deref() + .unwrap_or("json") + .to_ascii_lowercase(); + let analysis_dir = output_dir.join("analysis").join(participant_id); + fs::create_dir_all(&analysis_dir).map_err(|err| { + format!( + "failed to create analysis output dir {}: {err}", + analysis_dir.display() + ) + })?; + let extension = match format.as_str() { + "tsv" => "tsv", + "json" => "json", + "jsonl" => "jsonl", + other => return 
Err(format!("unsupported analysis output_format '{other}'")), }; - let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + let output_file = analysis_dir.join(format!("{}.{}", interpretation.id, extension)); + run_bioscript_analysis_script( + runtime_root, + &script_path, + input_file, + &output_file, + participant_id, + loader, + )?; + let rows = parse_analysis_output(&output_file, &format)?; + outputs.push(serde_json::json!({ + "schema": "bioscript:analysis-output:1.0", + "version": "1.0", + "participant_id": participant_id, + "assay_id": manifest_name, + "analysis_id": interpretation.id, + "kind": interpretation.kind, + "output_format": format, + "manifest_path": manifest_path.strip_prefix(runtime_root).unwrap_or(manifest_path).display().to_string(), + "script_path": script_path.strip_prefix(runtime_root).unwrap_or(&script_path).display().to_string(), + "output_file": output_file.strip_prefix(runtime_root).unwrap_or(&output_file).display().to_string(), + "derived_from": interpretation.derived_from.clone(), + "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ + "key": emit.key.clone(), + "label": emit.label.clone(), + "value_type": emit.value_type.clone(), + "format": emit.format.clone(), + })).collect::>(), + "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ + "description": logic.description.clone(), + "source": logic.source.as_ref().map(|source| serde_json::json!({ + "name": source.name.clone(), + "url": source.url.clone(), + })), + })), + "rows": rows, + })); + } + Ok(outputs) +} + +fn run_bioscript_analysis_script( + runtime_root: &Path, + script_path: &Path, + input_file: &Path, + output_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, +) -> Result<(), String> { + let limits = ResourceLimits::new() + .max_duration(Duration::from_millis(1000)) + .max_memory(16 * 1024 * 1024) + .max_allocations(400_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + let runtime = 
BioscriptRuntime::with_config( + runtime_root.to_path_buf(), + RuntimeConfig { + limits, + loader: loader.clone(), + }, + ) + .map_err(|err| err.to_string())?; + runtime + .run_file( + script_path, + None, + vec![ + ( + "input_file", + monty::MontyObject::String(runtime_path_string(runtime_root, input_file)), + ), + ( + "output_file", + monty::MontyObject::String(runtime_path_string(runtime_root, output_file)), + ), + ( + "participant_id", + monty::MontyObject::String(participant_id.to_owned()), + ), + ], + ) + .map(|_| ()) + .map_err(|err| err.to_string()) +} + +fn runtime_path_string(runtime_root: &Path, path: &Path) -> String { + path.strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string() +} + +fn parse_analysis_output(path: &Path, format: &str) -> Result, String> { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read analysis output {}: {err}", path.display()))?; + match format { + "tsv" => parse_analysis_tsv(&text), + "json" => { + let value: serde_json::Value = serde_json::from_str(&text).map_err(|err| { + format!("failed to parse analysis JSON {}: {err}", path.display()) + })?; + Ok(match value { + serde_json::Value::Array(rows) => rows, + serde_json::Value::Object(mut object) => object + .remove("rows") + .and_then(|rows| rows.as_array().cloned()) + .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), + other => vec![other], + }) + } + "jsonl" => text + .lines() + .filter(|line| !line.trim().is_empty()) + .map(|line| serde_json::from_str(line).map_err(|err| err.to_string())) + .collect(), + other => Err(format!("unsupported analysis output_format '{other}'")), + } +} + +fn parse_analysis_tsv(text: &str) -> Result, String> { + let mut lines = text.lines().filter(|line| !line.trim().is_empty()); + let Some(header_line) = lines.next() else { + return Ok(Vec::new()); + }; + let headers: Vec<&str> = header_line.split('\t').collect(); + let mut rows = Vec::new(); + for line in lines { + let values: Vec<&str> = 
line.split('\t').collect(); + let mut object = serde_json::Map::new(); + for (idx, header) in headers.iter().enumerate() { + object.insert( + (*header).to_owned(), + serde_json::Value::String(values.get(idx).copied().unwrap_or_default().to_owned()), + ); + } + rows.push(serde_json::Value::Object(object)); + } + Ok(rows) +} + +fn app_assay_id(path: &Path) -> Result { + match manifest_schema(path)?.as_str() { + "bioscript:panel:1.0" => Ok(load_panel_manifest(path)?.name), + "bioscript:assay:1.0" => Ok(load_assay_manifest(path)?.name), + "bioscript:variant:1.0" | "bioscript:variant" => Ok(load_variant_manifest(path)?.name), + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn participant_id_from_path(path: &Path) -> String { + let file_name = path + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or("participant"); + file_name + .trim_end_matches(".txt.zip") + .trim_end_matches(".csv.zip") + .trim_end_matches(".vcf.gz") + .trim_end_matches(".cram") + .trim_end_matches(".zip") + .trim_end_matches(".txt") + .trim_end_matches(".csv") + .to_owned() +} + +fn app_observation_from_manifest_row( + runtime_root: &Path, + row: &BTreeMap, + assay_id: &str, +) -> Result { + let row_path = row.get("path").cloned().unwrap_or_default(); + let manifest_path = if Path::new(&row_path).is_absolute() { + PathBuf::from(&row_path) + } else { + runtime_root.join(&row_path) + }; + let manifest = load_variant_manifest(&manifest_path)?; + let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); + let genotype_display = row.get("genotype").cloned().unwrap_or_default(); + let alt_alleles = variant_alt_alleles(&manifest_path)?; + let alt_allele = observed_alt_allele(&genotype_display, &ref_allele, &alt_alleles) + .or_else(|| manifest.spec.alternate.clone()) + .unwrap_or_default(); + let (genotype, zygosity) = normalize_app_genotype(&genotype_display, &ref_allele, &alt_allele); + let depth = parse_optional_u32(row.get("depth")); + let ref_count = 
parse_optional_u32(row.get("ref_count")); + let alt_count = parse_optional_u32(row.get("alt_count")); + let allele_balance = match (alt_count, depth) { + (Some(alt_count), Some(depth)) if depth > 0 => { + Some(f64::from(alt_count) / f64::from(depth)) + } + _ => None, + }; + let assembly = row.get("assembly").cloned().unwrap_or_default(); + let locus = if assembly.eq_ignore_ascii_case("grch37") { + manifest.spec.grch37.as_ref() + } else { + manifest + .spec + .grch38 + .as_ref() + .or(manifest.spec.grch37.as_ref()) + }; + let outcome = if genotype == "./." { + "no_call" + } else if zygosity == "hom_ref" { + "reference" + } else if zygosity == "het" || zygosity == "hom_alt" { + "variant" + } else { + "unknown" + }; + let evidence_raw = row.get("evidence").cloned().unwrap_or_default(); + Ok(serde_json::json!({ + "participant_id": row.get("participant_id").cloned().unwrap_or_default(), + "assay_id": assay_id, + "assay_version": "1.0", + "variant_key": manifest.name, + "variant_path": row_path, + "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), + "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, + "chrom": locus.map_or(String::new(), |locus| locus.chrom.clone()), + "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), + "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), + "ref": ref_allele, + "alt": alt_allele, + "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), + "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, + "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), + "call_status": if genotype == "./." 
{ "no_call" } else { "called" }, + "genotype": genotype, + "genotype_display": genotype_display, + "zygosity": zygosity, + "ref_count": ref_count, + "alt_count": alt_count, + "depth": depth, + "genotype_quality": serde_json::Value::Null, + "allele_balance": allele_balance, + "outcome": outcome, + "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, + "evidence_raw": evidence_raw, + "facets": serde_json::Value::Null, + })) +} + +fn variant_alt_alleles(path: &Path) -> Result, String> { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read variant YAML {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse variant YAML {}: {err}", path.display()))?; + let Some(items) = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("alleles".to_owned()))) + .and_then(serde_yaml::Value::as_mapping) + .and_then(|mapping| { + mapping + .get(serde_yaml::Value::String("observed_alts".to_owned())) + .or_else(|| mapping.get(serde_yaml::Value::String("alts".to_owned()))) + }) + .and_then(serde_yaml::Value::as_sequence) + else { + return Ok(Vec::new()); + }; + Ok(items + .iter() + .filter_map(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .collect()) +} + +fn observed_alt_allele( + genotype_display: &str, + ref_allele: &str, + alts: &[String], +) -> Option { + if ref_allele.len() != 1 { + return None; + } + let ref_ch = ref_allele.chars().next()?; + genotype_display + .chars() + .filter(|ch| ch.is_ascii_alphabetic() && *ch != ref_ch) + .find_map(|ch| { + alts.iter() + .find(|alt| alt.len() == 1 && alt.starts_with(ch)) + .cloned() + }) +} + +fn normalize_app_genotype(display: &str, ref_allele: &str, alt_allele: &str) -> (String, String) { + if display.is_empty() { + return ("./.".to_owned(), "unknown".to_owned()); + } + let alleles: Vec = display + .chars() + .filter(|ch| 
ch.is_ascii_alphabetic()) + .collect(); + if alleles.len() != 2 || ref_allele.len() != 1 || alt_allele.len() != 1 { + return (display.to_owned(), "unknown".to_owned()); + } + let ref_ch = ref_allele.chars().next().unwrap_or_default(); + let alt_ch = alt_allele.chars().next().unwrap_or_default(); + let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); + let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); + match (ref_count, alt_count) { + (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), + (1, 1) => ("0/1".to_owned(), "het".to_owned()), + (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), + _ => (display.to_owned(), "unknown".to_owned()), + } +} + +fn parse_optional_u32(value: Option<&String>) -> Option { + value.and_then(|value| value.parse::().ok()) +} + +fn load_manifest_findings( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut findings = Vec::new(); + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) { + if let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let include = json_item + .get("include") + .and_then(serde_json::Value::as_str) + .map(str::to_owned); + if let Some(include) = include { + let include_path = resolve_manifest_path(root, manifest_path, &include)?; + let mut included = load_manifest_findings(root, &include_path)?; + let inherited_binding = json_item.get("binding").cloned(); + for included_item in &mut included { + if inherited_binding.is_some() + && included_item.get("binding").is_none() + && included_item.get("effects").is_none() + { + if let Some(object) = included_item.as_object_mut() { + 
object.insert( + "binding".to_owned(), + inherited_binding.clone().unwrap_or(serde_json::Value::Null), + ); + } + } + } + findings.extend(included); + continue; + } + if json_item.get("include").is_none() { + findings.push(json_item); + } + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") { + if let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + findings.extend(load_manifest_findings(root, &member_path)?); + } + } + } + + Ok(findings) +} + +fn load_yaml_value(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + +fn yaml_to_json(value: serde_yaml::Value) -> Result { + serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) +} + +fn load_manifest_provenance_links( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut links = BTreeMap::::new(); + collect_manifest_provenance_entries(&value, &mut links)?; + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) { + if let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let Some(include) = 
json_item.get("include").and_then(serde_json::Value::as_str) + else { + continue; + }; + let include_path = resolve_manifest_path(root, manifest_path, include)?; + for item in load_manifest_provenance_links(root, &include_path)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") { + if let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + for item in load_manifest_provenance_links(root, &member_path)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + } + + Ok(links.into_values().collect()) +} + +fn collect_manifest_provenance_entries( + value: &serde_yaml::Value, + links: &mut BTreeMap, +) -> Result<(), String> { + if let Some(sources) = value + .get("provenance") + .and_then(|provenance| provenance.get("sources")) + .and_then(serde_yaml::Value::as_sequence) + { + for source in sources { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + } + if let Some(source) = value.get("source") { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + Ok(()) +} + +fn match_app_findings( + findings: &[serde_json::Value], + observations: &[serde_json::Value], + analyses: &[serde_json::Value], +) -> Vec { + 
let mut matched = Vec::new(); + let mut seen = std::collections::BTreeSet::new(); + for finding in findings { + if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { + for effect in effects { + if let Some(observation) = app_finding_match_observation(effect, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + } else if let Some(observation) = app_finding_match_observation(finding, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = 
app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + matched +} + +fn app_finding_match_observation<'a>( + finding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let Some(binding) = finding.get("binding") else { + return None; + }; + match binding.get("source").and_then(serde_json::Value::as_str) { + Some("variant") => app_variant_binding_match_observation(binding, observations), + _ => None, + } +} + +fn app_finding_match_analysis( + finding: &serde_json::Value, + analyses: &[serde_json::Value], +) -> Option { + let binding = finding.get("binding")?; + if binding.get("source").and_then(serde_json::Value::as_str) != Some("analysis") { + return None; + } + let analysis_id = binding + .get("analysis_id") + .or_else(|| binding.get("analysis")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let key = binding.get("key").and_then(serde_json::Value::as_str)?; + for analysis in analyses { + if !analysis_id.is_empty() + && analysis + .get("analysis_id") + .and_then(serde_json::Value::as_str) + != Some(analysis_id) + { + continue; + } + let Some(rows) = analysis.get("rows").and_then(serde_json::Value::as_array) else { + continue; + }; + for row in rows { + if app_binding_matches_value(row.get(key), binding) { + return Some(serde_json::json!({ + "participant_id": analysis.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "assay_id": analysis.get("assay_id").cloned().unwrap_or(serde_json::Value::Null), + "analysis_id": analysis.get("analysis_id").cloned().unwrap_or(serde_json::Value::Null), + "key": key, + "value": row.get(key).cloned().unwrap_or(serde_json::Value::Null), + "row": row, + })); + } + } + } + None +} + +fn app_variant_binding_match_observation<'a>( + binding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let operator = binding + .get("operator") + 
.and_then(serde_json::Value::as_str) + .unwrap_or("equals"); + if matches!(operator, "dosage_equals" | "dosage_in") { + let allele = binding + .get("allele") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + return observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .find(|observation| { + let dosage = app_observation_allele_dosage(observation, allele); + app_binding_matches_dosage(dosage, binding) + }); + } + + let key = binding + .get("key") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if key.is_empty() { + return None; + } + observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .find(|observation| app_binding_matches_value(observation.get(key), binding)) +} + +fn app_finding_observation_context(observation: &serde_json::Value) -> serde_json::Value { + serde_json::json!({ + "participant_id": observation.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "rsid": observation.get("rsid").cloned().unwrap_or(serde_json::Value::Null), + "ref": observation.get("ref").cloned().unwrap_or(serde_json::Value::Null), + "alt": observation.get("alt").cloned().unwrap_or(serde_json::Value::Null), + "genotype_display": observation.get("genotype_display").cloned().unwrap_or(serde_json::Value::Null), + "outcome": observation.get("outcome").cloned().unwrap_or(serde_json::Value::Null), + }) +} + +fn app_variant_ref_mismatch(binding: &serde_json::Value, observation: &serde_json::Value) -> bool { + let variant_ref = binding + .get("variant") + .or_else(|| binding.get("path")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if variant_ref.is_empty() { + return false; + } + let basename = Path::new(variant_ref) + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or(variant_ref); + let candidates = [ + observation + .get("variant_key") + .and_then(serde_json::Value::as_str), + observation + .get("variant_path") + 
.and_then(serde_json::Value::as_str), + observation.get("rsid").and_then(serde_json::Value::as_str), + ]; + !candidates.into_iter().flatten().any(|candidate| { + candidate == variant_ref + || Path::new(candidate) + .file_name() + .and_then(|value| value.to_str()) + .is_some_and(|value| value == basename) + }) +} + +fn app_observation_allele_dosage(observation: &serde_json::Value, allele: &str) -> Option { + if allele.is_empty() { + return None; + } + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let zygosity = observation + .get("zygosity") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele == ref_allele { + return match zygosity { + "hom_ref" => Some(2), + "het" => Some(1), + "hom_alt" => Some(0), + _ => None, + }; + } + if allele == alt_allele { + return match zygosity { + "hom_ref" => Some(0), + "het" => Some(1), + "hom_alt" => Some(2), + _ => None, + }; + } + let display = observation + .get("genotype_display") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele.len() == 1 { + let allele_ch = allele.chars().next()?.to_ascii_uppercase(); + return Some( + display + .chars() + .filter(|ch| ch.to_ascii_uppercase() == allele_ch) + .count() + .try_into() + .ok()?, + ); + } + None +} + +fn app_binding_matches_value( + actual: Option<&serde_json::Value>, + binding: &serde_json::Value, +) -> bool { + let actual = actual.and_then(value_as_string).unwrap_or_default(); + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or("equals") + { + "equals" => binding + .get("value") + .and_then(value_as_string) + .is_some_and(|value| value == actual), + "in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(value_as_string) + .any(|value| value == actual) + }), 
+ _ => false, + } +} + +fn app_binding_matches_dosage(dosage: Option, binding: &serde_json::Value) -> bool { + let Some(dosage) = dosage else { + return false; + }; + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "dosage_equals" => binding + .get("value") + .and_then(serde_json::Value::as_i64) + .is_some_and(|value| value == dosage), + "dosage_in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(serde_json::Value::as_i64) + .any(|value| value == dosage) + }), + _ => false, + } +} + +fn value_as_string(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::String(value) => Some(value.clone()), + serde_json::Value::Number(value) => Some(value.to_string()), + serde_json::Value::Bool(value) => Some(value.to_string()), + _ => None, + } +} + +fn app_finding_dedupe_key(finding: &serde_json::Value) -> String { + let effect_key = finding + .get("matched_effect") + .and_then(|effect| { + effect + .get("id") + .or_else(|| effect.get("label")) + .or_else(|| effect.get("text")) + }) + .and_then(value_as_string) + .unwrap_or_default(); + if let Some(evidence) = finding.get("evidence") { + let source = evidence + .get("source") + .and_then(value_as_string) + .unwrap_or_default(); + let kind = evidence + .get("kind") + .and_then(value_as_string) + .unwrap_or_default(); + let id = evidence + .get("id") + .and_then(value_as_string) + .unwrap_or_default(); + if !source.is_empty() || !kind.is_empty() || !id.is_empty() { + return format!("evidence|{source}|{kind}|{id}|{effect_key}"); + } + if let Some(url) = evidence.get("url").and_then(value_as_string) { + return format!("evidence_url|{url}|{effect_key}"); + } + } + if let Some(id) = finding.get("id").and_then(value_as_string) { + return format!("id|{id}|{effect_key}"); + } + format!( + "content|{}|{}|{}|{}", + finding + .get("schema") + .and_then(value_as_string) + 
.unwrap_or_default(), + finding + .get("label") + .and_then(value_as_string) + .unwrap_or_default(), + finding + .get("notes") + .and_then(value_as_string) + .unwrap_or_default(), + effect_key + ) +} + +fn app_report_json( + assay_id: &str, + participant_id: &str, + input_file: &Path, + observations: &[serde_json::Value], + analyses: &[serde_json::Value], + findings: &[serde_json::Value], + provenance: &[serde_json::Value], +) -> serde_json::Value { + let called = observations + .iter() + .filter(|item| { + item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") + }) + .count(); + serde_json::json!({ + "schema": "bioscript:report:1.0", + "version": "1.0", + "participant_id": participant_id, + "assay_id": assay_id, + "assay_version": "1.0", + "input": { + "file_name": input_file.file_name().and_then(|value| value.to_str()).unwrap_or_default(), + "file_path": input_file.display().to_string(), + }, + "report_status": if called == observations.len() { "complete" } else { "partial" }, + "derived_from": observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), + "analyses": analyses, + "findings": findings, + "provenance": provenance, + "metrics": { + "n_sites_tested": observations.len(), + "n_sites_called": called, + "n_sites_missing": observations.len().saturating_sub(called), + "n_analyses": analyses.len(), + "n_findings_matched": findings.len(), + } + }) +} + +fn write_app_observations( + output_dir: &Path, + observations: &[serde_json::Value], + format: AppOutputFormat, +) -> Result<(), String> { + if matches!(format, AppOutputFormat::Tsv | AppOutputFormat::Both) { + let mut out = bioscript_core::OBSERVATION_TSV_HEADERS.join("\t"); + out.push('\n'); + for observation in observations { + let line = bioscript_core::OBSERVATION_TSV_HEADERS + .iter() + .map(|header| json_field_as_tsv(observation.get(*header))) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + 
fs::write(output_dir.join("observations.tsv"), out) + .map_err(|err| format!("failed to write observations.tsv: {err}"))?; + } + if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { + write_jsonl(&output_dir.join("observations.jsonl"), observations)?; + } + if matches!(format, AppOutputFormat::Json) { + write_json_pretty( + &output_dir.join("observations.json"), + &serde_json::json!({"observations": observations}), + )?; + } + Ok(()) +} + +fn write_app_analyses(output_dir: &Path, analyses: &[serde_json::Value]) -> Result<(), String> { + write_jsonl(&output_dir.join("analysis.jsonl"), analyses) +} + +fn write_app_reports( + output_dir: &Path, + reports: &[serde_json::Value], + format: AppOutputFormat, +) -> Result<(), String> { + if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { + write_jsonl(&output_dir.join("reports.jsonl"), reports)?; + } + if matches!(format, AppOutputFormat::Json | AppOutputFormat::Both) { + write_json_pretty( + &output_dir.join("reports.json"), + &serde_json::json!({ + "schema": "bioscript:report-set:1.0", + "version": "1.0", + "reports": reports, + }), + )?; + } + Ok(()) +} + +fn write_jsonl(path: &Path, rows: &[serde_json::Value]) -> Result<(), String> { + let mut out = String::new(); + for row in rows { + let line = serde_json::to_string(row).map_err(|err| err.to_string())?; + out.push_str(&line); + out.push('\n'); + } + fs::write(path, out).map_err(|err| format!("failed to write {}: {err}", path.display())) +} + +fn write_json_pretty(path: &Path, value: &serde_json::Value) -> Result<(), String> { + let text = serde_json::to_string_pretty(value).map_err(|err| err.to_string())?; + fs::write(path, text).map_err(|err| format!("failed to write {}: {err}", path.display())) +} + +fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { + match value { + Some(serde_json::Value::Null) | None => String::new(), + Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), + Some(value) 
=> value.to_string().replace(['\t', '\n'], " "), + } +} + +fn write_app_html( + output_dir: &Path, + observations: &[serde_json::Value], + reports: &[serde_json::Value], +) -> Result<(), String> { + let mut out = String::from( + r##"BioScript report

BioScript Report

"##, + ); + let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); + let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); + let analysis_outputs = collect_report_analyses(reports); + let _ = write!( + out, + "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", + observations.len(), + analysis_outputs.len(), + label_findings.len(), + summary_findings.len() + ); + out.push_str(""); + out.push_str("

Observations

"); + render_observation_table(&mut out, observations); + out.push_str("
"); + out.push_str("

Analysis

"); + render_analysis_tables(&mut out, &analysis_outputs); + out.push_str("
"); + out.push_str("

PGx Label Annotations

"); + render_pgx_label_table(&mut out, &label_findings); + out.push_str("
"); + out.push_str("

PGx Summary Annotations

"); + render_pgx_summary_table(&mut out, &summary_findings); + out.push_str("
"); + out.push_str("

Provenance

"); + render_provenance_links(&mut out, reports); + out.push_str("
"); + out.push_str("

Raw Reports JSON

"); + for report in reports { + let text = serde_json::to_string_pretty(report).map_err(|err| err.to_string())?; + let _ = write!(out, "
{}
", html_escape(&text)); + } + out.push_str("
"); + fs::write(output_dir.join("index.html"), out) + .map_err(|err| format!("failed to write index.html: {err}")) +} + +fn collect_report_analyses(reports: &[serde_json::Value]) -> Vec { + reports + .iter() + .filter_map(|report| report.get("analyses").and_then(serde_json::Value::as_array)) + .flat_map(|analyses| analyses.iter()) + .cloned() + .collect() +} + +fn collect_report_findings(reports: &[serde_json::Value], schema: &str) -> Vec { + reports + .iter() + .filter_map(|report| report.get("findings").and_then(serde_json::Value::as_array)) + .flat_map(|findings| findings.iter()) + .filter(|finding| finding.get("schema").and_then(serde_json::Value::as_str) == Some(schema)) + .cloned() + .collect() +} + +fn render_analysis_tables(out: &mut String, analyses: &[serde_json::Value]) { + if analyses.is_empty() { + out.push_str("

No analysis outputs.

"); + return; + } + for (index, analysis) in analyses.iter().enumerate() { + let table_id = format!("analysis-table-{index}"); + let title = format!( + "{} / {}", + value_str(analysis, "participant_id"), + value_str(analysis, "analysis_id") + ); + let _ = write!(out, "

{}

", html_escape(&title)); + render_analysis_logic(out, analysis); + let rows = analysis + .get("rows") + .and_then(serde_json::Value::as_array) + .cloned() + .unwrap_or_default(); + if rows.is_empty() { + out.push_str("

No rows emitted.

"); + continue; + } + let headers = analysis_row_headers(&rows); + let header_refs = headers.iter().map(String::as_str).collect::>(); + render_table_start(out, &table_id, &header_refs); + for row in rows { + out.push_str(""); + for header in &headers { + table_cell(out, &json_field_as_tsv(row.get(header))); + } + out.push_str(""); + } + render_table_end(out); + } +} + +fn analysis_row_headers(rows: &[serde_json::Value]) -> Vec { + let mut headers = Vec::new(); + for row in rows { + let Some(object) = row.as_object() else { + continue; + }; + for key in object.keys() { + if !headers.contains(key) { + headers.push(key.clone()); + } + } + } + headers +} + +fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { + let Some(logic) = analysis.get("logic") else { + return; + }; + if logic.is_null() { + return; + } + let description = logic + .get("description") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let source = logic.get("source").unwrap_or(&serde_json::Value::Null); + let source_name = source + .get("name") + .and_then(serde_json::Value::as_str) + .unwrap_or("source"); + let source_url = source + .get("url") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + out.push_str("
"); + if !description.is_empty() { + let _ = write!(out, "

{}

", html_escape(description)); + } + if !source_url.is_empty() { + let _ = write!( + out, + "

Logic source: {}

", + html_escape(source_url), + html_escape(source_name) + ); + } + out.push_str("
"); +} + +fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { + let mut links = BTreeMap::::new(); + for report in reports { + collect_provenance_links_from_value(report, &mut links); + } + if links.is_empty() { + out.push_str("

No provenance links.

"); + return; + } + out.push_str("
    "); + for (url, label) in links { + let display = if label.is_empty() { &url } else { &label }; + let _ = write!( + out, + "
  • {}
    {}
  • ", + html_escape(&url), + html_escape(display), + html_escape(&url) + ); + } + out.push_str("
"); +} + +fn collect_provenance_links_from_value( + value: &serde_json::Value, + links: &mut BTreeMap, +) { + match value { + serde_json::Value::Object(object) => { + if let Some(url) = object.get("url").and_then(serde_json::Value::as_str) + && url.starts_with("http") + { + let label = object + .get("name") + .or_else(|| object.get("label")) + .or_else(|| object.get("source")) + .and_then(value_as_string) + .unwrap_or_default(); + links.entry(url.to_owned()).or_insert(label); + } + for child in object.values() { + collect_provenance_links_from_value(child, links); + } + } + serde_json::Value::Array(items) => { + for item in items { + collect_provenance_links_from_value(item, links); + } + } + _ => {} + } +} + +fn render_observation_table(out: &mut String, observations: &[serde_json::Value]) { + let headers = [ + "participant_id", + "rsid", + "ref", + "alt", + "genotype_display", + "genotype", + "zygosity", + "outcome", + "match_status", + "coverage_status", + "call_status", + "assembly", + "chrom", + "pos_start", + "pos_end", + "kind", + "ref_count", + "alt_count", + "depth", + "genotype_quality", + "allele_balance", + "evidence_type", + "evidence_raw", + "facets", + "assay_id", + "assay_version", + "variant_key", + ]; + render_table_start(out, "observations-table", &headers); + for observation in observations { + let _ = write!(out, "", observation_row_class(observation)); + for header in headers { + render_observation_cell(out, observation, header); + } + out.push_str(""); + } + out.push_str(""); +} + +fn observation_row_class(observation: &serde_json::Value) -> &'static str { + match observation + .get("outcome") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "variant" => "row-variant", + "reference" => "row-reference", + _ => "", + } +} + +fn render_observation_cell(out: &mut String, observation: &serde_json::Value, header: &str) { + if header == "genotype_display" { + let outcome = observation + .get("outcome") + 
.and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let value = json_field_as_tsv(observation.get(header)); + if outcome == "variant" { + let alt = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let _ = write!( + out, + "{}", + highlight_allele(&value, alt) + ); + return; + } + } + let _ = write!( + out, + "{}", + html_escape(&json_field_as_tsv(observation.get(header))) + ); +} + +fn highlight_allele(value: &str, allele: &str) -> String { + if value.is_empty() || allele.is_empty() { + return html_escape(value); + } + if allele.chars().count() == 1 { + let target = allele + .chars() + .next() + .unwrap_or_default() + .to_ascii_uppercase(); + let mut out = String::new(); + for ch in value.chars() { + let escaped = html_escape(&ch.to_string()); + if ch.to_ascii_uppercase() == target { + let _ = write!(out, "{escaped}"); + } else { + out.push_str(&escaped); + } + } + return out; + } + let escaped_value = html_escape(value); + let escaped_allele = html_escape(allele); + escaped_value.replace( + &escaped_allele, + &format!("{escaped_allele}"), + ) +} + +fn render_pgx_label_table(out: &mut String, findings: &[serde_json::Value]) { + let headers = [ + "Variant", + "Ref/Alt", + "Genes", + "Drug(s)", + "Regulator", + "Action", + "Label", + "Evidence", + ]; + render_pgx_label_filters(out); + render_table_start(out, "labels-table", &headers); + for finding in findings { + let evidence = finding.get("evidence"); + let url = evidence + .and_then(|value| value.get("url")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let pgx_level = value_str(finding, "pgx_action_level"); + let _ = write!( + out, + "", + html_escape(&pgx_level_slug(pgx_level)) + ); + table_cell(out, value_str(finding, "variant")); + class_cell(out, &matched_ref_alt(finding), "mono"); + table_cell(out, &join_string_array(finding.get("genes"))); + table_cell(out, &join_drugs(finding)); + table_cell(out, 
&join_string_array(finding.get("regulatory_sources"))); + pgx_level_cell(out, pgx_level); + table_cell(out, value_str(finding, "label")); + link_cell(out, url); + out.push_str(""); + } + render_table_end(out); +} + +fn render_pgx_summary_table(out: &mut String, findings: &[serde_json::Value]) { + let headers = [ + "Variant", + "Ref/Alt", + "Genotype", + "Drug(s)", + "Category", + "Level", + "Phenotype", + "Effect", + "Evidence", + ]; + render_evidence_level_filters(out); + render_table_start(out, "summaries-table", &headers); + for finding in findings { + let effect = finding + .get("matched_effect") + .unwrap_or(&serde_json::Value::Null); + let evidence = finding.get("evidence"); + let url = evidence + .and_then(|value| value.get("url")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let evidence_level = value_str(finding, "evidence_level"); + let _ = write!( + out, + "", + html_escape(&evidence_level_group(evidence_level)) + ); + table_cell(out, value_str(finding, "variant")); + class_cell(out, &matched_ref_alt(finding), "mono"); + table_cell(out, value_str(effect, "label")); + table_cell(out, &join_drugs(finding)); + table_cell(out, &join_string_array(finding.get("phenotype_categories"))); + evidence_level_cell(out, evidence_level); + table_cell(out, &join_string_array(finding.get("phenotypes"))); + class_cell(out, value_str(effect, "text"), "effect"); + link_cell(out, url); + out.push_str(""); + } + render_table_end(out); +} + +fn render_evidence_level_filters(out: &mut String) { + out.push_str("
Evidence:"); + for (level, label) in [ + ("1", "Level 1"), + ("1a", "Level 1A"), + ("1b", "Level 1B"), + ("2", "Level 2"), + ("2a", "Level 2A"), + ("2b", "Level 2B"), + ("3", "Level 3"), + ("4", "Level 4"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str(""); + out.push_str("i
"); +} + +fn render_pgx_label_filters(out: &mut String) { + out.push_str("
PGx level:"); + for (level, label) in [ + ("required", "Testing Required"), + ("recommended", "Testing Recommended"), + ("actionable", "Actionable PGx"), + ("informative", "Informative PGx"), + ("no-clinical", "No Clinical PGx"), + ("criteria", "Criteria Not Met"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str(""); + out.push_str("i
"); +} + +fn matched_ref_alt(finding: &serde_json::Value) -> String { + let Some(observation) = finding.get("matched_observation") else { + return String::new(); + }; + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if ref_allele.is_empty() && alt_allele.is_empty() { + String::new() + } else { + let alt_display = alt_allele.replace(',', "/"); + format!("{ref_allele}->{alt_display}") + } +} + +fn evidence_level_group(level: &str) -> String { + let normalized = level.trim().to_ascii_lowercase(); + if normalized.starts_with("1a") { + "1a".to_owned() + } else if normalized.starts_with("1b") { + "1b".to_owned() + } else if normalized.starts_with('1') { + "1".to_owned() + } else if normalized.starts_with("2a") { + "2a".to_owned() + } else if normalized.starts_with("2b") { + "2b".to_owned() + } else if normalized.starts_with('2') { + "2".to_owned() + } else if normalized.starts_with('3') { + "3".to_owned() + } else if normalized.starts_with('4') { + "4".to_owned() + } else { + "unknown".to_owned() + } +} + +fn evidence_level_color_group(level: &str) -> String { + level + .chars() + .find(|ch| ch.is_ascii_digit()) + .map(|ch| ch.to_string()) + .unwrap_or_else(|| "unknown".to_owned()) +} + +fn evidence_level_cell(out: &mut String, level: &str) { + if level.is_empty() { + out.push_str(""); + return; + } + let group = evidence_level_color_group(level); + let _ = write!( + out, + "{}", + html_escape(&group), + html_escape(level) + ); +} + +fn pgx_level_slug(level: &str) -> String { + let normalized = level.to_ascii_lowercase(); + if normalized.contains("required") { + "required".to_owned() + } else if normalized.contains("recommended") { + "recommended".to_owned() + } else if normalized.contains("actionable") { + "actionable".to_owned() + } else if normalized.contains("informative") { + "informative".to_owned() 
+ } else if normalized.contains("no clinical") { + "no-clinical".to_owned() + } else if normalized.contains("criteria") { + "criteria".to_owned() + } else { + "unknown".to_owned() + } +} + +fn pgx_level_cell(out: &mut String, level: &str) { + if level.is_empty() { + out.push_str(""); + return; + } + let slug = pgx_level_slug(level); + let _ = write!( + out, + "{}", + html_escape(&slug), + html_escape(level) + ); +} + +fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { + let escaped_id = html_escape(table_id); + let refs_control = if table_id == "observations-table" { + "" + } else { + "" + }; + let _ = write!( + out, + "
{refs_control}
" + ); + for (index, header) in headers.iter().enumerate() { + let _ = write!( + out, + "", + escaped_id, + index, + html_escape(header) + ); + } + out.push_str(""); +} + +fn render_table_end(out: &mut String) { + out.push_str("
{}
"); +} + +fn table_cell(out: &mut String, value: &str) { + class_cell(out, value, ""); +} + +fn class_cell(out: &mut String, value: &str, class_name: &str) { + if class_name.is_empty() { + let _ = write!(out, "{}", html_escape(value)); + } else { + let _ = write!( + out, + "{}", + class_name, + html_escape(value) + ); + } +} + +fn link_cell(out: &mut String, url: &str) { + if url.is_empty() { + out.push_str(""); + } else { + let escaped = html_escape(url); + let _ = write!( + out, + "source" + ); + } +} + +fn value_str<'a>(value: &'a serde_json::Value, key: &str) -> &'a str { + value + .get(key) + .and_then(serde_json::Value::as_str) + .unwrap_or_default() +} + +fn join_string_array(value: Option<&serde_json::Value>) -> String { + value + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(serde_json::Value::as_str) + .collect::>() + .join(", ") + }) + .unwrap_or_default() +} + +fn join_drugs(finding: &serde_json::Value) -> String { + finding + .get("drugs") + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(|drug| drug.get("name").and_then(serde_json::Value::as_str)) + .collect::>() + .join(", ") + }) + .unwrap_or_default() +} + +fn html_escape(value: &str) -> String { + value + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) +} + +fn run_validate_panels(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-panels [--report ]".to_owned()); + }; + + let report = 
validate_panels_path(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} + +fn run_validate_assays(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-assays [--report ]".to_owned()); + }; + + let report = validate_assays_path(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} + +fn is_yaml_manifest(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| matches!(ext, "yaml" | "yml")) +} + 
+struct ManifestRunOptions<'a> { + input_file: Option<&'a str>, + output_file: Option<&'a str>, + participant_id: Option<&'a str>, + trace_report: Option<&'a Path>, + loader: &'a GenotypeLoadOptions, + filters: &'a [String], +} + +fn run_manifest( + runtime_root: &Path, + manifest_path: &Path, + options: &ManifestRunOptions<'_>, +) -> Result<(), String> { + let schema = manifest_schema(manifest_path)?; + let resolved_input = options + .input_file + .map(|value| resolve_cli_path(runtime_root, value)); + let resolved_output = options + .output_file + .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); + let resolved_trace = options + .trace_report + .map(|value| resolve_cli_path_buf(runtime_root, value)); + match schema.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + let row = run_variant_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + )?; + write_manifest_outputs( + std::slice::from_ref(&row), + resolved_output.as_deref(), + resolved_trace.as_deref(), + )?; + Ok(()) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let rows = run_panel_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + options.filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + let rows = run_assay_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + options.filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_variant_manifest( + runtime_root: &Path, + manifest: &VariantManifest, + input_file: 
Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, +) -> Result, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + Ok(variant_row( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )) +} + +fn run_panel_manifest( + runtime_root: &Path, + panel: &PanelManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let mut rows = Vec::new(); + + for member in &panel.members { + let Some(path) = &member.path else { + return Err("remote panel members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + if member.kind == "variant" { + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + rows.push(variant_row( + runtime_root, + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } else if member.kind == "assay" { + let assay = load_assay_manifest(&resolved)?; + rows.extend(run_assay_manifest_with_store( + runtime_root, + &assay, + &store, + participant_id, + filters, + )?); + } else { + return Err(format!( + "panel member kind '{}' is not executable", + member.kind + )); + } + } + + Ok(rows) +} + +fn run_assay_manifest( + runtime_root: &Path, + assay: &AssayManifest, + input_file: 
Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + run_assay_manifest_with_store(runtime_root, assay, &store, participant_id, filters) +} + +fn run_assay_manifest_with_store( + runtime_root: &Path, + assay: &AssayManifest, + store: &GenotypeStore, + participant_id: Option<&str>, + filters: &[String], +) -> Result>, String> { + let mut rows = Vec::new(); + + for member in &assay.members { + if member.kind != "variant" { + return Err(format!( + "assay member kind '{}' is not executable", + member.kind + )); + } + let Some(path) = &member.path else { + return Err("remote assay members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &assay.path, path)?; let manifest = load_variant_manifest(&resolved)?; if !matches_filters(&manifest, &resolved, filters) { continue; diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 5936b1d..e7e4bb8 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -310,3 +310,72 @@ members: assert!(stdout.contains("example-rs73885319")); assert!(!stdout.contains("example-rs60910145")); } + +#[test] +fn assay_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("assay-manifest"); + fs::write( + dir.join("rs73885319.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + let assay = dir.join("assay.yaml"); + fs::write( + &assay, + r#" +schema: "bioscript:assay:1.0" +version: "1.0" +name: "example-assay" +members: + - kind: "variant" + path: 
"rs73885319.yaml" + version: "1.0" +interpretations: + - id: "example_status" + kind: "bioscript" + path: "example.py" + derived_from: + - "rs73885319.yaml" + emits: + - key: "example_status" + label: "Example status" + value_type: "string" + format: "badge" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&assay) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("example-rs73885319")); + assert!(stdout.contains("AG")); +} diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 5e2c349..f4a8661 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -375,6 +375,7 @@ struct ParsedDelimitedRow { chrom: Option, position: Option, genotype: String, + raw_line: String, } impl DelimitedBackend { @@ -1454,6 +1455,7 @@ impl RowParser { chrom, position, genotype: normalize_genotype(&genotype), + raw_line: sanitize_evidence_line(line), })) } @@ -1526,6 +1528,17 @@ fn strip_inline_comment(value: &str) -> String { value.trim().to_owned() } +fn sanitize_evidence_line(line: &str) -> String { + line.trim_end_matches(['\n', '\r']) + .chars() + .map(|ch| match ch { + '\t' => " ".to_owned(), + ch if ch.is_control() => " ".to_owned(), + ch => ch.to_string(), + }) + .collect::() +} + fn normalize_genotype(value: &str) -> String { let cleaned = value.trim().replace(' ', "").to_ascii_uppercase(); if cleaned.is_empty() || matches!(cleaned.as_str(), "NA" | "N/A" | "#N/A" | "NONE") { @@ -1569,6 +1582,7 @@ struct ParsedVcfRow { reference: String, alternates: Vec, genotype: String, + raw_line: String, } fn scan_vcf_variants( @@ -1718,7 +1732,10 @@ fn resolve_vcf_row( matched_rsid: Some(rsid.clone()), assembly: 
targets.detected_assembly, genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], + evidence: vec![ + format!("resolved by rsid {rsid}"), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; *unresolved = (*unresolved).saturating_sub(1); @@ -1745,7 +1762,10 @@ fn resolve_vcf_row( matched_rsid: row.rsid.clone(), assembly: targets.detected_assembly, genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by locus {}:{}", row.chrom, row.position)], + evidence: vec![ + format!("resolved by locus {}:{}", row.chrom, row.position), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; *unresolved = (*unresolved).saturating_sub(1); @@ -1801,6 +1821,7 @@ fn parse_vcf_record(line: &str) -> Result, RuntimeError> { reference: reference.to_owned(), alternates, genotype, + raw_line: sanitize_evidence_line(line), })) } @@ -2033,7 +2054,10 @@ fn scan_delimited_variants( backend: backend.backend_name().to_owned(), matched_rsid: Some(rsid.clone()), genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], + evidence: vec![ + format!("resolved by rsid {rsid}"), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; unresolved = unresolved.saturating_sub(1); @@ -2057,7 +2081,10 @@ fn scan_delimited_variants( backend: backend.backend_name().to_owned(), matched_rsid: row.rsid.clone(), genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by locus {}:{}", chrom, position)], + evidence: vec![ + format!("resolved by locus {}:{}", chrom, position), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; unresolved = unresolved.saturating_sub(1); @@ -2251,6 +2278,7 @@ fn parse_streaming_row( chrom, position, genotype: normalize_genotype(&genotype), + raw_line: sanitize_evidence_line(line), })) } diff --git a/rust/bioscript-schema/src/lib.rs 
b/rust/bioscript-schema/src/lib.rs index 174a358..3da1f4c 100644 --- a/rust/bioscript-schema/src/lib.rs +++ b/rust/bioscript-schema/src/lib.rs @@ -5,8 +5,9 @@ pub use remote_resource::{ RemoteDependency, RemoteResourceKind, RemoteResourceResolution, resolve_remote_resource_text, }; pub use validator::{ - Download, FileReport, Issue, PanelManifest, PanelMember, Permissions, Severity, - ValidationReport, VariantManifest, load_panel_manifest, load_variant_manifest, - load_variant_manifest_text, load_variant_manifest_text_for_lookup, validate_panels_path, - validate_variants_path, + AssayManifest, Download, FileReport, Issue, PanelInterpretation, PanelInterpretationLogic, + PanelInterpretationLogicSource, PanelManifest, PanelMember, Permissions, Severity, + ValidationReport, VariantManifest, load_assay_manifest, load_panel_manifest, + load_variant_manifest, load_variant_manifest_text, load_variant_manifest_text_for_lookup, + validate_assays_path, validate_panels_path, validate_variants_path, }; diff --git a/rust/bioscript-schema/src/validator.rs b/rust/bioscript-schema/src/validator.rs index 8494bb4..64d3211 100644 --- a/rust/bioscript-schema/src/validator.rs +++ b/rust/bioscript-schema/src/validator.rs @@ -113,6 +113,16 @@ pub struct PanelManifest { pub permissions: Permissions, pub downloads: Vec, pub members: Vec, + pub interpretations: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct AssayManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub members: Vec, + pub interpretations: Vec, } #[derive(Debug, Clone, PartialEq, Eq, Default)] @@ -138,6 +148,37 @@ pub struct PanelMember { pub version: Option, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretation { + pub id: String, + pub kind: String, + pub path: String, + pub output_format: Option, + pub derived_from: Vec, + pub emits: Vec, + pub logic: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationLogic { + pub source: Option, + pub 
description: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationLogicSource { + pub name: Option, + pub url: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationEmit { + pub key: String, + pub label: Option, + pub value_type: Option, + pub format: Option, +} + /// Validate a variant file or directory of variant files. /// /// # Errors @@ -158,6 +199,16 @@ pub fn validate_panels_path(path: &Path) -> Result { validate_manifest_path(path, ManifestSelector::Panel) } +/// Validate an assay file or directory of assay files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +pub fn validate_assays_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Assay) +} + /// Load a single variant manifest from YAML. /// /// # Errors @@ -254,6 +305,7 @@ pub fn load_panel_manifest(path: &Path) -> Result { }; let downloads = parse_downloads(&value)?; let members = parse_panel_members(&value)?; + let interpretations = parse_panel_interpretations(&value)?; Ok(PanelManifest { path: path.to_path_buf(), @@ -262,11 +314,36 @@ pub fn load_panel_manifest(path: &Path) -> Result { permissions, downloads, members, + interpretations, + }) +} + +/// Load a single assay manifest from YAML. +/// +/// # Errors +/// +/// Returns an error when the file does not parse or is not a valid assay +/// manifest. 
+pub fn load_assay_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + let mut issues = Vec::new(); + validate_assay_root(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(AssayManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + members: parse_panel_members(&value)?, + interpretations: parse_panel_interpretations(&value)?, }) } #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum ManifestSelector { + Assay, Variant, Panel, } @@ -279,6 +356,7 @@ fn validate_manifest_path( let mut reports = Vec::new(); for file in &files { let report = match selector { + ManifestSelector::Assay => validate_assay_file(file)?, ManifestSelector::Variant => validate_variant_file(file)?, ManifestSelector::Panel => validate_panel_file(file)?, }; @@ -324,6 +402,33 @@ fn collect_yaml_files_recursive(path: &Path, files: &mut Vec) -> Result Ok(()) } +fn validate_assay_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("assay") { + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } + + let mut issues = Vec::new(); + validate_assay_root(&value, &mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} + fn validate_variant_file(path: &Path) -> Result { let value = load_yaml(path)?; let Some(schema) = scalar_at(&value, &["schema"]) else { @@ -337,6 +442,14 @@ fn validate_variant_file(path: &Path) -> Result { }); }; if !schema.contains("variant") { + if schema == "bioscript:pgx-findings:1.0" { + let mut issues = Vec::new(); + 
validate_pgx_findings_root(&value, &mut issues); + return Ok(FileReport { + file: path.to_path_buf(), + issues, + }); + } return Ok(FileReport { file: path.to_path_buf(), issues: Vec::new(), @@ -414,7 +527,45 @@ fn validate_panel_root(root: &Value, issues: &mut Vec) { validate_tags(root, issues); validate_permissions(root, issues); validate_downloads(root, issues); - validate_panel_members(root, issues); + validate_panel_members(root, &["variant", "assay"], issues); + validate_panel_interpretations(root, issues); + validate_findings(root, issues); +} + +fn validate_assay_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity(root, "bioscript:assay:1.0", None, issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + validate_tags(root, issues); + validate_panel_members(root, &["variant"], issues); + validate_panel_interpretations(root, issues); + validate_findings(root, issues); +} + +fn validate_pgx_findings_root(root: &Value, issues: &mut Vec) { + require_const(root, &["schema"], "bioscript:pgx-findings:1.0", issues); + require_const(root, &["version"], "1.0", issues); + validate_optional_strings(root, &["variant", "gene", "rsid", "variant_pa_id"], issues); + if value_at(root, &["variant"]).is_none() && value_at(root, &["rsid"]).is_none() { + issues.push(Issue { + severity: Severity::Error, + path: "variant/rsid".to_owned(), + message: "expected at least one variant identifier".to_owned(), + }); + } + match value_at(root, &["findings"]) { + Some(Value::Sequence(_)) => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: "findings".to_owned(), + message: "expected a sequence of findings".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: "findings".to_owned(), + message: "missing required field".to_owned(), + }), + } + validate_findings(root, issues); } fn validate_schema_and_identity( @@ -750,8 +901,30 @@ fn validate_alleles(root: &Value, issues: &mut Vec) { } 
alts.push(alt.to_owned()); } - validate_symbolic_alleles(&reference, &alts, issues); - validate_snv_alleles(&kind, &reference, &alts, issues); + let observed_alts = match seq_of_strings(root, &["alleles", "observed_alts"]) { + Some(items) => { + if items.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.observed_alts".to_owned(), + message: "expected a non-empty sequence of strings when present".to_owned(), + }); + } + for alt in &alts { + if !items.iter().any(|item| item == alt) { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.observed_alts".to_owned(), + message: format!("significant alt '{alt}' is not present in observed_alts"), + }); + } + } + items + } + None => alts.clone(), + }; + validate_symbolic_alleles(&reference, &observed_alts, issues); + validate_snv_alleles(&kind, &reference, &observed_alts, issues); } fn validate_symbolic_alleles(reference: &str, alts: &[String], issues: &mut Vec) { @@ -832,9 +1005,17 @@ fn validate_findings(root: &Value, issues: &mut Vec) { message: "empty string".to_owned(), }); } + if schema == "bioscript:pgx:1.0" { + issues.push(Issue { + severity: Severity::Warning, + path: format!("findings[{idx}].schema"), + message: "legacy PGx finding schema; prefer bioscript:pgx-summary:1.0 or bioscript:pgx-label:1.0".to_owned(), + }); + } if let Some(alt) = mapping .get(Value::String("alt".to_owned())) .and_then(Value::as_str) + && !alts.is_empty() && alt != "*" && !alts.iter().any(|item| item == alt) { @@ -859,6 +1040,173 @@ fn validate_findings(root: &Value, issues: &mut Vec) { message: "finding has neither summary nor notes".to_owned(), }); } + validate_finding_binding(&format!("findings[{idx}]"), mapping, issues); + validate_finding_effects(idx, mapping, issues); + } +} + +fn validate_finding_effects(idx: usize, mapping: &Mapping, issues: &mut Vec) { + let Some(effects) = mapping.get(Value::String("effects".to_owned())) else { + return; + }; + let Some(effects) = 
effects.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].effects"), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (effect_idx, effect) in effects.iter().enumerate() { + let Some(effect) = effect.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].effects[{effect_idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_finding_binding( + &format!("findings[{idx}].effects[{effect_idx}]"), + effect, + issues, + ); + } +} + +fn validate_finding_binding(parent: &str, mapping: &Mapping, issues: &mut Vec) { + let Some(binding) = mapping.get(Value::String("binding".to_owned())) else { + return; + }; + let Some(binding) = binding.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_required_mapping_string(binding, "source", &format!("{parent}.binding"), issues); + let source = binding + .get(Value::String("source".to_owned())) + .and_then(Value::as_str); + match source { + Some("variant") => { + if !binding.contains_key(Value::String("variant".to_owned())) + && !binding.contains_key(Value::String("path".to_owned())) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.variant"), + message: "variant findings require variant or path".to_owned(), + }); + } + } + Some("analysis") => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + validate_required_mapping_string( + binding, + "analysis_id", + &format!("{parent}.binding"), + issues, + ); + } + Some(other) => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.source"), + message: format!("unsupported source '{other}'"), + }), + None => {} + } + + let operator = binding + .get(Value::String("operator".to_owned())) + 
.and_then(Value::as_str) + .unwrap_or("equals"); + match operator { + "equals" => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + if !binding.contains_key(Value::String("value".to_owned())) { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.value"), + message: "equals requires value".to_owned(), + }); + } + } + "in" => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + let values = binding + .get(Value::String("values".to_owned())) + .and_then(Value::as_sequence); + if values.is_none_or(Vec::is_empty) { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.values"), + message: "in requires non-empty values".to_owned(), + }); + } + } + "dosage_equals" => { + if binding + .get(Value::String("allele".to_owned())) + .and_then(Value::as_str) + .is_none_or(|value| value.trim().is_empty()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.allele"), + message: "dosage_equals requires allele".to_owned(), + }); + } + if binding + .get(Value::String("value".to_owned())) + .and_then(Value::as_i64) + .is_none_or(|value| !(0..=2).contains(&value)) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.value"), + message: "dosage_equals requires integer value 0, 1, or 2".to_owned(), + }); + } + } + "dosage_in" => { + if binding + .get(Value::String("allele".to_owned())) + .and_then(Value::as_str) + .is_none_or(|value| value.trim().is_empty()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.allele"), + message: "dosage_in requires allele".to_owned(), + }); + } + let values = binding + .get(Value::String("values".to_owned())) + .and_then(Value::as_sequence); + let invalid_values = match values { + Some(items) if !items.is_empty() => items + .iter() + .any(|value| value.as_i64().is_none_or(|n| !(0..=2).contains(&n))), 
+ _ => true, + }; + if invalid_values { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.values"), + message: "dosage_in requires integer values from 0 to 2".to_owned(), + }); + } + } + other => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.operator"), + message: format!( + "unsupported operator '{other}'; expected 'equals', 'in', 'dosage_equals', or 'dosage_in'" + ), + }), } } @@ -1035,7 +1383,7 @@ fn validate_downloads(root: &Value, issues: &mut Vec) { } } -fn validate_panel_members(root: &Value, issues: &mut Vec) { +fn validate_panel_members(root: &Value, allowed_kinds: &[&str], issues: &mut Vec) { let Some(members) = value_at(root, &["members"]).and_then(Value::as_sequence) else { issues.push(Issue { severity: Severity::Error, @@ -1064,7 +1412,7 @@ fn validate_panel_members(root: &Value, issues: &mut Vec) { }); continue; }; - validate_panel_member(idx, mapping, &download_ids, issues); + validate_panel_member(idx, mapping, allowed_kinds, &download_ids, issues); } } @@ -1085,6 +1433,7 @@ fn panel_download_ids(root: &Value) -> BTreeSet { fn validate_panel_member( idx: usize, mapping: &Mapping, + allowed_kinds: &[&str], download_ids: &BTreeSet, issues: &mut Vec, ) { @@ -1092,13 +1441,11 @@ fn validate_panel_member( .get(Value::String("kind".to_owned())) .and_then(Value::as_str); match kind { - Some("variant") => {} + Some(kind) if allowed_kinds.contains(&kind) => {} Some(other) => issues.push(Issue { severity: Severity::Error, path: format!("members[{idx}].kind"), - message: format!( - "unsupported member kind '{other}'; panel support is currently variant-only" - ), + message: format!("unsupported member kind '{other}'"), }), None => issues.push(Issue { severity: Severity::Error, @@ -1186,13 +1533,258 @@ fn validate_panel_member_metadata(idx: usize, mapping: &Mapping, issues: &mut Ve } } +fn validate_panel_interpretations(root: &Value, issues: &mut Vec) { + if value_at(root, 
&["analyses"]).is_some() && value_at(root, &["interpretations"]).is_some() { + issues.push(Issue { + severity: Severity::Warning, + path: "interpretations".to_owned(), + message: "use analyses instead of interpretations; do not define both".to_owned(), + }); + } + let key = if value_at(root, &["analyses"]).is_some() { + "analyses" + } else { + "interpretations" + }; + let Some(items) = value_at(root, &[key]) else { + return; + }; + let Some(items) = items.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: key.to_owned(), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_panel_interpretation(key, idx, mapping, issues); + } +} + +fn validate_panel_interpretation( + key: &str, + idx: usize, + mapping: &Mapping, + issues: &mut Vec, +) { + for field in ["id", "kind", "path"] { + validate_required_mapping_string(mapping, field, &format!("{key}[{idx}]"), issues); + } + if let Some(kind) = mapping + .get(Value::String("kind".to_owned())) + .and_then(Value::as_str) + && kind != "bioscript" + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].kind"), + message: "expected 'bioscript'".to_owned(), + }); + } + if let Some(output_format) = mapping + .get(Value::String("output_format".to_owned())) + .and_then(Value::as_str) + && !matches!(output_format, "tsv" | "json" | "jsonl") + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].output_format"), + message: "expected 'tsv', 'json', or 'jsonl'".to_owned(), + }); + } + let Some(derived_from) = mapping + .get(Value::String("derived_from".to_owned())) + .and_then(Value::as_sequence) + else { + issues.push(Issue { + severity: Severity::Error, + path: 
format!("{key}[{idx}].derived_from"), + message: "expected a non-empty sequence of strings".to_owned(), + }); + return; + }; + if derived_from.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from"), + message: "expected at least one source variant".to_owned(), + }); + } + for (source_idx, source) in derived_from.iter().enumerate() { + match source.as_str() { + Some(value) if !value.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from[{source_idx}]"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from[{source_idx}]"), + message: "expected string".to_owned(), + }), + } + } + validate_panel_interpretation_logic(key, idx, mapping, issues); + validate_panel_interpretation_emits(key, idx, mapping, issues); +} + +fn validate_panel_interpretation_logic( + key: &str, + idx: usize, + mapping: &Mapping, + issues: &mut Vec, +) { + let Some(logic) = mapping.get(Value::String("logic".to_owned())) else { + return; + }; + let Some(logic) = logic.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].logic"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_optional_mapping_string(logic, "description", &format!("{key}[{idx}].logic"), issues); + let Some(source) = logic.get(Value::String("source".to_owned())) else { + return; + }; + let Some(source) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].logic.source"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_optional_mapping_string( + source, + "name", + &format!("{key}[{idx}].logic.source"), + issues, + ); + validate_optional_mapping_string(source, "url", &format!("{key}[{idx}].logic.source"), issues); + if let Some(url) = source + 
.get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + validate_url_string( + url, + &format!("{key}[{idx}].logic.source.url"), + false, + issues, + ); + } +} + +fn validate_panel_interpretation_emits( + key: &str, + idx: usize, + mapping: &Mapping, + issues: &mut Vec, +) { + let Some(emits) = mapping.get(Value::String("emits".to_owned())) else { + return; + }; + let Some(emits) = emits.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].emits"), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (emit_idx, item) in emits.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].emits[{emit_idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_required_mapping_string( + mapping, + "key", + &format!("{key}[{idx}].emits[{emit_idx}]"), + issues, + ); + for field in ["label", "value_type", "format"] { + validate_optional_mapping_string( + mapping, + field, + &format!("{key}[{idx}].emits[{emit_idx}]"), + issues, + ); + } + } +} + +fn validate_required_mapping_string( + mapping: &Mapping, + field: &str, + parent: &str, + issues: &mut Vec, +) { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(value) if !value.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "missing required field".to_owned(), + }), + } +} + +fn validate_optional_mapping_string( + mapping: &Mapping, + field: &str, + parent: &str, + issues: &mut Vec, +) { + if let Some(value) = mapping.get(Value::String(field.to_owned())) { + match value.as_str() { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue 
{ + severity: Severity::Warning, + path: format!("{parent}.{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "expected string".to_owned(), + }), + } + } +} + fn variant_spec_from_root(root: &Value) -> Result { let rsids = seq_of_strings(root, &["identifiers", "rsids"]).unwrap_or_default(); let grch37 = locus_from_root(root, "grch37")?; let grch38 = locus_from_root(root, "grch38")?; let reference = scalar_at(root, &["alleles", "ref"]); - let alternate = - seq_of_strings(root, &["alleles", "alts"]).and_then(|alts| alts.first().cloned()); + let alternate = seq_of_strings(root, &["alleles", "observed_alts"]) + .or_else(|| seq_of_strings(root, &["alleles", "alts"])) + .and_then(|alts| alts.first().cloned()); let deletion_length = value_at(root, &["alleles", "deletion_length"]) .and_then(Value::as_u64) .and_then(|value| usize::try_from(value).ok()); @@ -1298,6 +1890,136 @@ fn parse_panel_members(root: &Value) -> Result, String> { Ok(members) } +fn parse_panel_interpretations(root: &Value) -> Result, String> { + let mut interpretations = Vec::new(); + let key = if value_at(root, &["analyses"]).is_some() { + "analyses" + } else { + "interpretations" + }; + let Some(items) = value_at(root, &[key]).and_then(Value::as_sequence) else { + return Ok(interpretations); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("{key}[{idx}] must be a mapping")); + }; + interpretations.push(PanelInterpretation { + id: mapping_required_string(mapping, "id", idx, key)?, + kind: mapping_required_string(mapping, "kind", idx, key)?, + path: mapping_required_string(mapping, "path", idx, key)?, + output_format: mapping + .get(Value::String("output_format".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + derived_from: mapping_sequence_of_strings(mapping, "derived_from", idx, key)?, + emits: 
parse_panel_interpretation_emits(mapping, idx)?, + logic: parse_panel_interpretation_logic(mapping)?, + }); + } + Ok(interpretations) +} + +fn parse_panel_interpretation_logic( + mapping: &Mapping, +) -> Result, String> { + let Some(logic) = mapping.get(Value::String("logic".to_owned())) else { + return Ok(None); + }; + let Some(logic_mapping) = logic.as_mapping() else { + return Err("analysis logic must be a mapping".to_owned()); + }; + let source = match logic_mapping.get(Value::String("source".to_owned())) { + Some(source) => { + let Some(source_mapping) = source.as_mapping() else { + return Err("analysis logic.source must be a mapping".to_owned()); + }; + Some(PanelInterpretationLogicSource { + name: source_mapping + .get(Value::String("name".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + url: source_mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }) + } + None => None, + }; + Ok(Some(PanelInterpretationLogic { + source, + description: logic_mapping + .get(Value::String("description".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + })) +} + +fn parse_panel_interpretation_emits( + mapping: &Mapping, + interpretation_idx: usize, +) -> Result, String> { + let Some(items) = mapping + .get(Value::String("emits".to_owned())) + .and_then(Value::as_sequence) + else { + return Ok(Vec::new()); + }; + let mut emits = Vec::new(); + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!( + "interpretations[{interpretation_idx}].emits[{idx}] must be a mapping" + )); + }; + emits.push(PanelInterpretationEmit { + key: mapping_required_string( + mapping, + "key", + idx, + &format!("interpretations[{interpretation_idx}].emits"), + )?, + label: mapping + .get(Value::String("label".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + value_type: mapping + .get(Value::String("value_type".to_owned())) + 
.and_then(Value::as_str) + .map(ToOwned::to_owned), + format: mapping + .get(Value::String("format".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }); + } + Ok(emits) +} + +fn mapping_sequence_of_strings( + mapping: &Mapping, + field: &str, + idx: usize, + parent: &str, +) -> Result, String> { + let value = mapping + .get(Value::String(field.to_owned())) + .ok_or_else(|| format!("{parent}[{idx}].{field} is required"))?; + let items = value + .as_sequence() + .ok_or_else(|| format!("{parent}[{idx}].{field} must be a sequence"))?; + items + .iter() + .enumerate() + .map(|(item_idx, item)| { + item.as_str() + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{parent}[{idx}].{field}[{item_idx}] must be a string")) + }) + .collect() +} + fn mapping_required_string( mapping: &Mapping, field: &str, diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index 32f0563..82a0fa0 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -6,7 +6,8 @@ use std::{ use bioscript_schema::{ RemoteResourceKind, load_variant_manifest_text, load_variant_manifest_text_for_lookup, - resolve_remote_resource_text, validate_panels_path, validate_variants_path, + resolve_remote_resource_text, validate_assays_path, validate_panels_path, + validate_variants_path, }; fn temp_dir(label: &str) -> PathBuf { @@ -88,7 +89,7 @@ findings: - schema: "bioscript:trait:1.0" alt: "A" summary: "Example finding" - - schema: "bioscript:pgx:1.0" + - schema: "bioscript:pgx-summary:1.0" alt: "*" summary: "Example multiallelic finding" provenance: @@ -213,6 +214,21 @@ members: - kind: "variant" path: "variants/rs671.yaml" version: "1.0" + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" +interpretations: + - id: "taste_status" + kind: "bioscript" + path: "interpretations/taste.py" + label: "Taste status" + derived_from: + - "variants/rs671.yaml" + emits: + 
- key: "taste_status" + label: "Taste status" + value_type: "string" + format: "badge" "#, ) .unwrap(); @@ -222,6 +238,42 @@ members: assert_eq!(report.total_warnings(), 0); } +#[test] +fn validate_assays_accepts_variant_members_and_interpretations() { + let dir = temp_dir("validate-assay-ok"); + let fixture = dir.join("assay.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:assay:1.0" +version: "1.0" +name: "APOL1" +tags: + - "type:risk" +members: + - kind: "variant" + path: "g1-site-1.yaml" + version: "1.0" +interpretations: + - id: "apol1_status" + kind: "bioscript" + path: "apol1.py" + derived_from: + - "g1-site-1.yaml" + emits: + - key: "apol1_status" + label: "APOL1 status" + value_type: "string" + format: "badge" +"#, + ) + .unwrap(); + + let report = validate_assays_path(&fixture).unwrap(); + assert_eq!(report.total_errors(), 0); + assert_eq!(report.total_warnings(), 0); +} + #[test] fn remote_resource_resolution_detects_panel_members() { let text = r#" From 4b9089284a54f53ce2b7784eb998919b86e05a67 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 7 May 2026 09:17:13 +1000 Subject: [PATCH 2/4] refactored large files for report and schema --- AGENTS.md | 10 +- rust/bioscript-cli/src/cli_bootstrap.rs | 414 +++ rust/bioscript-cli/src/cli_commands.rs | 257 ++ rust/bioscript-cli/src/main.rs | 2988 +---------------- rust/bioscript-cli/src/manifest_runner.rs | 424 +++ rust/bioscript-cli/src/report_execution.rs | 309 ++ rust/bioscript-cli/src/report_html.rs | 3 + rust/bioscript-cli/src/report_html_helpers.rs | 98 + rust/bioscript-cli/src/report_html_pgx.rs | 218 ++ .../bioscript-cli/src/report_html_sections.rs | 268 ++ rust/bioscript-cli/src/report_matching.rs | 292 ++ rust/bioscript-cli/src/report_observations.rs | 391 +++ rust/bioscript-cli/src/report_options.rs | 225 ++ rust/bioscript-cli/src/report_output.rs | 165 + rust/bioscript-schema/src/validator.rs | 2207 +----------- .../bioscript-schema/src/validator_alleles.rs | 146 + 
.../src/validator_alleles_findings.rs | 3 + .../src/validator_findings.rs | 246 ++ .../bioscript-schema/src/validator_helpers.rs | 161 + rust/bioscript-schema/src/validator_load.rs | 347 ++ rust/bioscript-schema/src/validator_panel.rs | 450 +++ rust/bioscript-schema/src/validator_parse.rs | 201 ++ .../src/validator_resources.rs | 173 + rust/bioscript-schema/src/validator_roots.rs | 334 ++ rust/bioscript-schema/src/validator_types.rs | 180 + 25 files changed, 5334 insertions(+), 5176 deletions(-) create mode 100644 rust/bioscript-cli/src/cli_bootstrap.rs create mode 100644 rust/bioscript-cli/src/cli_commands.rs create mode 100644 rust/bioscript-cli/src/manifest_runner.rs create mode 100644 rust/bioscript-cli/src/report_execution.rs create mode 100644 rust/bioscript-cli/src/report_html.rs create mode 100644 rust/bioscript-cli/src/report_html_helpers.rs create mode 100644 rust/bioscript-cli/src/report_html_pgx.rs create mode 100644 rust/bioscript-cli/src/report_html_sections.rs create mode 100644 rust/bioscript-cli/src/report_matching.rs create mode 100644 rust/bioscript-cli/src/report_observations.rs create mode 100644 rust/bioscript-cli/src/report_options.rs create mode 100644 rust/bioscript-cli/src/report_output.rs create mode 100644 rust/bioscript-schema/src/validator_alleles.rs create mode 100644 rust/bioscript-schema/src/validator_alleles_findings.rs create mode 100644 rust/bioscript-schema/src/validator_findings.rs create mode 100644 rust/bioscript-schema/src/validator_helpers.rs create mode 100644 rust/bioscript-schema/src/validator_load.rs create mode 100644 rust/bioscript-schema/src/validator_panel.rs create mode 100644 rust/bioscript-schema/src/validator_parse.rs create mode 100644 rust/bioscript-schema/src/validator_resources.rs create mode 100644 rust/bioscript-schema/src/validator_roots.rs create mode 100644 rust/bioscript-schema/src/validator_types.rs diff --git a/AGENTS.md b/AGENTS.md index d3fc05f..ac2c44f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ 
-5,6 +5,11 @@ Keep first-party production Rust source files at or below 500 lines. This applies to files under `rust/bioscript-*/src/**/*.rs`. +When editing BioScript Rust, prefer adding behavior to a small, named module +whose filename describes the responsibility. If a file is approaching 500 lines, +split it along a real domain boundary before adding more code. Do not satisfy +the guard by creating arbitrary numbered chunks or `*_part_*` files. + The 500-line rule does not apply to: - integration tests and unit-test modules @@ -16,6 +21,5 @@ production limit measures production code, not test scaffolding. Test files should still be split when they mix unrelated behavior or become hard to scan. When a production file grows past 500 lines, split it before adding more -behavior. Temporary exceptions must be listed in this file under -`Current Refactor Backlog`; the source-size guard reads that list and fails when -it drifts from the code. +behavior. Keep the include list in the parent file short and logical, and leave +file names meaningful enough that future agents can find the right place to edit. 
diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs b/rust/bioscript-cli/src/cli_bootstrap.rs new file mode 100644 index 0000000..5337229 --- /dev/null +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -0,0 +1,414 @@ +use std::{ + collections::BTreeMap, + env, + fmt::Write as _, + fs, + path::{Path, PathBuf}, + process::ExitCode, + time::{Duration, Instant}, +}; + +use bioscript_formats::{ + GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InspectOptions, PrepareRequest, + inspect_file, prepare_indexes, shell_flags, +}; +use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; +use bioscript_schema::{ + AssayManifest, PanelInterpretation, PanelManifest, VariantManifest, load_assay_manifest, + load_panel_manifest, load_variant_manifest, validate_assays_path, validate_panels_path, + validate_variants_path, +}; +use monty::ResourceLimits; + +fn main() -> ExitCode { + match run_cli() { + Ok(()) => ExitCode::SUCCESS, + Err(err) => { + eprintln!("bioscript: {err}"); + ExitCode::FAILURE + } + } +} + +fn run_cli() -> Result<(), String> { + let args: Vec = env::args().skip(1).collect(); + if dispatch_subcommand(&args)? 
{ + return Ok(()); + } + + let mut options = parse_cli_options(args)?; + let script_path = options.script_path.clone().ok_or_else(|| USAGE.to_owned())?; + let runtime_root = options + .root + .clone() + .map_or_else(env::current_dir, Ok) + .map_err(|err| format!("failed to get current directory: {err}"))?; + normalize_loader_paths(&runtime_root, &mut options.loader); + let mut cli_timings = prepare_cli_indexes(&runtime_root, &mut options)?; + + if is_yaml_manifest(&script_path) { + run_cli_manifest(&runtime_root, &script_path, &options, &mut cli_timings)?; + } else { + run_cli_script(&script_path, options, cli_timings)?; + } + Ok(()) +} + +const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] 
--output-dir [--html] [--root ] [--input-format auto|text|zip|vcf|cram]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]"; + +struct CliOptions { + script_path: Option, + root: Option, + input_file: Option, + output_file: Option, + participant_id: Option, + trace_report: Option, + timing_report: Option, + filters: Vec, + auto_index: bool, + cache_dir: Option, + loader: GenotypeLoadOptions, + limits: ResourceLimits, +} + +fn dispatch_subcommand(args: &[String]) -> Result { + let Some((first, rest)) = args.split_first() else { + return Ok(false); + }; + let rest = rest.to_vec(); + match first.as_str() { + "report" => run_app_report(rest).map(|()| true), + "validate-variants" => run_validate_variants(rest).map(|()| true), + "validate-panels" => run_validate_panels(rest).map(|()| true), + "validate-assays" => run_validate_assays(rest).map(|()| true), + "prepare" => run_prepare(rest).map(|()| true), + "inspect" => run_inspect(rest).map(|()| true), + _ => Ok(false), + } +} + +fn parse_cli_options(args: Vec) -> Result { + let mut args = args.into_iter(); + let mut options = default_cli_options(); + while let Some(arg) = args.next() { + parse_cli_arg(arg, &mut args, &mut options)?; + } + Ok(options) +} + +fn default_cli_options() -> CliOptions { + CliOptions { + script_path: None, + root: None, + input_file: None, + output_file: None, + participant_id: None, + trace_report: None, + timing_report: None, + filters: Vec::new(), + auto_index: false, + cache_dir: None, + loader: GenotypeLoadOptions::default(), + limits: ResourceLimits::new() + .max_duration(Duration::from_millis(100)) + .max_memory(8 * 1024 * 1024) + .max_allocations(200_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)), + } +} + +fn 
parse_cli_arg( + arg: String, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result<(), String> { + if parse_cli_path_arg(&arg, args, options)? { + return Ok(()); + } + if parse_cli_loader_arg(&arg, args, options)? { + return Ok(()); + } + if parse_cli_limit_arg(&arg, args, options)? { + return Ok(()); + } + if arg == "--auto-index" { + options.auto_index = true; + } else if options.script_path.is_none() { + options.script_path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + Ok(()) +} + +fn parse_cli_path_arg( + arg: &str, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result { + if arg == "--root" { + let Some(value) = args.next() else { + return Err("--root requires a directory".to_owned()); + }; + options.root = Some(PathBuf::from(value)); + } else if arg == "--input-file" { + let Some(value) = args.next() else { + return Err("--input-file requires a path".to_owned()); + }; + options.input_file = Some(value); + } else if arg == "--output-file" { + let Some(value) = args.next() else { + return Err("--output-file requires a path".to_owned()); + }; + options.output_file = Some(value); + } else if arg == "--participant-id" { + let Some(value) = args.next() else { + return Err("--participant-id requires a value".to_owned()); + }; + options.participant_id = Some(value); + } else if arg == "--trace-report" { + let Some(value) = args.next() else { + return Err("--trace-report requires a path".to_owned()); + }; + options.trace_report = Some(PathBuf::from(value)); + } else if arg == "--timing-report" { + let Some(value) = args.next() else { + return Err("--timing-report requires a path".to_owned()); + }; + options.timing_report = Some(PathBuf::from(value)); + } else if arg == "--filter" { + let Some(value) = args.next() else { + return Err("--filter requires key=value".to_owned()); + }; + options.filters.push(value); + } else if arg == "--cache-dir" { + let Some(value) = args.next() else { + 
return Err("--cache-dir requires a path".to_owned()); + }; + options.cache_dir = Some(PathBuf::from(value)); + } else { + return Ok(false); + } + Ok(true) +} + +fn parse_cli_loader_arg( + arg: &str, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result { + if arg == "--input-format" { + let Some(value) = args.next() else { + return Err("--input-format requires a value".to_owned()); + }; + if value.eq_ignore_ascii_case("auto") { + options.loader.format = None; + } else { + let parsed = value + .parse::() + .map_err(|err| format!("invalid --input-format value {value}: {err}"))?; + options.loader.format = Some(parsed); + } + } else if arg == "--input-index" { + let Some(value) = args.next() else { + return Err("--input-index requires a path".to_owned()); + }; + options.loader.input_index = Some(PathBuf::from(value)); + } else if arg == "--reference-file" { + let Some(value) = args.next() else { + return Err("--reference-file requires a path".to_owned()); + }; + options.loader.reference_file = Some(PathBuf::from(value)); + } else if arg == "--reference-index" { + let Some(value) = args.next() else { + return Err("--reference-index requires a path".to_owned()); + }; + options.loader.reference_index = Some(PathBuf::from(value)); + } else { + return Ok(false); + } + Ok(true) +} + +fn parse_cli_limit_arg( + arg: &str, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result { + if arg == "--max-duration-ms" { + let Some(value) = args.next() else { + return Err("--max-duration-ms requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid --max-duration-ms value {value}: {err}"))?; + options.limits = options.limits.clone().max_duration(Duration::from_millis(parsed)); + } else if arg == "--max-memory-bytes" { + let Some(value) = args.next() else { + return Err("--max-memory-bytes requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid 
--max-memory-bytes value {value}: {err}"))?; + options.limits = options.limits.clone().max_memory(parsed); + } else if arg == "--max-allocations" { + let Some(value) = args.next() else { + return Err("--max-allocations requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid --max-allocations value {value}: {err}"))?; + options.limits = options.limits.clone().max_allocations(parsed); + } else if arg == "--max-recursion-depth" { + let Some(value) = args.next() else { + return Err("--max-recursion-depth requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid --max-recursion-depth value {value}: {err}"))?; + options.limits = options.limits.clone().max_recursion_depth(Some(parsed)); + } else { + return Ok(false); + } + Ok(true) +} + +fn prepare_cli_indexes( + runtime_root: &Path, + options: &mut CliOptions, +) -> Result, String> { + let mut cli_timings: Vec = Vec::new(); + if options.auto_index { + let auto_index_started = Instant::now(); + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let effective_cache = options + .cache_dir + .clone() + .unwrap_or_else(|| cwd.join(".bioscript-cache")); + let request = PrepareRequest { + root: runtime_root.to_path_buf(), + cwd, + cache_dir: effective_cache, + input_file: options.input_file.clone(), + input_format: options.loader.format, + reference_file: options + .loader + .reference_file + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + }; + let prepared = prepare_indexes(&request)?; + if let Some(idx) = prepared.input_index + && options.loader.input_index.is_none() + { + eprintln!("bioscript: auto-indexed input -> {}", idx.display()); + options.loader.input_index = Some(idx); + } + if let Some(ref_file) = prepared.reference_file { + options.loader.reference_file = Some(ref_file); + } + if let Some(ref_idx) = prepared.reference_index + && options.loader.reference_index.is_none() + { + 
eprintln!("bioscript: auto-indexed reference -> {}", ref_idx.display()); + options.loader.reference_index = Some(ref_idx); + } + cli_timings.push(StageTiming { + stage: "auto_index".to_owned(), + duration_ms: auto_index_started.elapsed().as_millis(), + detail: "prepare_indexes".to_owned(), + }); + } + Ok(cli_timings) +} + +fn run_cli_manifest( + runtime_root: &Path, + script_path: &Path, + options: &CliOptions, + cli_timings: &mut Vec, +) -> Result<(), String> { + let manifest_started = Instant::now(); + let manifest_options = ManifestRunOptions { + input_file: options.input_file.as_deref(), + output_file: options.output_file.as_deref(), + participant_id: options.participant_id.as_deref(), + trace_report: options.trace_report.as_deref(), + loader: &options.loader, + filters: &options.filters, + }; + run_manifest(runtime_root, script_path, &manifest_options)?; + cli_timings.push(StageTiming { + stage: "manifest_run".to_owned(), + duration_ms: manifest_started.elapsed().as_millis(), + detail: script_path.display().to_string(), + }); + if let Some(timing_path) = &options.timing_report { + write_timing_report(timing_path, cli_timings)?; + } + Ok(()) +} + +fn run_cli_script( + script_path: &Path, + options: CliOptions, + cli_timings: Vec, +) -> Result<(), String> { + let runtime_root = options + .root + .map_or_else(env::current_dir, Ok) + .map_err(|err| format!("failed to get current directory: {err}"))?; + let runtime = BioscriptRuntime::with_config( + runtime_root, + RuntimeConfig { + limits: options.limits, + loader: options.loader, + }, + ) + .map_err(|err| err.to_string())?; + let mut inputs = Vec::new(); + if let Some(input_file) = options.input_file { + inputs.push(("input_file", monty::MontyObject::String(input_file))); + } + if let Some(output_file) = options.output_file { + inputs.push(("output_file", monty::MontyObject::String(output_file))); + } + if let Some(participant_id) = options.participant_id { + inputs.push(("participant_id", 
monty::MontyObject::String(participant_id))); + } + + runtime + .run_file(script_path, options.trace_report.as_deref(), inputs) + .map_err(|err| err.to_string())?; + if let Some(timing_path) = options.timing_report { + let mut all_timings = cli_timings; + all_timings.extend(runtime.timing_snapshot()); + write_timing_report(&timing_path, &all_timings)?; + } + Ok(()) +} + +fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!( + "failed to create timing report dir {}: {err}", + parent.display() + ) + })?; + } + let mut output = String::from("stage\tduration_ms\tdetail\n"); + for timing in timings { + let _ = writeln!( + output, + "{}\t{}\t{}", + timing.stage, + timing.duration_ms, + timing.detail.replace('\t', " ") + ); + } + fs::write(path, output) + .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) +} + diff --git a/rust/bioscript-cli/src/cli_commands.rs b/rust/bioscript-cli/src/cli_commands.rs new file mode 100644 index 0000000..2b5e577 --- /dev/null +++ b/rust/bioscript-cli/src/cli_commands.rs @@ -0,0 +1,257 @@ +fn run_prepare(args: Vec) -> Result<(), String> { + let mut root: Option = None; + let mut input_file: Option = None; + let mut reference_file: Option = None; + let mut input_format: Option = None; + let mut cache_dir: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--root" => { + root = Some(PathBuf::from( + iter.next().ok_or("--root requires a directory")?, + )); + } + "--input-file" => { + input_file = Some(iter.next().ok_or("--input-file requires a path")?); + } + "--reference-file" => { + reference_file = Some(iter.next().ok_or("--reference-file requires a path")?); + } + "--input-format" => { + let value = iter.next().ok_or("--input-format requires a value")?; + if !value.eq_ignore_ascii_case("auto") { + input_format = 
Some( + value + .parse::() + .map_err(|err| format!("invalid --input-format: {err}"))?, + ); + } + } + "--cache-dir" => { + cache_dir = Some(PathBuf::from( + iter.next().ok_or("--cache-dir requires a path")?, + )); + } + other => { + return Err(format!("unexpected argument: {other}")); + } + } + } + + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let effective_root = root.unwrap_or_else(|| cwd.clone()); + let effective_cache = cache_dir.unwrap_or_else(|| cwd.join(".bioscript-cache")); + + let request = PrepareRequest { + root: effective_root, + cwd, + cache_dir: effective_cache, + input_file, + input_format, + reference_file, + }; + + let prepared = prepare_indexes(&request)?; + + // print the flags that should be passed to a subsequent bioscript run + let flags = shell_flags(&prepared); + if flags.is_empty() { + eprintln!("bioscript prepare: nothing to index"); + } else { + println!("{flags}"); + } + + Ok(()) +} + +fn run_inspect(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut options = InspectOptions::default(); + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--input-index" => { + options.input_index = Some(PathBuf::from( + iter.next().ok_or("--input-index requires a path")?, + )); + } + "--reference-file" => { + options.reference_file = Some(PathBuf::from( + iter.next().ok_or("--reference-file requires a path")?, + )); + } + "--reference-index" => { + options.reference_index = Some(PathBuf::from( + iter.next().ok_or("--reference-index requires a path")?, + )); + } + other if path.is_none() => { + path = Some(PathBuf::from(other)); + } + other => { + return Err(format!("unexpected argument: {other}")); + } + } + } + + let Some(path) = path else { + return Err( + "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + .to_owned(), + ); + }; + + let inspection = inspect_file(&path, &options).map_err(|err| 
err.to_string())?; + println!("{}", inspection.render_text()); + Ok(()) +} + +fn run_validate_variants(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-variants [--report ]".to_owned()); + }; + + let report = validate_variants_path(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} + +fn run_validate_panels(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-panels [--report ]".to_owned()); + }; + + let report = validate_panels_path(&path)?; + let text = 
report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} + +fn is_yaml_manifest(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| matches!(ext, "yaml" | "yml")) +} + +fn run_validate_assays(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-assays [--report ]".to_owned()); + }; + + let report = validate_assays_path(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} diff --git a/rust/bioscript-cli/src/main.rs 
b/rust/bioscript-cli/src/main.rs index e987509..7e0e20a 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -1,2976 +1,12 @@ -use std::{ - collections::BTreeMap, - env, - fmt::Write as _, - fs, - path::{Path, PathBuf}, - process::ExitCode, - time::{Duration, Instant}, -}; - -use bioscript_formats::{ - GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InspectOptions, PrepareRequest, - inspect_file, prepare_indexes, shell_flags, -}; -use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; -use bioscript_schema::{ - AssayManifest, PanelInterpretation, PanelManifest, VariantManifest, load_assay_manifest, - load_panel_manifest, load_variant_manifest, validate_assays_path, validate_panels_path, - validate_variants_path, -}; -use monty::ResourceLimits; - -fn main() -> ExitCode { - match run_cli() { - Ok(()) => ExitCode::SUCCESS, - Err(err) => { - eprintln!("bioscript: {err}"); - ExitCode::FAILURE - } - } -} - -#[allow(clippy::too_many_lines)] -fn run_cli() -> Result<(), String> { - let mut args = env::args().skip(1); - if let Some(first) = args.next() { - if first == "report" { - return run_app_report(args.collect()); - } - if first == "validate-variants" { - return run_validate_variants(args.collect()); - } - if first == "validate-panels" { - return run_validate_panels(args.collect()); - } - if first == "validate-assays" { - return run_validate_assays(args.collect()); - } - if first == "prepare" { - return run_prepare(args.collect()); - } - if first == "inspect" { - return run_inspect(args.collect()); - } - } - - let mut args = env::args().skip(1); - let mut script_path: Option = None; - let mut root: Option = None; - let mut input_file: Option = None; - let mut output_file: Option = None; - let mut participant_id: Option = None; - let mut trace_report: Option = None; - let mut timing_report: Option = None; - let mut filters: Vec = Vec::new(); - let mut auto_index = false; - let mut cache_dir: Option = None; - let 
mut loader = GenotypeLoadOptions::default(); - let mut limits = ResourceLimits::new() - .max_duration(Duration::from_millis(100)) - .max_memory(8 * 1024 * 1024) - .max_allocations(200_000) - .gc_interval(1000) - .max_recursion_depth(Some(200)); - - while let Some(arg) = args.next() { - if arg == "--root" { - let Some(value) = args.next() else { - return Err("--root requires a directory".to_owned()); - }; - root = Some(PathBuf::from(value)); - } else if arg == "--input-file" { - let Some(value) = args.next() else { - return Err("--input-file requires a path".to_owned()); - }; - input_file = Some(value); - } else if arg == "--output-file" { - let Some(value) = args.next() else { - return Err("--output-file requires a path".to_owned()); - }; - output_file = Some(value); - } else if arg == "--participant-id" { - let Some(value) = args.next() else { - return Err("--participant-id requires a value".to_owned()); - }; - participant_id = Some(value); - } else if arg == "--trace-report" { - let Some(value) = args.next() else { - return Err("--trace-report requires a path".to_owned()); - }; - trace_report = Some(PathBuf::from(value)); - } else if arg == "--timing-report" { - let Some(value) = args.next() else { - return Err("--timing-report requires a path".to_owned()); - }; - timing_report = Some(PathBuf::from(value)); - } else if arg == "--filter" { - let Some(value) = args.next() else { - return Err("--filter requires key=value".to_owned()); - }; - filters.push(value); - } else if arg == "--input-format" { - let Some(value) = args.next() else { - return Err("--input-format requires a value".to_owned()); - }; - if value.eq_ignore_ascii_case("auto") { - loader.format = None; - } else { - let parsed = value - .parse::() - .map_err(|err| format!("invalid --input-format value {value}: {err}"))?; - loader.format = Some(parsed); - } - } else if arg == "--input-index" { - let Some(value) = args.next() else { - return Err("--input-index requires a path".to_owned()); - }; - 
loader.input_index = Some(PathBuf::from(value)); - } else if arg == "--reference-file" { - let Some(value) = args.next() else { - return Err("--reference-file requires a path".to_owned()); - }; - loader.reference_file = Some(PathBuf::from(value)); - } else if arg == "--reference-index" { - let Some(value) = args.next() else { - return Err("--reference-index requires a path".to_owned()); - }; - loader.reference_index = Some(PathBuf::from(value)); - } else if arg == "--max-duration-ms" { - let Some(value) = args.next() else { - return Err("--max-duration-ms requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-duration-ms value {value}: {err}"))?; - limits = limits.max_duration(Duration::from_millis(parsed)); - } else if arg == "--max-memory-bytes" { - let Some(value) = args.next() else { - return Err("--max-memory-bytes requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-memory-bytes value {value}: {err}"))?; - limits = limits.max_memory(parsed); - } else if arg == "--max-allocations" { - let Some(value) = args.next() else { - return Err("--max-allocations requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-allocations value {value}: {err}"))?; - limits = limits.max_allocations(parsed); - } else if arg == "--auto-index" { - auto_index = true; - } else if arg == "--cache-dir" { - let Some(value) = args.next() else { - return Err("--cache-dir requires a path".to_owned()); - }; - cache_dir = Some(PathBuf::from(value)); - } else if arg == "--max-recursion-depth" { - let Some(value) = args.next() else { - return Err("--max-recursion-depth requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-recursion-depth value {value}: {err}"))?; - limits = limits.max_recursion_depth(Some(parsed)); - } else if script_path.is_none() { - script_path = 
Some(PathBuf::from(arg)); - } else { - return Err(format!("unexpected argument: {arg}")); - } - } - - let Some(script_path) = script_path else { - return Err( - "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--root ] [--input-format auto|text|zip|vcf|cram]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" - .to_owned(), - ); - }; - - let runtime_root = match root { - Some(dir) => dir, - None => { - env::current_dir().map_err(|err| format!("failed to get current directory: {err}"))? 
- } - }; - normalize_loader_paths(&runtime_root, &mut loader); - - // auto-index: detect and build missing indexes for CRAM/BAM/FASTA - let mut cli_timings: Vec = Vec::new(); - if auto_index { - let auto_index_started = Instant::now(); - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let effective_cache = cache_dir - .clone() - .unwrap_or_else(|| cwd.join(".bioscript-cache")); - let request = PrepareRequest { - root: runtime_root.clone(), - cwd: cwd.clone(), - cache_dir: effective_cache, - input_file: input_file.clone(), - input_format: loader.format, - reference_file: loader - .reference_file - .as_ref() - .map(|p| p.to_string_lossy().to_string()), - }; - let prepared = prepare_indexes(&request)?; - if let Some(idx) = prepared.input_index - && loader.input_index.is_none() - { - eprintln!("bioscript: auto-indexed input -> {}", idx.display()); - loader.input_index = Some(idx); - } - if let Some(ref_file) = prepared.reference_file { - loader.reference_file = Some(ref_file); - } - if let Some(ref_idx) = prepared.reference_index - && loader.reference_index.is_none() - { - eprintln!("bioscript: auto-indexed reference -> {}", ref_idx.display()); - loader.reference_index = Some(ref_idx); - } - cli_timings.push(StageTiming { - stage: "auto_index".to_owned(), - duration_ms: auto_index_started.elapsed().as_millis(), - detail: "prepare_indexes".to_owned(), - }); - } - - if is_yaml_manifest(&script_path) { - let manifest_started = Instant::now(); - let manifest_options = ManifestRunOptions { - input_file: input_file.as_deref(), - output_file: output_file.as_deref(), - participant_id: participant_id.as_deref(), - trace_report: trace_report.as_deref(), - loader: &loader, - filters: &filters, - }; - run_manifest(&runtime_root, &script_path, &manifest_options)?; - cli_timings.push(StageTiming { - stage: "manifest_run".to_owned(), - duration_ms: manifest_started.elapsed().as_millis(), - detail: script_path.display().to_string(), - }); - if let 
Some(timing_path) = timing_report { - write_timing_report(&timing_path, &cli_timings)?; - } - return Ok(()); - } - - let runtime = BioscriptRuntime::with_config(runtime_root, RuntimeConfig { limits, loader }) - .map_err(|err| err.to_string())?; - let mut inputs = Vec::new(); - if let Some(input_file) = input_file { - inputs.push(("input_file", monty::MontyObject::String(input_file))); - } - if let Some(output_file) = output_file { - inputs.push(("output_file", monty::MontyObject::String(output_file))); - } - if let Some(participant_id) = participant_id { - inputs.push(("participant_id", monty::MontyObject::String(participant_id))); - } - - runtime - .run_file(&script_path, trace_report.as_deref(), inputs) - .map_err(|err| err.to_string())?; - if let Some(timing_path) = timing_report { - let mut all_timings = cli_timings; - all_timings.extend(runtime.timing_snapshot()); - write_timing_report(&timing_path, &all_timings)?; - } - Ok(()) -} - -fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { - if let Some(parent) = path.parent() { - fs::create_dir_all(parent).map_err(|err| { - format!( - "failed to create timing report dir {}: {err}", - parent.display() - ) - })?; - } - let mut output = String::from("stage\tduration_ms\tdetail\n"); - for timing in timings { - let _ = writeln!( - output, - "{}\t{}\t{}", - timing.stage, - timing.duration_ms, - timing.detail.replace('\t', " ") - ); - } - fs::write(path, output) - .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) -} - -fn run_prepare(args: Vec) -> Result<(), String> { - let mut root: Option = None; - let mut input_file: Option = None; - let mut reference_file: Option = None; - let mut input_format: Option = None; - let mut cache_dir: Option = None; - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - match arg.as_str() { - "--root" => { - root = Some(PathBuf::from( - iter.next().ok_or("--root requires a directory")?, - )); - } - 
"--input-file" => { - input_file = Some(iter.next().ok_or("--input-file requires a path")?); - } - "--reference-file" => { - reference_file = Some(iter.next().ok_or("--reference-file requires a path")?); - } - "--input-format" => { - let value = iter.next().ok_or("--input-format requires a value")?; - if !value.eq_ignore_ascii_case("auto") { - input_format = Some( - value - .parse::() - .map_err(|err| format!("invalid --input-format: {err}"))?, - ); - } - } - "--cache-dir" => { - cache_dir = Some(PathBuf::from( - iter.next().ok_or("--cache-dir requires a path")?, - )); - } - other => { - return Err(format!("unexpected argument: {other}")); - } - } - } - - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let effective_root = root.unwrap_or_else(|| cwd.clone()); - let effective_cache = cache_dir.unwrap_or_else(|| cwd.join(".bioscript-cache")); - - let request = PrepareRequest { - root: effective_root, - cwd, - cache_dir: effective_cache, - input_file, - input_format, - reference_file, - }; - - let prepared = prepare_indexes(&request)?; - - // print the flags that should be passed to a subsequent bioscript run - let flags = shell_flags(&prepared); - if flags.is_empty() { - eprintln!("bioscript prepare: nothing to index"); - } else { - println!("{flags}"); - } - - Ok(()) -} - -fn run_inspect(args: Vec) -> Result<(), String> { - let mut path: Option = None; - let mut options = InspectOptions::default(); - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - match arg.as_str() { - "--input-index" => { - options.input_index = Some(PathBuf::from( - iter.next().ok_or("--input-index requires a path")?, - )); - } - "--reference-file" => { - options.reference_file = Some(PathBuf::from( - iter.next().ok_or("--reference-file requires a path")?, - )); - } - "--reference-index" => { - options.reference_index = Some(PathBuf::from( - iter.next().ok_or("--reference-index requires a path")?, - )); - } - other if path.is_none() 
=> { - path = Some(PathBuf::from(other)); - } - other => { - return Err(format!("unexpected argument: {other}")); - } - } - } - - let Some(path) = path else { - return Err( - "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" - .to_owned(), - ); - }; - - let inspection = inspect_file(&path, &options).map_err(|err| err.to_string())?; - println!("{}", inspection.render_text()); - Ok(()) -} - -fn run_validate_variants(args: Vec) -> Result<(), String> { - let mut path: Option = None; - let mut report_path: Option = None; - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - if arg == "--report" { - let Some(value) = iter.next() else { - return Err("--report requires a path".to_owned()); - }; - report_path = Some(PathBuf::from(value)); - } else if path.is_none() { - path = Some(PathBuf::from(arg)); - } else { - return Err(format!("unexpected argument: {arg}")); - } - } - - let Some(path) = path else { - return Err("usage: bioscript validate-variants [--report ]".to_owned()); - }; - - let report = validate_variants_path(&path)?; - let text = report.render_text(); - print!("{text}"); - - if let Some(report_path) = report_path { - if let Some(parent) = report_path.parent() { - std::fs::create_dir_all(parent).map_err(|err| { - format!("failed to create report dir {}: {err}", parent.display()) - })?; - } - std::fs::write(&report_path, text) - .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; - } - - if report.has_errors() { - return Err(format!( - "validation found {} errors and {} warnings", - report.total_errors(), - report.total_warnings() - )); - } - - Ok(()) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum AppOutputFormat { - Tsv, - Json, - Jsonl, - Both, -} - -struct AppReportOptions { - manifest_path: PathBuf, - input_files: Vec, - output_dir: PathBuf, - root: PathBuf, - html: bool, - observations_format: AppOutputFormat, - reports_format: AppOutputFormat, - loader: 
GenotypeLoadOptions, - filters: Vec, -} - -#[allow(clippy::too_many_lines)] -fn run_app_report(args: Vec) -> Result<(), String> { - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let mut manifest_path: Option = None; - let mut input_files: Vec = Vec::new(); - let mut output_dir: Option = None; - let mut root: Option = None; - let mut html = false; - let mut observations_format = AppOutputFormat::Tsv; - let mut reports_format = AppOutputFormat::Jsonl; - let mut filters = Vec::new(); - let mut loader = GenotypeLoadOptions::default(); - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - match arg.as_str() { - "--input-file" => input_files.push(PathBuf::from( - iter.next().ok_or("--input-file requires a path")?, - )), - "--output-dir" => { - output_dir = Some(PathBuf::from( - iter.next().ok_or("--output-dir requires a path")?, - )); - } - "--root" => { - root = Some(PathBuf::from( - iter.next().ok_or("--root requires a directory")?, - )); - } - "--html" => html = true, - "--filter" => filters.push(iter.next().ok_or("--filter requires key=value")?), - "--observations-format" => { - observations_format = parse_app_output_format( - &iter - .next() - .ok_or("--observations-format requires a value")?, - )?; - } - "--reports-format" => { - reports_format = parse_app_output_format( - &iter.next().ok_or("--reports-format requires a value")?, - )?; - } - "--input-format" => { - let value = iter.next().ok_or("--input-format requires a value")?; - if value.eq_ignore_ascii_case("auto") { - loader.format = None; - } else { - loader.format = - Some(value.parse::().map_err(|err| { - format!("invalid --input-format value {value}: {err}") - })?); - } - } - "--input-index" => { - loader.input_index = Some(PathBuf::from( - iter.next().ok_or("--input-index requires a path")?, - )); - } - "--reference-file" => { - loader.reference_file = Some(PathBuf::from( - iter.next().ok_or("--reference-file requires a path")?, - )); - } - 
"--reference-index" => { - loader.reference_index = Some(PathBuf::from( - iter.next().ok_or("--reference-index requires a path")?, - )); - } - value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), - value => { - if manifest_path.is_none() { - manifest_path = Some(PathBuf::from(value)); - } else { - input_files.push(PathBuf::from(value)); - } - } - } - } - - let Some(manifest_path) = manifest_path else { - return Err("usage: bioscript report --input-file [--input-file ...] --output-dir [--html]".to_owned()); - }; - if input_files.is_empty() { - return Err("bioscript report requires at least one --input-file".to_owned()); - } - let output_dir = output_dir.ok_or("bioscript report requires --output-dir")?; - let root = root.unwrap_or(cwd); - normalize_loader_paths(&root, &mut loader); - - let options = AppReportOptions { - manifest_path: absolutize(&root, &manifest_path), - input_files: input_files - .iter() - .map(|path| absolutize(&root, path)) - .collect(), - output_dir: absolutize(&root, &output_dir), - root, - html, - observations_format, - reports_format, - loader, - filters, - }; - generate_app_report(&options) -} - -fn parse_app_output_format(value: &str) -> Result { - match value { - "tsv" => Ok(AppOutputFormat::Tsv), - "json" => Ok(AppOutputFormat::Json), - "jsonl" => Ok(AppOutputFormat::Jsonl), - "both" => Ok(AppOutputFormat::Both), - other => Err(format!( - "unsupported output format '{other}'; expected tsv, json, jsonl, or both" - )), - } -} - -fn absolutize(root: &Path, path: &Path) -> PathBuf { - if path.is_absolute() { - path.to_path_buf() - } else { - root.join(path) - } -} - -fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { - fs::create_dir_all(&options.output_dir).map_err(|err| { - format!( - "failed to create output dir {}: {err}", - options.output_dir.display() - ) - })?; - - let assay_id = app_assay_id(&options.manifest_path)?; - let findings = load_manifest_findings(&options.root, 
&options.manifest_path)?; - let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; - let mut observations = Vec::new(); - let mut analyses = Vec::new(); - let mut reports = Vec::new(); - - for input_file in &options.input_files { - let participant_id = participant_id_from_path(input_file); - let rows = run_manifest_rows_for_report( - &options.root, - &options.manifest_path, - input_file, - &participant_id, - &options.loader, - &options.filters, - )?; - let input_observations = rows - .iter() - .map(|row| app_observation_from_manifest_row(&options.root, row, &assay_id)) - .collect::, _>>()?; - observations.extend(input_observations.clone()); - let input_analyses = run_manifest_analyses_for_report( - &options.root, - &options.manifest_path, - input_file, - &participant_id, - &options.loader, - &options.output_dir, - )?; - analyses.extend(input_analyses.clone()); - let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); - reports.push(app_report_json( - &assay_id, - &participant_id, - input_file, - &input_observations, - &input_analyses, - &matched_findings, - &provenance, - )); - } - - write_app_observations( - &options.output_dir, - &observations, - options.observations_format, - )?; - write_app_analyses(&options.output_dir, &analyses)?; - write_app_reports(&options.output_dir, &reports, options.reports_format)?; - if options.html { - write_app_html(&options.output_dir, &observations, &reports)?; - } - - println!( - "observations: {}", - options.output_dir.join("observations.tsv").display() - ); - println!( - "analysis: {}", - options.output_dir.join("analysis.jsonl").display() - ); - println!( - "reports: {}", - options.output_dir.join("reports.jsonl").display() - ); - if options.html { - println!("html: {}", options.output_dir.join("index.html").display()); - } - Ok(()) -} - -fn run_manifest_rows_for_report( - runtime_root: &Path, - manifest_path: &Path, - input_file: &Path, - participant_id: 
&str, - loader: &GenotypeLoadOptions, - filters: &[String], -) -> Result>, String> { - let input_text = input_file.display().to_string(); - match manifest_schema(manifest_path)?.as_str() { - "bioscript:variant:1.0" | "bioscript:variant" => { - let manifest = load_variant_manifest(manifest_path)?; - Ok(vec![run_variant_manifest( - runtime_root, - &manifest, - Some(&input_text), - Some(participant_id), - loader, - )?]) - } - "bioscript:panel:1.0" => { - let manifest = load_panel_manifest(manifest_path)?; - run_panel_manifest( - runtime_root, - &manifest, - Some(&input_text), - Some(participant_id), - loader, - filters, - ) - } - "bioscript:assay:1.0" => { - let manifest = load_assay_manifest(manifest_path)?; - run_assay_manifest( - runtime_root, - &manifest, - Some(&input_text), - Some(participant_id), - loader, - filters, - ) - } - other => Err(format!("unsupported manifest schema '{other}'")), - } -} - -fn run_manifest_analyses_for_report( - runtime_root: &Path, - manifest_path: &Path, - input_file: &Path, - participant_id: &str, - loader: &GenotypeLoadOptions, - output_dir: &Path, -) -> Result, String> { - match manifest_schema(manifest_path)?.as_str() { - "bioscript:panel:1.0" => { - let manifest = load_panel_manifest(manifest_path)?; - let mut analyses = Vec::new(); - analyses.extend(run_interpretations_for_report( - runtime_root, - &manifest.path, - &manifest.name, - &manifest.interpretations, - input_file, - participant_id, - loader, - output_dir, - )?); - for member in &manifest.members { - if member.kind != "assay" { - continue; - } - let Some(path) = &member.path else { - continue; - }; - let resolved = resolve_manifest_path(runtime_root, &manifest.path, path)?; - analyses.extend(run_manifest_analyses_for_report( - runtime_root, - &resolved, - input_file, - participant_id, - loader, - output_dir, - )?); - } - Ok(analyses) - } - "bioscript:assay:1.0" => { - let manifest = load_assay_manifest(manifest_path)?; - run_interpretations_for_report( - runtime_root, 
- &manifest.path, - &manifest.name, - &manifest.interpretations, - input_file, - participant_id, - loader, - output_dir, - ) - } - "bioscript:variant:1.0" | "bioscript:variant" => Ok(Vec::new()), - other => Err(format!("unsupported manifest schema '{other}'")), - } -} - -#[allow(clippy::too_many_arguments)] -fn run_interpretations_for_report( - runtime_root: &Path, - manifest_path: &Path, - manifest_name: &str, - interpretations: &[PanelInterpretation], - input_file: &Path, - participant_id: &str, - loader: &GenotypeLoadOptions, - output_dir: &Path, -) -> Result, String> { - let mut outputs = Vec::new(); - for interpretation in interpretations { - if interpretation.kind != "bioscript" { - return Err(format!( - "analysis '{}' uses unsupported kind '{}'", - interpretation.id, interpretation.kind - )); - } - let script_path = resolve_manifest_path(runtime_root, manifest_path, &interpretation.path)?; - let format = interpretation - .output_format - .as_deref() - .unwrap_or("json") - .to_ascii_lowercase(); - let analysis_dir = output_dir.join("analysis").join(participant_id); - fs::create_dir_all(&analysis_dir).map_err(|err| { - format!( - "failed to create analysis output dir {}: {err}", - analysis_dir.display() - ) - })?; - let extension = match format.as_str() { - "tsv" => "tsv", - "json" => "json", - "jsonl" => "jsonl", - other => return Err(format!("unsupported analysis output_format '{other}'")), - }; - let output_file = analysis_dir.join(format!("{}.{}", interpretation.id, extension)); - run_bioscript_analysis_script( - runtime_root, - &script_path, - input_file, - &output_file, - participant_id, - loader, - )?; - let rows = parse_analysis_output(&output_file, &format)?; - outputs.push(serde_json::json!({ - "schema": "bioscript:analysis-output:1.0", - "version": "1.0", - "participant_id": participant_id, - "assay_id": manifest_name, - "analysis_id": interpretation.id, - "kind": interpretation.kind, - "output_format": format, - "manifest_path": 
manifest_path.strip_prefix(runtime_root).unwrap_or(manifest_path).display().to_string(), - "script_path": script_path.strip_prefix(runtime_root).unwrap_or(&script_path).display().to_string(), - "output_file": output_file.strip_prefix(runtime_root).unwrap_or(&output_file).display().to_string(), - "derived_from": interpretation.derived_from.clone(), - "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ - "key": emit.key.clone(), - "label": emit.label.clone(), - "value_type": emit.value_type.clone(), - "format": emit.format.clone(), - })).collect::>(), - "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ - "description": logic.description.clone(), - "source": logic.source.as_ref().map(|source| serde_json::json!({ - "name": source.name.clone(), - "url": source.url.clone(), - })), - })), - "rows": rows, - })); - } - Ok(outputs) -} - -fn run_bioscript_analysis_script( - runtime_root: &Path, - script_path: &Path, - input_file: &Path, - output_file: &Path, - participant_id: &str, - loader: &GenotypeLoadOptions, -) -> Result<(), String> { - let limits = ResourceLimits::new() - .max_duration(Duration::from_millis(1000)) - .max_memory(16 * 1024 * 1024) - .max_allocations(400_000) - .gc_interval(1000) - .max_recursion_depth(Some(200)); - let runtime = BioscriptRuntime::with_config( - runtime_root.to_path_buf(), - RuntimeConfig { - limits, - loader: loader.clone(), - }, - ) - .map_err(|err| err.to_string())?; - runtime - .run_file( - script_path, - None, - vec![ - ( - "input_file", - monty::MontyObject::String(runtime_path_string(runtime_root, input_file)), - ), - ( - "output_file", - monty::MontyObject::String(runtime_path_string(runtime_root, output_file)), - ), - ( - "participant_id", - monty::MontyObject::String(participant_id.to_owned()), - ), - ], - ) - .map(|_| ()) - .map_err(|err| err.to_string()) -} - -fn runtime_path_string(runtime_root: &Path, path: &Path) -> String { - path.strip_prefix(runtime_root) - .unwrap_or(path) - 
.display() - .to_string() -} - -fn parse_analysis_output(path: &Path, format: &str) -> Result, String> { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read analysis output {}: {err}", path.display()))?; - match format { - "tsv" => parse_analysis_tsv(&text), - "json" => { - let value: serde_json::Value = serde_json::from_str(&text).map_err(|err| { - format!("failed to parse analysis JSON {}: {err}", path.display()) - })?; - Ok(match value { - serde_json::Value::Array(rows) => rows, - serde_json::Value::Object(mut object) => object - .remove("rows") - .and_then(|rows| rows.as_array().cloned()) - .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), - other => vec![other], - }) - } - "jsonl" => text - .lines() - .filter(|line| !line.trim().is_empty()) - .map(|line| serde_json::from_str(line).map_err(|err| err.to_string())) - .collect(), - other => Err(format!("unsupported analysis output_format '{other}'")), - } -} - -fn parse_analysis_tsv(text: &str) -> Result, String> { - let mut lines = text.lines().filter(|line| !line.trim().is_empty()); - let Some(header_line) = lines.next() else { - return Ok(Vec::new()); - }; - let headers: Vec<&str> = header_line.split('\t').collect(); - let mut rows = Vec::new(); - for line in lines { - let values: Vec<&str> = line.split('\t').collect(); - let mut object = serde_json::Map::new(); - for (idx, header) in headers.iter().enumerate() { - object.insert( - (*header).to_owned(), - serde_json::Value::String(values.get(idx).copied().unwrap_or_default().to_owned()), - ); - } - rows.push(serde_json::Value::Object(object)); - } - Ok(rows) -} - -fn app_assay_id(path: &Path) -> Result { - match manifest_schema(path)?.as_str() { - "bioscript:panel:1.0" => Ok(load_panel_manifest(path)?.name), - "bioscript:assay:1.0" => Ok(load_assay_manifest(path)?.name), - "bioscript:variant:1.0" | "bioscript:variant" => Ok(load_variant_manifest(path)?.name), - other => Err(format!("unsupported manifest schema '{other}'")), 
- } -} - -fn participant_id_from_path(path: &Path) -> String { - let file_name = path - .file_name() - .and_then(|value| value.to_str()) - .unwrap_or("participant"); - file_name - .trim_end_matches(".txt.zip") - .trim_end_matches(".csv.zip") - .trim_end_matches(".vcf.gz") - .trim_end_matches(".cram") - .trim_end_matches(".zip") - .trim_end_matches(".txt") - .trim_end_matches(".csv") - .to_owned() -} - -fn app_observation_from_manifest_row( - runtime_root: &Path, - row: &BTreeMap, - assay_id: &str, -) -> Result { - let row_path = row.get("path").cloned().unwrap_or_default(); - let manifest_path = if Path::new(&row_path).is_absolute() { - PathBuf::from(&row_path) - } else { - runtime_root.join(&row_path) - }; - let manifest = load_variant_manifest(&manifest_path)?; - let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); - let genotype_display = row.get("genotype").cloned().unwrap_or_default(); - let alt_alleles = variant_alt_alleles(&manifest_path)?; - let alt_allele = observed_alt_allele(&genotype_display, &ref_allele, &alt_alleles) - .or_else(|| manifest.spec.alternate.clone()) - .unwrap_or_default(); - let (genotype, zygosity) = normalize_app_genotype(&genotype_display, &ref_allele, &alt_allele); - let depth = parse_optional_u32(row.get("depth")); - let ref_count = parse_optional_u32(row.get("ref_count")); - let alt_count = parse_optional_u32(row.get("alt_count")); - let allele_balance = match (alt_count, depth) { - (Some(alt_count), Some(depth)) if depth > 0 => { - Some(f64::from(alt_count) / f64::from(depth)) - } - _ => None, - }; - let assembly = row.get("assembly").cloned().unwrap_or_default(); - let locus = if assembly.eq_ignore_ascii_case("grch37") { - manifest.spec.grch37.as_ref() - } else { - manifest - .spec - .grch38 - .as_ref() - .or(manifest.spec.grch37.as_ref()) - }; - let outcome = if genotype == "./." 
{ - "no_call" - } else if zygosity == "hom_ref" { - "reference" - } else if zygosity == "het" || zygosity == "hom_alt" { - "variant" - } else { - "unknown" - }; - let evidence_raw = row.get("evidence").cloned().unwrap_or_default(); - Ok(serde_json::json!({ - "participant_id": row.get("participant_id").cloned().unwrap_or_default(), - "assay_id": assay_id, - "assay_version": "1.0", - "variant_key": manifest.name, - "variant_path": row_path, - "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), - "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, - "chrom": locus.map_or(String::new(), |locus| locus.chrom.clone()), - "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), - "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), - "ref": ref_allele, - "alt": alt_allele, - "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), - "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, - "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), - "call_status": if genotype == "./." 
{ "no_call" } else { "called" }, - "genotype": genotype, - "genotype_display": genotype_display, - "zygosity": zygosity, - "ref_count": ref_count, - "alt_count": alt_count, - "depth": depth, - "genotype_quality": serde_json::Value::Null, - "allele_balance": allele_balance, - "outcome": outcome, - "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, - "evidence_raw": evidence_raw, - "facets": serde_json::Value::Null, - })) -} - -fn variant_alt_alleles(path: &Path) -> Result, String> { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read variant YAML {}: {err}", path.display()))?; - let value: serde_yaml::Value = serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse variant YAML {}: {err}", path.display()))?; - let Some(items) = value - .as_mapping() - .and_then(|mapping| mapping.get(serde_yaml::Value::String("alleles".to_owned()))) - .and_then(serde_yaml::Value::as_mapping) - .and_then(|mapping| { - mapping - .get(serde_yaml::Value::String("observed_alts".to_owned())) - .or_else(|| mapping.get(serde_yaml::Value::String("alts".to_owned()))) - }) - .and_then(serde_yaml::Value::as_sequence) - else { - return Ok(Vec::new()); - }; - Ok(items - .iter() - .filter_map(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) - .collect()) -} - -fn observed_alt_allele( - genotype_display: &str, - ref_allele: &str, - alts: &[String], -) -> Option { - if ref_allele.len() != 1 { - return None; - } - let ref_ch = ref_allele.chars().next()?; - genotype_display - .chars() - .filter(|ch| ch.is_ascii_alphabetic() && *ch != ref_ch) - .find_map(|ch| { - alts.iter() - .find(|alt| alt.len() == 1 && alt.starts_with(ch)) - .cloned() - }) -} - -fn normalize_app_genotype(display: &str, ref_allele: &str, alt_allele: &str) -> (String, String) { - if display.is_empty() { - return ("./.".to_owned(), "unknown".to_owned()); - } - let alleles: Vec = display - .chars() - .filter(|ch| 
ch.is_ascii_alphabetic()) - .collect(); - if alleles.len() != 2 || ref_allele.len() != 1 || alt_allele.len() != 1 { - return (display.to_owned(), "unknown".to_owned()); - } - let ref_ch = ref_allele.chars().next().unwrap_or_default(); - let alt_ch = alt_allele.chars().next().unwrap_or_default(); - let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); - let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); - match (ref_count, alt_count) { - (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), - (1, 1) => ("0/1".to_owned(), "het".to_owned()), - (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), - _ => (display.to_owned(), "unknown".to_owned()), - } -} - -fn parse_optional_u32(value: Option<&String>) -> Option { - value.and_then(|value| value.parse::().ok()) -} - -fn load_manifest_findings( - root: &Path, - manifest_path: &Path, -) -> Result, String> { - let value = load_yaml_value(manifest_path)?; - let schema = value - .get("schema") - .and_then(serde_yaml::Value::as_str) - .unwrap_or_default(); - let mut findings = Vec::new(); - - if matches!( - schema, - "bioscript:variant:1.0" - | "bioscript:variant" - | "bioscript:assay:1.0" - | "bioscript:panel:1.0" - | "bioscript:pgx-findings:1.0" - ) { - if let Some(items) = value - .get("findings") - .and_then(serde_yaml::Value::as_sequence) - { - for item in items { - let json_item = yaml_to_json(item.clone())?; - let include = json_item - .get("include") - .and_then(serde_json::Value::as_str) - .map(str::to_owned); - if let Some(include) = include { - let include_path = resolve_manifest_path(root, manifest_path, &include)?; - let mut included = load_manifest_findings(root, &include_path)?; - let inherited_binding = json_item.get("binding").cloned(); - for included_item in &mut included { - if inherited_binding.is_some() - && included_item.get("binding").is_none() - && included_item.get("effects").is_none() - { - if let Some(object) = included_item.as_object_mut() { - 
object.insert( - "binding".to_owned(), - inherited_binding.clone().unwrap_or(serde_json::Value::Null), - ); - } - } - } - findings.extend(included); - continue; - } - if json_item.get("include").is_none() { - findings.push(json_item); - } - } - } - } - - if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") { - if let Some(items) = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - { - for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { - continue; - }; - if !matches!(kind, "variant" | "assay") { - continue; - } - let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { - continue; - }; - let member_path = resolve_manifest_path(root, manifest_path, path)?; - findings.extend(load_manifest_findings(root, &member_path)?); - } - } - } - - Ok(findings) -} - -fn load_yaml_value(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; - serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) -} - -fn yaml_to_json(value: serde_yaml::Value) -> Result { - serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) -} - -fn load_manifest_provenance_links( - root: &Path, - manifest_path: &Path, -) -> Result, String> { - let value = load_yaml_value(manifest_path)?; - let schema = value - .get("schema") - .and_then(serde_yaml::Value::as_str) - .unwrap_or_default(); - let mut links = BTreeMap::::new(); - collect_manifest_provenance_entries(&value, &mut links)?; - - if matches!( - schema, - "bioscript:variant:1.0" - | "bioscript:variant" - | "bioscript:assay:1.0" - | "bioscript:panel:1.0" - | "bioscript:pgx-findings:1.0" - ) { - if let Some(items) = value - .get("findings") - .and_then(serde_yaml::Value::as_sequence) - { - for item in items { - let json_item = yaml_to_json(item.clone())?; - let Some(include) = 
json_item.get("include").and_then(serde_json::Value::as_str)
                else {
                    continue;
                };
                let include_path = resolve_manifest_path(root, manifest_path, include)?;
                // Recurse into included manifests; first occurrence of a URL wins.
                for item in load_manifest_provenance_links(root, &include_path)? {
                    if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) {
                        links.entry(url.to_owned()).or_insert(item);
                    }
                }
            }
        }
    }

    if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") {
        if let Some(items) = value
            .get("members")
            .and_then(serde_yaml::Value::as_sequence)
        {
            for member in items {
                let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else {
                    continue;
                };
                // Only local variant/assay members can carry provenance.
                if !matches!(kind, "variant" | "assay") {
                    continue;
                }
                let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else {
                    continue;
                };
                let member_path = resolve_manifest_path(root, manifest_path, path)?;
                for item in load_manifest_provenance_links(root, &member_path)? {
                    if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) {
                        links.entry(url.to_owned()).or_insert(item);
                    }
                }
            }
        }
    }

    Ok(links.into_values().collect())
}

/// Collects provenance entries declared directly on one manifest value.
///
/// Reads both the list form (`provenance.sources`) and the single
/// `source` key, converts each entry to JSON, and inserts it into
/// `links` keyed by `url`. Entries already present (first seen wins)
/// and entries without a `url` are skipped.
fn collect_manifest_provenance_entries(
    value: &serde_yaml::Value,
    links: &mut BTreeMap<String, serde_json::Value>,
) -> Result<(), String> {
    if let Some(sources) = value
        .get("provenance")
        .and_then(|provenance| provenance.get("sources"))
        .and_then(serde_yaml::Value::as_sequence)
    {
        for source in sources {
            let json = yaml_to_json(source.clone())?;
            if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) {
                links.entry(url.to_owned()).or_insert(json);
            }
        }
    }
    if let Some(source) = value.get("source") {
        let json = yaml_to_json(source.clone())?;
        if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) {
            links.entry(url.to_owned()).or_insert(json);
        }
    }
    Ok(())
}

/// Matches annotation findings against genotype observations and
/// analysis outputs for the report.
///
/// A finding with an `effects` array is matched effect-by-effect: each
/// effect that matches an observation (or, failing that, an analysis
/// row) produces one output item carrying `matched`, `matched_effect`,
/// and the matched context. Findings without `effects` are matched as a
/// whole. Duplicates are suppressed via `app_finding_dedupe_key`.
fn match_app_findings(
    findings: &[serde_json::Value],
    observations: &[serde_json::Value],
    analyses: &[serde_json::Value],
) -> Vec<serde_json::Value> {
    let mut matched = Vec::new();
    let mut seen = std::collections::BTreeSet::new();
    for finding in findings {
        if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) {
            for effect in effects {
                if let Some(observation) = app_finding_match_observation(effect, observations) {
                    let mut item = finding.clone();
                    if let Some(object) = item.as_object_mut() {
                        // The raw effects list is replaced by the single matched effect.
                        object.remove("effects");
                        object.insert("matched".to_owned(), serde_json::Value::Bool(true));
                        object.insert("matched_effect".to_owned(), effect.clone());
                        object.insert(
                            "matched_observation".to_owned(),
                            app_finding_observation_context(observation),
                        );
                    }
                    let key = app_finding_dedupe_key(&item);
                    if seen.insert(key) {
                        matched.push(item);
                    }
                } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) {
                    let mut item = finding.clone();
                    if let Some(object) = item.as_object_mut() {
                        object.remove("effects");
                        object.insert("matched".to_owned(), serde_json::Value::Bool(true));
                        object.insert("matched_effect".to_owned(), effect.clone());
                        object.insert("matched_analysis".to_owned(), analysis);
                    }
                    let key = app_finding_dedupe_key(&item);
                    if seen.insert(key) {
                        matched.push(item);
                    }
                }
            }
        } else if let Some(observation) = app_finding_match_observation(finding, observations) {
            let mut item = finding.clone();
            if let Some(object) = item.as_object_mut() {
                object.insert("matched".to_owned(), serde_json::Value::Bool(true));
                object.insert(
                    "matched_observation".to_owned(),
                    app_finding_observation_context(observation),
                );
            }
            let key = app_finding_dedupe_key(&item);
            if seen.insert(key) {
                matched.push(item);
            }
        } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) {
            let mut item = finding.clone();
            if let Some(object) = item.as_object_mut() {
                object.insert("matched".to_owned(), serde_json::Value::Bool(true));
                object.insert("matched_analysis".to_owned(), analysis);
            }
            let key =
app_finding_dedupe_key(&item);
            if seen.insert(key) {
                matched.push(item);
            }
        }
    }
    matched
}

/// Finds the first observation satisfying a finding's variant binding.
///
/// Only `binding.source == "variant"` is handled here; analysis
/// bindings go through `app_finding_match_analysis`. Returns `None`
/// when the finding has no binding or no observation matches.
fn app_finding_match_observation<'a>(
    finding: &serde_json::Value,
    observations: &'a [serde_json::Value],
) -> Option<&'a serde_json::Value> {
    let binding = finding.get("binding")?;
    match binding.get("source").and_then(serde_json::Value::as_str) {
        Some("variant") => app_variant_binding_match_observation(binding, observations),
        _ => None,
    }
}

/// Matches an `analysis`-sourced binding against emitted analysis rows.
///
/// The binding may pin a specific `analysis_id` (legacy key:
/// `analysis`); when absent, all analyses are candidates. Returns a
/// JSON context object for the first row whose `key` column satisfies
/// the binding, or `None`.
fn app_finding_match_analysis(
    finding: &serde_json::Value,
    analyses: &[serde_json::Value],
) -> Option<serde_json::Value> {
    let binding = finding.get("binding")?;
    if binding.get("source").and_then(serde_json::Value::as_str) != Some("analysis") {
        return None;
    }
    let analysis_id = binding
        .get("analysis_id")
        .or_else(|| binding.get("analysis"))
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default();
    let key = binding.get("key").and_then(serde_json::Value::as_str)?;
    for analysis in analyses {
        if !analysis_id.is_empty()
            && analysis
                .get("analysis_id")
                .and_then(serde_json::Value::as_str)
                != Some(analysis_id)
        {
            continue;
        }
        let Some(rows) = analysis.get("rows").and_then(serde_json::Value::as_array) else {
            continue;
        };
        for row in rows {
            if app_binding_matches_value(row.get(key), binding) {
                return Some(serde_json::json!({
                    "participant_id": analysis.get("participant_id").cloned().unwrap_or(serde_json::Value::Null),
                    "assay_id": analysis.get("assay_id").cloned().unwrap_or(serde_json::Value::Null),
                    "analysis_id": analysis.get("analysis_id").cloned().unwrap_or(serde_json::Value::Null),
                    "key": key,
                    "value": row.get(key).cloned().unwrap_or(serde_json::Value::Null),
                    "row": row,
                }));
            }
        }
    }
    None
}

/// Resolves a variant binding to the first observation it matches.
///
/// Dosage operators (`dosage_equals` / `dosage_in`) count copies of the
/// bound allele; all other operators compare a single observation field
/// named by `binding.key`. Observations referencing a different variant
/// (per `app_variant_ref_mismatch`) are filtered out first.
fn app_variant_binding_match_observation<'a>(
    binding: &serde_json::Value,
    observations: &'a [serde_json::Value],
) -> Option<&'a serde_json::Value> {
    let operator = binding
        .get("operator")
        .and_then(serde_json::Value::as_str)
        .unwrap_or("equals");
    if matches!(operator, "dosage_equals" | "dosage_in") {
        let allele = binding
            .get("allele")
            .and_then(serde_json::Value::as_str)
            .unwrap_or_default();
        return observations
            .iter()
            .filter(|observation| !app_variant_ref_mismatch(binding, observation))
            .find(|observation| {
                let dosage = app_observation_allele_dosage(observation, allele);
                app_binding_matches_dosage(dosage, binding)
            });
    }

    let key = binding
        .get("key")
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default();
    if key.is_empty() {
        return None;
    }
    observations
        .iter()
        .filter(|observation| !app_variant_ref_mismatch(binding, observation))
        .find(|observation| app_binding_matches_value(observation.get(key), binding))
}

/// Extracts the subset of observation fields embedded in a matched
/// finding as `matched_observation` context.
fn app_finding_observation_context(observation: &serde_json::Value) -> serde_json::Value {
    serde_json::json!({
        "participant_id": observation.get("participant_id").cloned().unwrap_or(serde_json::Value::Null),
        "rsid": observation.get("rsid").cloned().unwrap_or(serde_json::Value::Null),
        "ref": observation.get("ref").cloned().unwrap_or(serde_json::Value::Null),
        "alt": observation.get("alt").cloned().unwrap_or(serde_json::Value::Null),
        "genotype_display": observation.get("genotype_display").cloned().unwrap_or(serde_json::Value::Null),
        "outcome": observation.get("outcome").cloned().unwrap_or(serde_json::Value::Null),
    })
}

/// Returns `true` when the binding names a variant that this
/// observation does NOT belong to.
///
/// The binding's `variant` (or `path`) reference is compared against
/// the observation's `variant_key`, `variant_path`, and `rsid`, both as
/// full strings and by file basename. An empty reference never
/// mismatches (binding applies to any observation).
fn app_variant_ref_mismatch(binding: &serde_json::Value, observation: &serde_json::Value) -> bool {
    let variant_ref = binding
        .get("variant")
        .or_else(|| binding.get("path"))
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default();
    if variant_ref.is_empty() {
        return false;
    }
    let basename = Path::new(variant_ref)
        .file_name()
        .and_then(|value| value.to_str())
        .unwrap_or(variant_ref);
    let candidates = [
        observation
            .get("variant_key")
            .and_then(serde_json::Value::as_str),
        observation
            .get("variant_path")
            .and_then(serde_json::Value::as_str),
        observation.get("rsid").and_then(serde_json::Value::as_str),
    ];
    !candidates.into_iter().flatten().any(|candidate| {
        candidate == variant_ref
            || Path::new(candidate)
                .file_name()
                .and_then(|value| value.to_str())
                .is_some_and(|value| value == basename)
    })
}

/// Counts how many copies (0/1/2) of `allele` the observation carries.
///
/// Resolution order: exact match on the REF allele (via zygosity),
/// exact match on the ALT allele (via zygosity), then — for
/// single-base alleles only — a case-insensitive character count over
/// `genotype_display`. Returns `None` when the dosage cannot be
/// determined.
fn app_observation_allele_dosage(observation: &serde_json::Value, allele: &str) -> Option<i64> {
    if allele.is_empty() {
        return None;
    }
    let ref_allele = observation
        .get("ref")
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default();
    let alt_allele = observation
        .get("alt")
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default();
    let zygosity = observation
        .get("zygosity")
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default();
    if allele == ref_allele {
        return match zygosity {
            "hom_ref" => Some(2),
            "het" => Some(1),
            "hom_alt" => Some(0),
            _ => None,
        };
    }
    if allele == alt_allele {
        return match zygosity {
            "hom_ref" => Some(0),
            "het" => Some(1),
            "hom_alt" => Some(2),
            _ => None,
        };
    }
    let display = observation
        .get("genotype_display")
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default();
    if allele.len() == 1 {
        let allele_ch = allele.chars().next()?.to_ascii_uppercase();
        return Some(
            display
                .chars()
                .filter(|ch| ch.to_ascii_uppercase() == allele_ch)
                .count()
                .try_into()
                .ok()?,
        );
    }
    None
}

/// Evaluates a string-valued binding (`equals` / `in`) against a field.
/// Missing fields compare as the empty string; unknown operators never
/// match.
fn app_binding_matches_value(
    actual: Option<&serde_json::Value>,
    binding: &serde_json::Value,
) -> bool {
    let actual = actual.and_then(value_as_string).unwrap_or_default();
    match binding
        .get("operator")
        .and_then(serde_json::Value::as_str)
        .unwrap_or("equals")
    {
        "equals" => binding
            .get("value")
            .and_then(value_as_string)
            .is_some_and(|value| value == actual),
        "in" => binding
            .get("values")
            .and_then(serde_json::Value::as_array)
            .is_some_and(|values| {
                values
                    .iter()
                    .filter_map(value_as_string)
                    .any(|value| value == actual)
            }),
        _ => false,
    }
}

/// Evaluates a dosage binding (`dosage_equals` / `dosage_in`) against a
/// computed allele dosage. `None` dosage never matches.
fn app_binding_matches_dosage(dosage: Option<i64>, binding: &serde_json::Value) -> bool {
    let Some(dosage) = dosage else {
        return false;
    };
    match binding
        .get("operator")
        .and_then(serde_json::Value::as_str)
        .unwrap_or_default()
    {
        "dosage_equals" => binding
            .get("value")
            .and_then(serde_json::Value::as_i64)
            .is_some_and(|value| value == dosage),
        "dosage_in" => binding
            .get("values")
            .and_then(serde_json::Value::as_array)
            .is_some_and(|values| {
                values
                    .iter()
                    .filter_map(serde_json::Value::as_i64)
                    .any(|value| value == dosage)
            }),
        _ => false,
    }
}

/// Renders scalar JSON values (string/number/bool) as strings for
/// comparison; arrays, objects, and null yield `None`.
fn value_as_string(value: &serde_json::Value) -> Option<String> {
    match value {
        serde_json::Value::String(value) => Some(value.clone()),
        serde_json::Value::Number(value) => Some(value.to_string()),
        serde_json::Value::Bool(value) => Some(value.to_string()),
        _ => None,
    }
}

/// Builds a stable dedupe key for a matched finding.
///
/// Preference order: evidence identity (source/kind/id), evidence URL,
/// the finding's own `id`, then a content hash of schema/label/notes.
/// The matched effect's id/label/text is appended so the same finding
/// matched via different effects is kept.
fn app_finding_dedupe_key(finding: &serde_json::Value) -> String {
    let effect_key = finding
        .get("matched_effect")
        .and_then(|effect| {
            effect
                .get("id")
                .or_else(|| effect.get("label"))
                .or_else(|| effect.get("text"))
        })
        .and_then(value_as_string)
        .unwrap_or_default();
    if let Some(evidence) = finding.get("evidence") {
        let source = evidence
            .get("source")
            .and_then(value_as_string)
            .unwrap_or_default();
        let kind = evidence
            .get("kind")
            .and_then(value_as_string)
            .unwrap_or_default();
        let id = evidence
            .get("id")
            .and_then(value_as_string)
            .unwrap_or_default();
        if !source.is_empty() || !kind.is_empty() || !id.is_empty() {
            return format!("evidence|{source}|{kind}|{id}|{effect_key}");
        }
        if let Some(url) = evidence.get("url").and_then(value_as_string) {
            return format!("evidence_url|{url}|{effect_key}");
        }
    }
    if let Some(id) = finding.get("id").and_then(value_as_string) {
        return format!("id|{id}|{effect_key}");
    }
    format!(
        "content|{}|{}|{}|{}",
        finding
            .get("schema")
            .and_then(value_as_string)
.unwrap_or_default(), - finding - .get("label") - .and_then(value_as_string) - .unwrap_or_default(), - finding - .get("notes") - .and_then(value_as_string) - .unwrap_or_default(), - effect_key - ) -} - -fn app_report_json( - assay_id: &str, - participant_id: &str, - input_file: &Path, - observations: &[serde_json::Value], - analyses: &[serde_json::Value], - findings: &[serde_json::Value], - provenance: &[serde_json::Value], -) -> serde_json::Value { - let called = observations - .iter() - .filter(|item| { - item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") - }) - .count(); - serde_json::json!({ - "schema": "bioscript:report:1.0", - "version": "1.0", - "participant_id": participant_id, - "assay_id": assay_id, - "assay_version": "1.0", - "input": { - "file_name": input_file.file_name().and_then(|value| value.to_str()).unwrap_or_default(), - "file_path": input_file.display().to_string(), - }, - "report_status": if called == observations.len() { "complete" } else { "partial" }, - "derived_from": observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), - "analyses": analyses, - "findings": findings, - "provenance": provenance, - "metrics": { - "n_sites_tested": observations.len(), - "n_sites_called": called, - "n_sites_missing": observations.len().saturating_sub(called), - "n_analyses": analyses.len(), - "n_findings_matched": findings.len(), - } - }) -} - -fn write_app_observations( - output_dir: &Path, - observations: &[serde_json::Value], - format: AppOutputFormat, -) -> Result<(), String> { - if matches!(format, AppOutputFormat::Tsv | AppOutputFormat::Both) { - let mut out = bioscript_core::OBSERVATION_TSV_HEADERS.join("\t"); - out.push('\n'); - for observation in observations { - let line = bioscript_core::OBSERVATION_TSV_HEADERS - .iter() - .map(|header| json_field_as_tsv(observation.get(*header))) - .collect::>() - .join("\t"); - out.push_str(&line); - out.push('\n'); - } - 
fs::write(output_dir.join("observations.tsv"), out) - .map_err(|err| format!("failed to write observations.tsv: {err}"))?; - } - if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { - write_jsonl(&output_dir.join("observations.jsonl"), observations)?; - } - if matches!(format, AppOutputFormat::Json) { - write_json_pretty( - &output_dir.join("observations.json"), - &serde_json::json!({"observations": observations}), - )?; - } - Ok(()) -} - -fn write_app_analyses(output_dir: &Path, analyses: &[serde_json::Value]) -> Result<(), String> { - write_jsonl(&output_dir.join("analysis.jsonl"), analyses) -} - -fn write_app_reports( - output_dir: &Path, - reports: &[serde_json::Value], - format: AppOutputFormat, -) -> Result<(), String> { - if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { - write_jsonl(&output_dir.join("reports.jsonl"), reports)?; - } - if matches!(format, AppOutputFormat::Json | AppOutputFormat::Both) { - write_json_pretty( - &output_dir.join("reports.json"), - &serde_json::json!({ - "schema": "bioscript:report-set:1.0", - "version": "1.0", - "reports": reports, - }), - )?; - } - Ok(()) -} - -fn write_jsonl(path: &Path, rows: &[serde_json::Value]) -> Result<(), String> { - let mut out = String::new(); - for row in rows { - let line = serde_json::to_string(row).map_err(|err| err.to_string())?; - out.push_str(&line); - out.push('\n'); - } - fs::write(path, out).map_err(|err| format!("failed to write {}: {err}", path.display())) -} - -fn write_json_pretty(path: &Path, value: &serde_json::Value) -> Result<(), String> { - let text = serde_json::to_string_pretty(value).map_err(|err| err.to_string())?; - fs::write(path, text).map_err(|err| format!("failed to write {}: {err}", path.display())) -} - -fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { - match value { - Some(serde_json::Value::Null) | None => String::new(), - Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), - Some(value) 
=> value.to_string().replace(['\t', '\n'], " "), - } -} - -fn write_app_html( - output_dir: &Path, - observations: &[serde_json::Value], - reports: &[serde_json::Value], -) -> Result<(), String> { - let mut out = String::from( - r##"BioScript report

BioScript Report

"##, - ); - let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); - let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); - let analysis_outputs = collect_report_analyses(reports); - let _ = write!( - out, - "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", - observations.len(), - analysis_outputs.len(), - label_findings.len(), - summary_findings.len() - ); - out.push_str(""); - out.push_str("

Observations

"); - render_observation_table(&mut out, observations); - out.push_str("
"); - out.push_str("

Analysis

"); - render_analysis_tables(&mut out, &analysis_outputs); - out.push_str("
"); - out.push_str("

PGx Label Annotations

"); - render_pgx_label_table(&mut out, &label_findings); - out.push_str("
"); - out.push_str("

PGx Summary Annotations

"); - render_pgx_summary_table(&mut out, &summary_findings); - out.push_str("
"); - out.push_str("

Provenance

"); - render_provenance_links(&mut out, reports); - out.push_str("
"); - out.push_str("

Raw Reports JSON

"); - for report in reports { - let text = serde_json::to_string_pretty(report).map_err(|err| err.to_string())?; - let _ = write!(out, "
{}
", html_escape(&text)); - } - out.push_str("
"); - fs::write(output_dir.join("index.html"), out) - .map_err(|err| format!("failed to write index.html: {err}")) -} - -fn collect_report_analyses(reports: &[serde_json::Value]) -> Vec { - reports - .iter() - .filter_map(|report| report.get("analyses").and_then(serde_json::Value::as_array)) - .flat_map(|analyses| analyses.iter()) - .cloned() - .collect() -} - -fn collect_report_findings(reports: &[serde_json::Value], schema: &str) -> Vec { - reports - .iter() - .filter_map(|report| report.get("findings").and_then(serde_json::Value::as_array)) - .flat_map(|findings| findings.iter()) - .filter(|finding| finding.get("schema").and_then(serde_json::Value::as_str) == Some(schema)) - .cloned() - .collect() -} - -fn render_analysis_tables(out: &mut String, analyses: &[serde_json::Value]) { - if analyses.is_empty() { - out.push_str("

No analysis outputs.

"); - return; - } - for (index, analysis) in analyses.iter().enumerate() { - let table_id = format!("analysis-table-{index}"); - let title = format!( - "{} / {}", - value_str(analysis, "participant_id"), - value_str(analysis, "analysis_id") - ); - let _ = write!(out, "

{}

", html_escape(&title)); - render_analysis_logic(out, analysis); - let rows = analysis - .get("rows") - .and_then(serde_json::Value::as_array) - .cloned() - .unwrap_or_default(); - if rows.is_empty() { - out.push_str("

No rows emitted.

"); - continue; - } - let headers = analysis_row_headers(&rows); - let header_refs = headers.iter().map(String::as_str).collect::>(); - render_table_start(out, &table_id, &header_refs); - for row in rows { - out.push_str(""); - for header in &headers { - table_cell(out, &json_field_as_tsv(row.get(header))); - } - out.push_str(""); - } - render_table_end(out); - } -} - -fn analysis_row_headers(rows: &[serde_json::Value]) -> Vec { - let mut headers = Vec::new(); - for row in rows { - let Some(object) = row.as_object() else { - continue; - }; - for key in object.keys() { - if !headers.contains(key) { - headers.push(key.clone()); - } - } - } - headers -} - -fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { - let Some(logic) = analysis.get("logic") else { - return; - }; - if logic.is_null() { - return; - } - let description = logic - .get("description") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let source = logic.get("source").unwrap_or(&serde_json::Value::Null); - let source_name = source - .get("name") - .and_then(serde_json::Value::as_str) - .unwrap_or("source"); - let source_url = source - .get("url") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - out.push_str("
"); - if !description.is_empty() { - let _ = write!(out, "

{}

", html_escape(description)); - } - if !source_url.is_empty() { - let _ = write!( - out, - "

Logic source: {}

", - html_escape(source_url), - html_escape(source_name) - ); - } - out.push_str("
"); -} - -fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { - let mut links = BTreeMap::::new(); - for report in reports { - collect_provenance_links_from_value(report, &mut links); - } - if links.is_empty() { - out.push_str("

No provenance links.

"); - return; - } - out.push_str("
    "); - for (url, label) in links { - let display = if label.is_empty() { &url } else { &label }; - let _ = write!( - out, - "
  • {}
    {}
  • ", - html_escape(&url), - html_escape(display), - html_escape(&url) - ); - } - out.push_str("
"); -} - -fn collect_provenance_links_from_value( - value: &serde_json::Value, - links: &mut BTreeMap, -) { - match value { - serde_json::Value::Object(object) => { - if let Some(url) = object.get("url").and_then(serde_json::Value::as_str) - && url.starts_with("http") - { - let label = object - .get("name") - .or_else(|| object.get("label")) - .or_else(|| object.get("source")) - .and_then(value_as_string) - .unwrap_or_default(); - links.entry(url.to_owned()).or_insert(label); - } - for child in object.values() { - collect_provenance_links_from_value(child, links); - } - } - serde_json::Value::Array(items) => { - for item in items { - collect_provenance_links_from_value(item, links); - } - } - _ => {} - } -} - -fn render_observation_table(out: &mut String, observations: &[serde_json::Value]) { - let headers = [ - "participant_id", - "rsid", - "ref", - "alt", - "genotype_display", - "genotype", - "zygosity", - "outcome", - "match_status", - "coverage_status", - "call_status", - "assembly", - "chrom", - "pos_start", - "pos_end", - "kind", - "ref_count", - "alt_count", - "depth", - "genotype_quality", - "allele_balance", - "evidence_type", - "evidence_raw", - "facets", - "assay_id", - "assay_version", - "variant_key", - ]; - render_table_start(out, "observations-table", &headers); - for observation in observations { - let _ = write!(out, "", observation_row_class(observation)); - for header in headers { - render_observation_cell(out, observation, header); - } - out.push_str(""); - } - out.push_str(""); -} - -fn observation_row_class(observation: &serde_json::Value) -> &'static str { - match observation - .get("outcome") - .and_then(serde_json::Value::as_str) - .unwrap_or_default() - { - "variant" => "row-variant", - "reference" => "row-reference", - _ => "", - } -} - -fn render_observation_cell(out: &mut String, observation: &serde_json::Value, header: &str) { - if header == "genotype_display" { - let outcome = observation - .get("outcome") - 
.and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let value = json_field_as_tsv(observation.get(header)); - if outcome == "variant" { - let alt = observation - .get("alt") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let _ = write!( - out, - "{}", - highlight_allele(&value, alt) - ); - return; - } - } - let _ = write!( - out, - "{}", - html_escape(&json_field_as_tsv(observation.get(header))) - ); -} - -fn highlight_allele(value: &str, allele: &str) -> String { - if value.is_empty() || allele.is_empty() { - return html_escape(value); - } - if allele.chars().count() == 1 { - let target = allele - .chars() - .next() - .unwrap_or_default() - .to_ascii_uppercase(); - let mut out = String::new(); - for ch in value.chars() { - let escaped = html_escape(&ch.to_string()); - if ch.to_ascii_uppercase() == target { - let _ = write!(out, "{escaped}"); - } else { - out.push_str(&escaped); - } - } - return out; - } - let escaped_value = html_escape(value); - let escaped_allele = html_escape(allele); - escaped_value.replace( - &escaped_allele, - &format!("{escaped_allele}"), - ) -} - -fn render_pgx_label_table(out: &mut String, findings: &[serde_json::Value]) { - let headers = [ - "Variant", - "Ref/Alt", - "Genes", - "Drug(s)", - "Regulator", - "Action", - "Label", - "Evidence", - ]; - render_pgx_label_filters(out); - render_table_start(out, "labels-table", &headers); - for finding in findings { - let evidence = finding.get("evidence"); - let url = evidence - .and_then(|value| value.get("url")) - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let pgx_level = value_str(finding, "pgx_action_level"); - let _ = write!( - out, - "", - html_escape(&pgx_level_slug(pgx_level)) - ); - table_cell(out, value_str(finding, "variant")); - class_cell(out, &matched_ref_alt(finding), "mono"); - table_cell(out, &join_string_array(finding.get("genes"))); - table_cell(out, &join_drugs(finding)); - table_cell(out, 
&join_string_array(finding.get("regulatory_sources"))); - pgx_level_cell(out, pgx_level); - table_cell(out, value_str(finding, "label")); - link_cell(out, url); - out.push_str(""); - } - render_table_end(out); -} - -fn render_pgx_summary_table(out: &mut String, findings: &[serde_json::Value]) { - let headers = [ - "Variant", - "Ref/Alt", - "Genotype", - "Drug(s)", - "Category", - "Level", - "Phenotype", - "Effect", - "Evidence", - ]; - render_evidence_level_filters(out); - render_table_start(out, "summaries-table", &headers); - for finding in findings { - let effect = finding - .get("matched_effect") - .unwrap_or(&serde_json::Value::Null); - let evidence = finding.get("evidence"); - let url = evidence - .and_then(|value| value.get("url")) - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let evidence_level = value_str(finding, "evidence_level"); - let _ = write!( - out, - "", - html_escape(&evidence_level_group(evidence_level)) - ); - table_cell(out, value_str(finding, "variant")); - class_cell(out, &matched_ref_alt(finding), "mono"); - table_cell(out, value_str(effect, "label")); - table_cell(out, &join_drugs(finding)); - table_cell(out, &join_string_array(finding.get("phenotype_categories"))); - evidence_level_cell(out, evidence_level); - table_cell(out, &join_string_array(finding.get("phenotypes"))); - class_cell(out, value_str(effect, "text"), "effect"); - link_cell(out, url); - out.push_str(""); - } - render_table_end(out); -} - -fn render_evidence_level_filters(out: &mut String) { - out.push_str("
Evidence:"); - for (level, label) in [ - ("1", "Level 1"), - ("1a", "Level 1A"), - ("1b", "Level 1B"), - ("2", "Level 2"), - ("2a", "Level 2A"), - ("2b", "Level 2B"), - ("3", "Level 3"), - ("4", "Level 4"), - ] { - let _ = write!( - out, - "" - ); - } - out.push_str(""); - out.push_str("i
"); -} - -fn render_pgx_label_filters(out: &mut String) { - out.push_str("
PGx level:"); - for (level, label) in [ - ("required", "Testing Required"), - ("recommended", "Testing Recommended"), - ("actionable", "Actionable PGx"), - ("informative", "Informative PGx"), - ("no-clinical", "No Clinical PGx"), - ("criteria", "Criteria Not Met"), - ] { - let _ = write!( - out, - "" - ); - } - out.push_str(""); - out.push_str("i
"); -} - -fn matched_ref_alt(finding: &serde_json::Value) -> String { - let Some(observation) = finding.get("matched_observation") else { - return String::new(); - }; - let ref_allele = observation - .get("ref") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let alt_allele = observation - .get("alt") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - if ref_allele.is_empty() && alt_allele.is_empty() { - String::new() - } else { - let alt_display = alt_allele.replace(',', "/"); - format!("{ref_allele}->{alt_display}") - } -} - -fn evidence_level_group(level: &str) -> String { - let normalized = level.trim().to_ascii_lowercase(); - if normalized.starts_with("1a") { - "1a".to_owned() - } else if normalized.starts_with("1b") { - "1b".to_owned() - } else if normalized.starts_with('1') { - "1".to_owned() - } else if normalized.starts_with("2a") { - "2a".to_owned() - } else if normalized.starts_with("2b") { - "2b".to_owned() - } else if normalized.starts_with('2') { - "2".to_owned() - } else if normalized.starts_with('3') { - "3".to_owned() - } else if normalized.starts_with('4') { - "4".to_owned() - } else { - "unknown".to_owned() - } -} - -fn evidence_level_color_group(level: &str) -> String { - level - .chars() - .find(|ch| ch.is_ascii_digit()) - .map(|ch| ch.to_string()) - .unwrap_or_else(|| "unknown".to_owned()) -} - -fn evidence_level_cell(out: &mut String, level: &str) { - if level.is_empty() { - out.push_str(""); - return; - } - let group = evidence_level_color_group(level); - let _ = write!( - out, - "{}", - html_escape(&group), - html_escape(level) - ); -} - -fn pgx_level_slug(level: &str) -> String { - let normalized = level.to_ascii_lowercase(); - if normalized.contains("required") { - "required".to_owned() - } else if normalized.contains("recommended") { - "recommended".to_owned() - } else if normalized.contains("actionable") { - "actionable".to_owned() - } else if normalized.contains("informative") { - "informative".to_owned() 
- } else if normalized.contains("no clinical") { - "no-clinical".to_owned() - } else if normalized.contains("criteria") { - "criteria".to_owned() - } else { - "unknown".to_owned() - } -} - -fn pgx_level_cell(out: &mut String, level: &str) { - if level.is_empty() { - out.push_str(""); - return; - } - let slug = pgx_level_slug(level); - let _ = write!( - out, - "{}", - html_escape(&slug), - html_escape(level) - ); -} - -fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { - let escaped_id = html_escape(table_id); - let refs_control = if table_id == "observations-table" { - "" - } else { - "" - }; - let _ = write!( - out, - "
{refs_control}
" - ); - for (index, header) in headers.iter().enumerate() { - let _ = write!( - out, - "", - escaped_id, - index, - html_escape(header) - ); - } - out.push_str(""); -} - -fn render_table_end(out: &mut String) { - out.push_str("
{}
"); -} - -fn table_cell(out: &mut String, value: &str) { - class_cell(out, value, ""); -} - -fn class_cell(out: &mut String, value: &str, class_name: &str) { - if class_name.is_empty() { - let _ = write!(out, "{}", html_escape(value)); - } else { - let _ = write!( - out, - "{}", - class_name, - html_escape(value) - ); - } -} - -fn link_cell(out: &mut String, url: &str) { - if url.is_empty() { - out.push_str(""); - } else { - let escaped = html_escape(url); - let _ = write!( - out, - "source" - ); - } -} - -fn value_str<'a>(value: &'a serde_json::Value, key: &str) -> &'a str { - value - .get(key) - .and_then(serde_json::Value::as_str) - .unwrap_or_default() -} - -fn join_string_array(value: Option<&serde_json::Value>) -> String { - value - .and_then(serde_json::Value::as_array) - .map(|items| { - items - .iter() - .filter_map(serde_json::Value::as_str) - .collect::>() - .join(", ") - }) - .unwrap_or_default() -} - -fn join_drugs(finding: &serde_json::Value) -> String { - finding - .get("drugs") - .and_then(serde_json::Value::as_array) - .map(|items| { - items - .iter() - .filter_map(|drug| drug.get("name").and_then(serde_json::Value::as_str)) - .collect::>() - .join(", ") - }) - .unwrap_or_default() -} - -fn html_escape(value: &str) -> String { - value - .replace('&', "&") - .replace('<', "<") - .replace('>', ">") - .replace('"', """) -} - -fn run_validate_panels(args: Vec) -> Result<(), String> { - let mut path: Option = None; - let mut report_path: Option = None; - - let mut iter = args.into_iter(); - while let Some(arg) = iter.next() { - if arg == "--report" { - let Some(value) = iter.next() else { - return Err("--report requires a path".to_owned()); - }; - report_path = Some(PathBuf::from(value)); - } else if path.is_none() { - path = Some(PathBuf::from(arg)); - } else { - return Err(format!("unexpected argument: {arg}")); - } - } - - let Some(path) = path else { - return Err("usage: bioscript validate-panels [--report ]".to_owned()); - }; - - let report = 
validate_panels_path(&path)?;
    let text = report.render_text();
    print!("{text}");

    if let Some(report_path) = report_path {
        if let Some(parent) = report_path.parent() {
            std::fs::create_dir_all(parent).map_err(|err| {
                format!("failed to create report dir {}: {err}", parent.display())
            })?;
        }
        std::fs::write(&report_path, text)
            .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?;
    }

    if report.has_errors() {
        return Err(format!(
            "validation found {} errors and {} warnings",
            report.total_errors(),
            report.total_warnings()
        ));
    }

    Ok(())
}

/// CLI entry point for `bioscript validate-assays`.
///
/// Accepts one assay path plus an optional `--report <path>` that also
/// writes the rendered text report to disk. Prints the report to
/// stdout, then returns `Err` if arguments are malformed, I/O fails,
/// or validation found any errors.
fn run_validate_assays(args: Vec<String>) -> Result<(), String> {
    let mut path: Option<PathBuf> = None;
    let mut report_path: Option<PathBuf> = None;

    let mut iter = args.into_iter();
    while let Some(arg) = iter.next() {
        if arg == "--report" {
            let Some(value) = iter.next() else {
                return Err("--report requires a path".to_owned());
            };
            report_path = Some(PathBuf::from(value));
        } else if path.is_none() {
            path = Some(PathBuf::from(arg));
        } else {
            return Err(format!("unexpected argument: {arg}"));
        }
    }

    let Some(path) = path else {
        // NOTE(review): the argument placeholders in this usage string were
        // stripped during extraction; confirm exact wording against VCS.
        return Err("usage: bioscript validate-assays <path> [--report <path>]".to_owned());
    };

    let report = validate_assays_path(&path)?;
    let text = report.render_text();
    print!("{text}");

    if let Some(report_path) = report_path {
        if let Some(parent) = report_path.parent() {
            std::fs::create_dir_all(parent).map_err(|err| {
                format!("failed to create report dir {}: {err}", parent.display())
            })?;
        }
        std::fs::write(&report_path, text)
            .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?;
    }

    if report.has_errors() {
        return Err(format!(
            "validation found {} errors and {} warnings",
            report.total_errors(),
            report.total_warnings()
        ));
    }

    Ok(())
}

/// Returns `true` when the path has a `.yaml` or `.yml` extension
/// (case-sensitive), i.e. looks like a manifest file.
fn is_yaml_manifest(path: &Path) -> bool {
    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| matches!(ext, "yaml" | "yml"))
}
/// Options shared by the manifest-execution entry points; all paths are
/// CLI-supplied and resolved against `runtime_root` before use.
struct ManifestRunOptions<'a> {
    input_file: Option<&'a str>,
    output_file: Option<&'a str>,
    participant_id: Option<&'a str>,
    trace_report: Option<&'a Path>,
    loader: &'a GenotypeLoadOptions,
    filters: &'a [String],
}

/// Runs a manifest file by dispatching on its declared `schema`.
///
/// Resolves CLI-relative input/output/trace paths, then delegates to
/// the variant, panel, or assay runner and writes their row output.
/// Returns `Err` for unsupported schemas or any runner failure.
fn run_manifest(
    runtime_root: &Path,
    manifest_path: &Path,
    options: &ManifestRunOptions<'_>,
) -> Result<(), String> {
    let schema = manifest_schema(manifest_path)?;
    let resolved_input = options
        .input_file
        .map(|value| resolve_cli_path(runtime_root, value));
    let resolved_output = options
        .output_file
        .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value)));
    let resolved_trace = options
        .trace_report
        .map(|value| resolve_cli_path_buf(runtime_root, value));
    match schema.as_str() {
        // Bare "bioscript:variant" accepted for backwards compatibility.
        "bioscript:variant:1.0" | "bioscript:variant" => {
            let manifest = load_variant_manifest(manifest_path)?;
            let row = run_variant_manifest(
                runtime_root,
                &manifest,
                resolved_input.as_deref(),
                options.participant_id,
                options.loader,
            )?;
            write_manifest_outputs(
                std::slice::from_ref(&row),
                resolved_output.as_deref(),
                resolved_trace.as_deref(),
            )?;
            Ok(())
        }
        "bioscript:panel:1.0" => {
            let manifest = load_panel_manifest(manifest_path)?;
            let rows = run_panel_manifest(
                runtime_root,
                &manifest,
                resolved_input.as_deref(),
                options.participant_id,
                options.loader,
                options.filters,
            )?;
            write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?;
            Ok(())
        }
        "bioscript:assay:1.0" => {
            let manifest = load_assay_manifest(manifest_path)?;
            let rows = run_assay_manifest(
                runtime_root,
                &manifest,
                resolved_input.as_deref(),
                options.participant_id,
                options.loader,
                options.filters,
            )?;
            write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?;
            Ok(())
        }
        other => Err(format!("unsupported manifest schema '{other}'")),
    }
}

/// Executes a single variant manifest against a genotype file and
/// returns its observation row. Requires `--input-file`.
fn run_variant_manifest(
    runtime_root: &Path,
    manifest: &VariantManifest,
    input_file: Option<&str>,
    participant_id: Option<&str>,
    loader: &GenotypeLoadOptions,
) -> Result<BTreeMap<String, String>, String> {
    let input_file = input_file.ok_or("manifest execution requires --input-file")?;
    let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader)
        .map_err(|err| err.to_string())?;
    let observation = store
        .lookup_variant(&manifest.spec)
        .map_err(|err| err.to_string())?;
    Ok(variant_row(
        runtime_root,
        &manifest.path,
        &manifest.name,
        &manifest.tags,
        &observation,
        participant_id,
    ))
}

/// Executes every member of a panel manifest against one genotype file.
///
/// Variant members are looked up directly (subject to `filters`); assay
/// members are expanded via `run_assay_manifest_with_store` reusing the
/// already-loaded store. Remote members and other kinds are rejected.
fn run_panel_manifest(
    runtime_root: &Path,
    panel: &PanelManifest,
    input_file: Option<&str>,
    participant_id: Option<&str>,
    loader: &GenotypeLoadOptions,
    filters: &[String],
) -> Result<Vec<BTreeMap<String, String>>, String> {
    let input_file = input_file.ok_or("manifest execution requires --input-file")?;
    let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader)
        .map_err(|err| err.to_string())?;
    let mut rows = Vec::new();

    for member in &panel.members {
        let Some(path) = &member.path else {
            return Err("remote panel members are not executable yet".to_owned());
        };
        let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?;
        if member.kind == "variant" {
            let manifest = load_variant_manifest(&resolved)?;
            if !matches_filters(&manifest, &resolved, filters) {
                continue;
            }
            let observation = store
                .lookup_variant(&manifest.spec)
                .map_err(|err| err.to_string())?;
            rows.push(variant_row(
                runtime_root,
                &resolved,
                &manifest.name,
                &manifest.tags,
                &observation,
                participant_id,
            ));
        } else if member.kind == "assay" {
            let assay = load_assay_manifest(&resolved)?;
            rows.extend(run_assay_manifest_with_store(
                runtime_root,
                &assay,
                &store,
                participant_id,
                filters,
            )?);
        } else {
            return Err(format!(
                "panel member kind '{}' is not executable",
                member.kind
            ));
        }
    }

    Ok(rows)
}

/// Loads the genotype file, then executes an assay manifest against it.
/// Requires `--input-file`.
fn run_assay_manifest(
    runtime_root: &Path,
    assay: &AssayManifest,
    input_file: Option<&str>,
    participant_id: Option<&str>,
    loader: &GenotypeLoadOptions,
    filters: &[String],
) -> Result<Vec<BTreeMap<String, String>>, String> {
    let input_file = input_file.ok_or("manifest execution requires --input-file")?;
    let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader)
        .map_err(|err| err.to_string())?;
    run_assay_manifest_with_store(runtime_root, assay, &store, participant_id, filters)
}

/// Executes an assay's variant members against an already-loaded
/// genotype store. Assays currently contain only local variant members;
/// anything else is an error.
fn run_assay_manifest_with_store(
    runtime_root: &Path,
    assay: &AssayManifest,
    store: &GenotypeStore,
    participant_id: Option<&str>,
    filters: &[String],
) -> Result<Vec<BTreeMap<String, String>>, String> {
    let mut rows = Vec::new();

    for member in &assay.members {
        if member.kind != "variant" {
            return Err(format!(
                "assay member kind '{}' is not executable",
                member.kind
            ));
        }
        let Some(path) = &member.path else {
            return Err("remote assay members are not executable yet".to_owned());
        };
        let resolved = resolve_manifest_path(runtime_root, &assay.path, path)?;
        let manifest = load_variant_manifest(&resolved)?;
        if !matches_filters(&manifest, &resolved, filters) {
            continue;
        }
        let observation = store
            .lookup_variant(&manifest.spec)
            .map_err(|err| err.to_string())?;
        rows.push(variant_row(
            runtime_root,
            &resolved,
            &manifest.name,
            &manifest.tags,
            &observation,
            participant_id,
        ));
    }

    Ok(rows)
}

/// Builds the flat output row for one variant observation. Paths are
/// reported relative to `runtime_root` when possible.
fn variant_row(
    runtime_root: &Path,
    path: &Path,
    name: &str,
    tags: &[String],
    observation: &bioscript_core::VariantObservation,
    participant_id: Option<&str>,
) -> BTreeMap<String, String> {
    let mut row = BTreeMap::new();
    row.insert("kind".to_owned(), "variant".to_owned());
    row.insert("name".to_owned(), name.to_owned());
    row.insert(
        "path".to_owned(),
        path.strip_prefix(runtime_root)
            .unwrap_or(path)
            .display()
            .to_string(),
    );
    row.insert("tags".to_owned(), tags.join(","));
    row.insert("backend".to_owned(), observation.backend.clone());
    row.insert(
        "participant_id".to_owned(),
participant_id.unwrap_or_default().to_owned(), - ); - row.insert( - "matched_rsid".to_owned(), - observation.matched_rsid.clone().unwrap_or_default(), - ); - row.insert( - "assembly".to_owned(), - observation - .assembly - .map(|value| match value { - bioscript_core::Assembly::Grch37 => "grch37".to_owned(), - bioscript_core::Assembly::Grch38 => "grch38".to_owned(), - }) - .unwrap_or_default(), - ); - row.insert( - "genotype".to_owned(), - observation.genotype.clone().unwrap_or_default(), - ); - row.insert( - "ref_count".to_owned(), - observation - .ref_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "alt_count".to_owned(), - observation - .alt_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "depth".to_owned(), - observation - .depth - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert("evidence".to_owned(), observation.evidence.join(" | ")); - row -} - -fn write_manifest_outputs( - rows: &[BTreeMap], - output_file: Option<&Path>, - trace_report: Option<&Path>, -) -> Result<(), String> { - let text = render_rows_as_tsv(rows); - if let Some(output_file) = output_file { - if let Some(parent) = output_file.parent() { - fs::create_dir_all(parent).map_err(|err| { - format!("failed to create output dir {}: {err}", parent.display()) - })?; - } - fs::write(output_file, &text) - .map_err(|err| format!("failed to write output {}: {err}", output_file.display()))?; - } else { - print!("{text}"); - } - - if let Some(trace_report) = trace_report { - if let Some(parent) = trace_report.parent() { - fs::create_dir_all(parent) - .map_err(|err| format!("failed to create trace dir {}: {err}", parent.display()))?; - } - let mut trace = String::from("step\tline\tcode\n"); - for (idx, row) in rows.iter().enumerate() { - let _ = writeln!( - trace, - "{}\t{}\t{}", - idx + 1, - idx + 1, - row.get("path").cloned().unwrap_or_default() - ); - } - fs::write(trace_report, trace) - .map_err(|err| 
format!("failed to write trace {}: {err}", trace_report.display()))?; - } - - Ok(()) -} - -fn resolve_cli_path(root: &Path, value: &str) -> String { - resolve_cli_path_buf(root, Path::new(value)) - .display() - .to_string() -} - -fn resolve_cli_path_buf(root: &Path, value: &Path) -> PathBuf { - if value.is_absolute() { - value.to_path_buf() - } else { - root.join(value) - } -} - -fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { - let headers = [ - "kind", - "name", - "path", - "tags", - "participant_id", - "backend", - "matched_rsid", - "assembly", - "genotype", - "ref_count", - "alt_count", - "depth", - "evidence", - ]; - let mut out = headers.join("\t"); - out.push('\n'); - for row in rows { - let line = headers - .iter() - .map(|header| { - row.get(*header) - .cloned() - .unwrap_or_default() - .replace('\t', " ") - }) - .collect::>() - .join("\t"); - out.push_str(&line); - out.push('\n'); - } - out -} - -fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { - filters.iter().all(|filter| match filter.split_once('=') { - Some(("kind", value)) => value == "variant", - Some(("name", value)) => manifest.name.contains(value), - Some(("path", value)) => path.display().to_string().contains(value), - Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), - Some(_) | None => false, - }) -} - -fn resolve_manifest_path( - runtime_root: &Path, - manifest_path: &Path, - relative: &str, -) -> Result { - let base_dir = manifest_path - .parent() - .ok_or_else(|| format!("manifest has no parent: {}", manifest_path.display()))?; - let joined = base_dir.join(relative); - let canonical_root = runtime_root - .canonicalize() - .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; - let canonical_base = base_dir.canonicalize().map_err(|err| { - format!( - "failed to resolve manifest dir {}: {err}", - base_dir.display() - ) - })?; - let canonical_joined = joined - .canonicalize() - .map_err(|err| 
format!("failed to resolve {}: {err}", joined.display()))?; - let boundary = if canonical_base.starts_with(&canonical_root) { - &canonical_root - } else { - &canonical_base - }; - if !canonical_joined.starts_with(boundary) { - return Err(format!( - "manifest member path escapes bioscript root: {}", - canonical_joined.display() - )); - } - Ok(canonical_joined) -} - -fn manifest_schema(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read {}: {err}", path.display()))?; - let value: serde_yaml::Value = serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; - value - .as_mapping() - .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) - .ok_or_else(|| format!("{} is missing schema", path.display())) -} - -fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { - if let Some(path) = loader.input_index.take() { - loader.input_index = Some(resolve_cli_path_buf(root, &path)); - } - if let Some(path) = loader.reference_file.take() { - loader.reference_file = Some(resolve_cli_path_buf(root, &path)); - } - if let Some(path) = loader.reference_index.take() { - loader.reference_index = Some(resolve_cli_path_buf(root, &path)); - } -} +// Keep included source files small and named by responsibility. +// If a file approaches 500 lines, split it by domain behavior rather than +// creating arbitrary numbered chunks. 
+include!("cli_bootstrap.rs"); +include!("cli_commands.rs"); +include!("report_options.rs"); +include!("report_execution.rs"); +include!("report_observations.rs"); +include!("report_matching.rs"); +include!("report_output.rs"); +include!("report_html.rs"); +include!("manifest_runner.rs"); diff --git a/rust/bioscript-cli/src/manifest_runner.rs b/rust/bioscript-cli/src/manifest_runner.rs new file mode 100644 index 0000000..bc4e917 --- /dev/null +++ b/rust/bioscript-cli/src/manifest_runner.rs @@ -0,0 +1,424 @@ +struct ManifestRunOptions<'a> { + input_file: Option<&'a str>, + output_file: Option<&'a str>, + participant_id: Option<&'a str>, + trace_report: Option<&'a Path>, + loader: &'a GenotypeLoadOptions, + filters: &'a [String], +} + +fn run_manifest( + runtime_root: &Path, + manifest_path: &Path, + options: &ManifestRunOptions<'_>, +) -> Result<(), String> { + let schema = manifest_schema(manifest_path)?; + let resolved_input = options + .input_file + .map(|value| resolve_cli_path(runtime_root, value)); + let resolved_output = options + .output_file + .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); + let resolved_trace = options + .trace_report + .map(|value| resolve_cli_path_buf(runtime_root, value)); + match schema.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + let row = run_variant_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + )?; + write_manifest_outputs( + std::slice::from_ref(&row), + resolved_output.as_deref(), + resolved_trace.as_deref(), + )?; + Ok(()) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let rows = run_panel_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + options.filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + 
"bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + let rows = run_assay_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + options.filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_variant_manifest( + runtime_root: &Path, + manifest: &VariantManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, +) -> Result, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + Ok(variant_row( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )) +} + +fn run_panel_manifest( + runtime_root: &Path, + panel: &PanelManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let mut rows = Vec::new(); + + for member in &panel.members { + let Some(path) = &member.path else { + return Err("remote panel members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + if member.kind == "variant" { + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + rows.push(variant_row( + runtime_root, 
+ &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } else if member.kind == "assay" { + let assay = load_assay_manifest(&resolved)?; + rows.extend(run_assay_manifest_with_store( + runtime_root, + &assay, + &store, + participant_id, + filters, + )?); + } else { + return Err(format!( + "panel member kind '{}' is not executable", + member.kind + )); + } + } + + Ok(rows) +} + +fn run_assay_manifest( + runtime_root: &Path, + assay: &AssayManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + run_assay_manifest_with_store(runtime_root, assay, &store, participant_id, filters) +} + +fn run_assay_manifest_with_store( + runtime_root: &Path, + assay: &AssayManifest, + store: &GenotypeStore, + participant_id: Option<&str>, + filters: &[String], +) -> Result>, String> { + let mut rows = Vec::new(); + + for member in &assay.members { + if member.kind != "variant" { + return Err(format!( + "assay member kind '{}' is not executable", + member.kind + )); + } + let Some(path) = &member.path else { + return Err("remote assay members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &assay.path, path)?; + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + rows.push(variant_row( + runtime_root, + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } + + Ok(rows) +} + +fn variant_row( + runtime_root: &Path, + path: &Path, + name: &str, + tags: &[String], + observation: &bioscript_core::VariantObservation, + 
participant_id: Option<&str>, +) -> BTreeMap { + let mut row = BTreeMap::new(); + row.insert("kind".to_owned(), "variant".to_owned()); + row.insert("name".to_owned(), name.to_owned()); + row.insert( + "path".to_owned(), + path.strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string(), + ); + row.insert("tags".to_owned(), tags.join(",")); + row.insert("backend".to_owned(), observation.backend.clone()); + row.insert( + "participant_id".to_owned(), + participant_id.unwrap_or_default().to_owned(), + ); + row.insert( + "matched_rsid".to_owned(), + observation.matched_rsid.clone().unwrap_or_default(), + ); + row.insert( + "assembly".to_owned(), + observation + .assembly + .map(|value| match value { + bioscript_core::Assembly::Grch37 => "grch37".to_owned(), + bioscript_core::Assembly::Grch38 => "grch38".to_owned(), + }) + .unwrap_or_default(), + ); + row.insert( + "genotype".to_owned(), + observation.genotype.clone().unwrap_or_default(), + ); + row.insert( + "ref_count".to_owned(), + observation + .ref_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "alt_count".to_owned(), + observation + .alt_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "depth".to_owned(), + observation + .depth + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert("evidence".to_owned(), observation.evidence.join(" | ")); + row +} + +fn write_manifest_outputs( + rows: &[BTreeMap], + output_file: Option<&Path>, + trace_report: Option<&Path>, +) -> Result<(), String> { + let text = render_rows_as_tsv(rows); + if let Some(output_file) = output_file { + if let Some(parent) = output_file.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!("failed to create output dir {}: {err}", parent.display()) + })?; + } + fs::write(output_file, &text) + .map_err(|err| format!("failed to write output {}: {err}", output_file.display()))?; + } else { + print!("{text}"); + } + + if let Some(trace_report) 
= trace_report { + if let Some(parent) = trace_report.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create trace dir {}: {err}", parent.display()))?; + } + let mut trace = String::from("step\tline\tcode\n"); + for (idx, row) in rows.iter().enumerate() { + let _ = writeln!( + trace, + "{}\t{}\t{}", + idx + 1, + idx + 1, + row.get("path").cloned().unwrap_or_default() + ); + } + fs::write(trace_report, trace) + .map_err(|err| format!("failed to write trace {}: {err}", trace_report.display()))?; + } + + Ok(()) +} + +fn resolve_cli_path(root: &Path, value: &str) -> String { + resolve_cli_path_buf(root, Path::new(value)) + .display() + .to_string() +} + +fn resolve_cli_path_buf(root: &Path, value: &Path) -> PathBuf { + if value.is_absolute() { + value.to_path_buf() + } else { + root.join(value) + } +} + +fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { + let headers = [ + "kind", + "name", + "path", + "tags", + "participant_id", + "backend", + "matched_rsid", + "assembly", + "genotype", + "ref_count", + "alt_count", + "depth", + "evidence", + ]; + let mut out = headers.join("\t"); + out.push('\n'); + for row in rows { + let line = headers + .iter() + .map(|header| { + row.get(*header) + .cloned() + .unwrap_or_default() + .replace('\t', " ") + }) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + out +} + +fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { + filters.iter().all(|filter| match filter.split_once('=') { + Some(("kind", value)) => value == "variant", + Some(("name", value)) => manifest.name.contains(value), + Some(("path", value)) => path.display().to_string().contains(value), + Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), + Some(_) | None => false, + }) +} + +fn resolve_manifest_path( + runtime_root: &Path, + manifest_path: &Path, + relative: &str, +) -> Result { + let base_dir = manifest_path + .parent() + .ok_or_else(|| 
format!("manifest has no parent: {}", manifest_path.display()))?; + let joined = base_dir.join(relative); + let canonical_root = runtime_root + .canonicalize() + .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; + let canonical_base = base_dir.canonicalize().map_err(|err| { + format!( + "failed to resolve manifest dir {}: {err}", + base_dir.display() + ) + })?; + let canonical_joined = joined + .canonicalize() + .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; + let boundary = if canonical_base.starts_with(&canonical_root) { + &canonical_root + } else { + &canonical_base + }; + if !canonical_joined.starts_with(boundary) { + return Err(format!( + "manifest member path escapes bioscript root: {}", + canonical_joined.display() + )); + } + Ok(canonical_joined) +} + +fn manifest_schema(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; + value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{} is missing schema", path.display())) +} + +fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { + if let Some(path) = loader.input_index.take() { + loader.input_index = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_file.take() { + loader.reference_file = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_index.take() { + loader.reference_index = Some(resolve_cli_path_buf(root, &path)); + } +} diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs new file mode 100644 index 0000000..622ea5b --- /dev/null +++ 
b/rust/bioscript-cli/src/report_execution.rs @@ -0,0 +1,309 @@ +fn run_manifest_rows_for_report( + runtime_root: &Path, + manifest_path: &Path, + input_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_text = input_file.display().to_string(); + match manifest_schema(manifest_path)?.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + Ok(vec![run_variant_manifest( + runtime_root, + &manifest, + Some(&input_text), + Some(participant_id), + loader, + )?]) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + run_panel_manifest( + runtime_root, + &manifest, + Some(&input_text), + Some(participant_id), + loader, + filters, + ) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + run_assay_manifest( + runtime_root, + &manifest, + Some(&input_text), + Some(participant_id), + loader, + filters, + ) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_manifest_analyses_for_report( + runtime_root: &Path, + manifest_path: &Path, + input_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, + output_dir: &Path, +) -> Result, String> { + match manifest_schema(manifest_path)?.as_str() { + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let mut analyses = Vec::new(); + analyses.extend(run_interpretations_for_report( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.interpretations, + input_file, + participant_id, + loader, + output_dir, + )?); + for member in &manifest.members { + if member.kind != "assay" { + continue; + } + let Some(path) = &member.path else { + continue; + }; + let resolved = resolve_manifest_path(runtime_root, &manifest.path, path)?; + analyses.extend(run_manifest_analyses_for_report( + runtime_root, + &resolved, + input_file, + participant_id, + 
loader, + output_dir, + )?); + } + Ok(analyses) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + run_interpretations_for_report( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.interpretations, + input_file, + participant_id, + loader, + output_dir, + ) + } + "bioscript:variant:1.0" | "bioscript:variant" => Ok(Vec::new()), + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +#[allow(clippy::too_many_arguments)] +fn run_interpretations_for_report( + runtime_root: &Path, + manifest_path: &Path, + manifest_name: &str, + interpretations: &[PanelInterpretation], + input_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, + output_dir: &Path, +) -> Result, String> { + let mut outputs = Vec::new(); + for interpretation in interpretations { + if interpretation.kind != "bioscript" { + return Err(format!( + "analysis '{}' uses unsupported kind '{}'", + interpretation.id, interpretation.kind + )); + } + let script_path = resolve_manifest_path(runtime_root, manifest_path, &interpretation.path)?; + let format = interpretation + .output_format + .as_deref() + .unwrap_or("json") + .to_ascii_lowercase(); + let analysis_dir = output_dir.join("analysis").join(participant_id); + fs::create_dir_all(&analysis_dir).map_err(|err| { + format!( + "failed to create analysis output dir {}: {err}", + analysis_dir.display() + ) + })?; + let extension = match format.as_str() { + "tsv" => "tsv", + "json" => "json", + "jsonl" => "jsonl", + other => return Err(format!("unsupported analysis output_format '{other}'")), + }; + let output_file = analysis_dir.join(format!("{}.{}", interpretation.id, extension)); + run_bioscript_analysis_script( + runtime_root, + &script_path, + input_file, + &output_file, + participant_id, + loader, + )?; + let rows = parse_analysis_output(&output_file, &format)?; + outputs.push(serde_json::json!({ + "schema": "bioscript:analysis-output:1.0", + "version": "1.0", + 
"participant_id": participant_id, + "assay_id": manifest_name, + "analysis_id": interpretation.id, + "kind": interpretation.kind, + "output_format": format, + "manifest_path": manifest_path.strip_prefix(runtime_root).unwrap_or(manifest_path).display().to_string(), + "script_path": script_path.strip_prefix(runtime_root).unwrap_or(&script_path).display().to_string(), + "output_file": output_file.strip_prefix(runtime_root).unwrap_or(&output_file).display().to_string(), + "derived_from": interpretation.derived_from.clone(), + "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ + "key": emit.key.clone(), + "label": emit.label.clone(), + "value_type": emit.value_type.clone(), + "format": emit.format.clone(), + })).collect::>(), + "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ + "description": logic.description.clone(), + "source": logic.source.as_ref().map(|source| serde_json::json!({ + "name": source.name.clone(), + "url": source.url.clone(), + })), + })), + "rows": rows, + })); + } + Ok(outputs) +} + +fn run_bioscript_analysis_script( + runtime_root: &Path, + script_path: &Path, + input_file: &Path, + output_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, +) -> Result<(), String> { + let limits = ResourceLimits::new() + .max_duration(Duration::from_millis(1000)) + .max_memory(16 * 1024 * 1024) + .max_allocations(400_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + let runtime = BioscriptRuntime::with_config( + runtime_root.to_path_buf(), + RuntimeConfig { + limits, + loader: loader.clone(), + }, + ) + .map_err(|err| err.to_string())?; + runtime + .run_file( + script_path, + None, + vec![ + ( + "input_file", + monty::MontyObject::String(runtime_path_string(runtime_root, input_file)), + ), + ( + "output_file", + monty::MontyObject::String(runtime_path_string(runtime_root, output_file)), + ), + ( + "participant_id", + monty::MontyObject::String(participant_id.to_owned()), + ), + ], + ) + .map(|_| 
()) + .map_err(|err| err.to_string()) +} + +fn runtime_path_string(runtime_root: &Path, path: &Path) -> String { + path.strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string() +} + +fn parse_analysis_output(path: &Path, format: &str) -> Result, String> { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read analysis output {}: {err}", path.display()))?; + match format { + "tsv" => Ok(parse_analysis_tsv(&text)), + "json" => { + let value: serde_json::Value = serde_json::from_str(&text).map_err(|err| { + format!("failed to parse analysis JSON {}: {err}", path.display()) + })?; + Ok(match value { + serde_json::Value::Array(rows) => rows, + serde_json::Value::Object(mut object) => object + .remove("rows") + .and_then(|rows| rows.as_array().cloned()) + .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), + other => vec![other], + }) + } + "jsonl" => text + .lines() + .filter(|line| !line.trim().is_empty()) + .map(|line| serde_json::from_str(line).map_err(|err| err.to_string())) + .collect(), + other => Err(format!("unsupported analysis output_format '{other}'")), + } +} + +fn parse_analysis_tsv(text: &str) -> Vec { + let mut lines = text.lines().filter(|line| !line.trim().is_empty()); + let Some(header_line) = lines.next() else { + return Vec::new(); + }; + let headers: Vec<&str> = header_line.split('\t').collect(); + let mut rows = Vec::new(); + for line in lines { + let values: Vec<&str> = line.split('\t').collect(); + let mut object = serde_json::Map::new(); + for (idx, header) in headers.iter().enumerate() { + object.insert( + (*header).to_owned(), + serde_json::Value::String(values.get(idx).copied().unwrap_or_default().to_owned()), + ); + } + rows.push(serde_json::Value::Object(object)); + } + rows +} + +fn app_assay_id(path: &Path) -> Result { + match manifest_schema(path)?.as_str() { + "bioscript:panel:1.0" => Ok(load_panel_manifest(path)?.name), + "bioscript:assay:1.0" => Ok(load_assay_manifest(path)?.name), + 
"bioscript:variant:1.0" | "bioscript:variant" => Ok(load_variant_manifest(path)?.name), + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn participant_id_from_path(path: &Path) -> String { + let file_name = path + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or("participant"); + file_name + .trim_end_matches(".txt.zip") + .trim_end_matches(".csv.zip") + .trim_end_matches(".vcf.gz") + .trim_end_matches(".cram") + .trim_end_matches(".zip") + .trim_end_matches(".txt") + .trim_end_matches(".csv") + .to_owned() +} + diff --git a/rust/bioscript-cli/src/report_html.rs b/rust/bioscript-cli/src/report_html.rs new file mode 100644 index 0000000..caed479 --- /dev/null +++ b/rust/bioscript-cli/src/report_html.rs @@ -0,0 +1,3 @@ +include!("report_html_sections.rs"); +include!("report_html_pgx.rs"); +include!("report_html_helpers.rs"); diff --git a/rust/bioscript-cli/src/report_html_helpers.rs b/rust/bioscript-cli/src/report_html_helpers.rs new file mode 100644 index 0000000..d43ef46 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_helpers.rs @@ -0,0 +1,98 @@ +fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { + let escaped_id = html_escape(table_id); + let refs_control = if table_id == "observations-table" { + "" + } else { + "" + }; + let _ = write!( + out, + "
{refs_control}
" + ); + for (index, header) in headers.iter().enumerate() { + let _ = write!( + out, + "", + escaped_id, + index, + html_escape(header) + ); + } + out.push_str(""); +} + +fn render_table_end(out: &mut String) { + out.push_str("
{}
"); +} + +fn table_cell(out: &mut String, value: &str) { + class_cell(out, value, ""); +} + +fn class_cell(out: &mut String, value: &str, class_name: &str) { + if class_name.is_empty() { + let _ = write!(out, "{}", html_escape(value)); + } else { + let _ = write!( + out, + "{}", + class_name, + html_escape(value) + ); + } +} + +fn link_cell(out: &mut String, url: &str) { + if url.is_empty() { + out.push_str(""); + } else { + let escaped = html_escape(url); + let _ = write!( + out, + "source" + ); + } +} + +fn value_str<'a>(value: &'a serde_json::Value, key: &str) -> &'a str { + value + .get(key) + .and_then(serde_json::Value::as_str) + .unwrap_or_default() +} + +fn join_string_array(value: Option<&serde_json::Value>) -> String { + value + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(serde_json::Value::as_str) + .collect::>() + .join(", ") + }) + .unwrap_or_default() +} + +fn join_drugs(finding: &serde_json::Value) -> String { + finding + .get("drugs") + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(|drug| drug.get("name").and_then(serde_json::Value::as_str)) + .collect::>() + .join(", ") + }) + .unwrap_or_default() +} + +fn html_escape(value: &str) -> String { + value + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) +} + diff --git a/rust/bioscript-cli/src/report_html_pgx.rs b/rust/bioscript-cli/src/report_html_pgx.rs new file mode 100644 index 0000000..bbf0065 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_pgx.rs @@ -0,0 +1,218 @@ +fn render_pgx_label_table(out: &mut String, findings: &[serde_json::Value]) { + let headers = [ + "Variant", + "Ref/Alt", + "Genes", + "Drug(s)", + "Regulator", + "Action", + "Label", + "Evidence", + ]; + render_pgx_label_filters(out); + render_table_start(out, "labels-table", &headers); + for finding in findings { + let evidence = finding.get("evidence"); + let url = evidence + .and_then(|value| 
value.get("url")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let pgx_level = value_str(finding, "pgx_action_level"); + let _ = write!( + out, + "", + html_escape(&pgx_level_slug(pgx_level)) + ); + table_cell(out, value_str(finding, "variant")); + class_cell(out, &matched_ref_alt(finding), "mono"); + table_cell(out, &join_string_array(finding.get("genes"))); + table_cell(out, &join_drugs(finding)); + table_cell(out, &join_string_array(finding.get("regulatory_sources"))); + pgx_level_cell(out, pgx_level); + table_cell(out, value_str(finding, "label")); + link_cell(out, url); + out.push_str(""); + } + render_table_end(out); +} + +fn render_pgx_summary_table(out: &mut String, findings: &[serde_json::Value]) { + let headers = [ + "Variant", + "Ref/Alt", + "Genotype", + "Drug(s)", + "Category", + "Level", + "Phenotype", + "Effect", + "Evidence", + ]; + render_evidence_level_filters(out); + render_table_start(out, "summaries-table", &headers); + for finding in findings { + let effect = finding + .get("matched_effect") + .unwrap_or(&serde_json::Value::Null); + let evidence = finding.get("evidence"); + let url = evidence + .and_then(|value| value.get("url")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let evidence_level = value_str(finding, "evidence_level"); + let _ = write!( + out, + "", + html_escape(&evidence_level_group(evidence_level)) + ); + table_cell(out, value_str(finding, "variant")); + class_cell(out, &matched_ref_alt(finding), "mono"); + table_cell(out, value_str(effect, "label")); + table_cell(out, &join_drugs(finding)); + table_cell(out, &join_string_array(finding.get("phenotype_categories"))); + evidence_level_cell(out, evidence_level); + table_cell(out, &join_string_array(finding.get("phenotypes"))); + class_cell(out, value_str(effect, "text"), "effect"); + link_cell(out, url); + out.push_str(""); + } + render_table_end(out); +} + +fn render_evidence_level_filters(out: &mut String) { + out.push_str("
Evidence:"); + for (level, label) in [ + ("1", "Level 1"), + ("1a", "Level 1A"), + ("1b", "Level 1B"), + ("2", "Level 2"), + ("2a", "Level 2A"), + ("2b", "Level 2B"), + ("3", "Level 3"), + ("4", "Level 4"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str(""); + out.push_str("i
"); +} + +fn render_pgx_label_filters(out: &mut String) { + out.push_str("
PGx level:"); + for (level, label) in [ + ("required", "Testing Required"), + ("recommended", "Testing Recommended"), + ("actionable", "Actionable PGx"), + ("informative", "Informative PGx"), + ("no-clinical", "No Clinical PGx"), + ("criteria", "Criteria Not Met"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str(""); + out.push_str("i
"); +} + +fn matched_ref_alt(finding: &serde_json::Value) -> String { + let Some(observation) = finding.get("matched_observation") else { + return String::new(); + }; + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if ref_allele.is_empty() && alt_allele.is_empty() { + String::new() + } else { + let alt_display = alt_allele.replace(',', "/"); + format!("{ref_allele}->{alt_display}") + } +} + +fn evidence_level_group(level: &str) -> String { + let normalized = level.trim().to_ascii_lowercase(); + if normalized.starts_with("1a") { + "1a".to_owned() + } else if normalized.starts_with("1b") { + "1b".to_owned() + } else if normalized.starts_with('1') { + "1".to_owned() + } else if normalized.starts_with("2a") { + "2a".to_owned() + } else if normalized.starts_with("2b") { + "2b".to_owned() + } else if normalized.starts_with('2') { + "2".to_owned() + } else if normalized.starts_with('3') { + "3".to_owned() + } else if normalized.starts_with('4') { + "4".to_owned() + } else { + "unknown".to_owned() + } +} + +fn evidence_level_color_group(level: &str) -> String { + level + .chars() + .find(char::is_ascii_digit) + .map_or_else(|| "unknown".to_owned(), |ch| ch.to_string()) +} + +fn evidence_level_cell(out: &mut String, level: &str) { + if level.is_empty() { + out.push_str(""); + return; + } + let group = evidence_level_color_group(level); + let _ = write!( + out, + "{}", + html_escape(&group), + html_escape(level) + ); +} + +fn pgx_level_slug(level: &str) -> String { + let normalized = level.to_ascii_lowercase(); + if normalized.contains("required") { + "required".to_owned() + } else if normalized.contains("recommended") { + "recommended".to_owned() + } else if normalized.contains("actionable") { + "actionable".to_owned() + } else if normalized.contains("informative") { + "informative".to_owned() + } else if 
normalized.contains("no clinical") { + "no-clinical".to_owned() + } else if normalized.contains("criteria") { + "criteria".to_owned() + } else { + "unknown".to_owned() + } +} + +fn pgx_level_cell(out: &mut String, level: &str) { + if level.is_empty() { + out.push_str(""); + return; + } + let slug = pgx_level_slug(level); + let _ = write!( + out, + "{}", + html_escape(&slug), + html_escape(level) + ); +} + diff --git a/rust/bioscript-cli/src/report_html_sections.rs b/rust/bioscript-cli/src/report_html_sections.rs new file mode 100644 index 0000000..61ce304 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_sections.rs @@ -0,0 +1,268 @@ +fn collect_report_analyses(reports: &[serde_json::Value]) -> Vec { + reports + .iter() + .filter_map(|report| report.get("analyses").and_then(serde_json::Value::as_array)) + .flat_map(|analyses| analyses.iter()) + .cloned() + .collect() +} + +fn collect_report_findings(reports: &[serde_json::Value], schema: &str) -> Vec { + reports + .iter() + .filter_map(|report| report.get("findings").and_then(serde_json::Value::as_array)) + .flat_map(|findings| findings.iter()) + .filter(|finding| finding.get("schema").and_then(serde_json::Value::as_str) == Some(schema)) + .cloned() + .collect() +} + +fn render_analysis_tables(out: &mut String, analyses: &[serde_json::Value]) { + if analyses.is_empty() { + out.push_str("

No analysis outputs.

"); + return; + } + for (index, analysis) in analyses.iter().enumerate() { + let table_id = format!("analysis-table-{index}"); + let title = format!( + "{} / {}", + value_str(analysis, "participant_id"), + value_str(analysis, "analysis_id") + ); + let _ = write!(out, "

{}

", html_escape(&title)); + render_analysis_logic(out, analysis); + let rows = analysis + .get("rows") + .and_then(serde_json::Value::as_array) + .cloned() + .unwrap_or_default(); + if rows.is_empty() { + out.push_str("

No rows emitted.

"); + continue; + } + let headers = analysis_row_headers(&rows); + let header_refs = headers.iter().map(String::as_str).collect::>(); + render_table_start(out, &table_id, &header_refs); + for row in rows { + out.push_str(""); + for header in &headers { + table_cell(out, &json_field_as_tsv(row.get(header))); + } + out.push_str(""); + } + render_table_end(out); + } +} + +fn analysis_row_headers(rows: &[serde_json::Value]) -> Vec { + let mut headers = Vec::new(); + for row in rows { + let Some(object) = row.as_object() else { + continue; + }; + for key in object.keys() { + if !headers.contains(key) { + headers.push(key.clone()); + } + } + } + headers +} + +fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { + let Some(logic) = analysis.get("logic") else { + return; + }; + if logic.is_null() { + return; + } + let description = logic + .get("description") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let source = logic.get("source").unwrap_or(&serde_json::Value::Null); + let source_name = source + .get("name") + .and_then(serde_json::Value::as_str) + .unwrap_or("source"); + let source_url = source + .get("url") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + out.push_str("
"); + if !description.is_empty() { + let _ = write!(out, "

{}

", html_escape(description)); + } + if !source_url.is_empty() { + let _ = write!( + out, + "

Logic source: {}

", + html_escape(source_url), + html_escape(source_name) + ); + } + out.push_str("
"); +} + +fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { + let mut links = BTreeMap::::new(); + for report in reports { + collect_provenance_links_from_value(report, &mut links); + } + if links.is_empty() { + out.push_str("

No provenance links.

"); + return; + } + out.push_str("
    "); + for (url, label) in links { + let display = if label.is_empty() { &url } else { &label }; + let _ = write!( + out, + "
  • {}
    {}
  • ", + html_escape(&url), + html_escape(display), + html_escape(&url) + ); + } + out.push_str("
"); +} + +fn collect_provenance_links_from_value( + value: &serde_json::Value, + links: &mut BTreeMap, +) { + match value { + serde_json::Value::Object(object) => { + if let Some(url) = object.get("url").and_then(serde_json::Value::as_str) + && url.starts_with("http") + { + let label = object + .get("name") + .or_else(|| object.get("label")) + .or_else(|| object.get("source")) + .and_then(value_as_string) + .unwrap_or_default(); + links.entry(url.to_owned()).or_insert(label); + } + for child in object.values() { + collect_provenance_links_from_value(child, links); + } + } + serde_json::Value::Array(items) => { + for item in items { + collect_provenance_links_from_value(item, links); + } + } + _ => {} + } +} + +fn render_observation_table(out: &mut String, observations: &[serde_json::Value]) { + let headers = [ + "participant_id", + "rsid", + "ref", + "alt", + "genotype_display", + "genotype", + "zygosity", + "outcome", + "match_status", + "coverage_status", + "call_status", + "assembly", + "chrom", + "pos_start", + "pos_end", + "kind", + "ref_count", + "alt_count", + "depth", + "genotype_quality", + "allele_balance", + "evidence_type", + "evidence_raw", + "facets", + "assay_id", + "assay_version", + "variant_key", + ]; + render_table_start(out, "observations-table", &headers); + for observation in observations { + let _ = write!(out, "", observation_row_class(observation)); + for header in headers { + render_observation_cell(out, observation, header); + } + out.push_str(""); + } + out.push_str(""); +} + +fn observation_row_class(observation: &serde_json::Value) -> &'static str { + match observation + .get("outcome") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "variant" => "row-variant", + "reference" => "row-reference", + _ => "", + } +} + +fn render_observation_cell(out: &mut String, observation: &serde_json::Value, header: &str) { + if header == "genotype_display" { + let outcome = observation + .get("outcome") + 
.and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let value = json_field_as_tsv(observation.get(header)); + if outcome == "variant" { + let alt = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let _ = write!( + out, + "{}", + highlight_allele(&value, alt) + ); + return; + } + } + let _ = write!( + out, + "{}", + html_escape(&json_field_as_tsv(observation.get(header))) + ); +} + +fn highlight_allele(value: &str, allele: &str) -> String { + if value.is_empty() || allele.is_empty() { + return html_escape(value); + } + if allele.chars().count() == 1 { + let target = allele + .chars() + .next() + .unwrap_or_default() + .to_ascii_uppercase(); + let mut out = String::new(); + for ch in value.chars() { + let escaped = html_escape(&ch.to_string()); + if ch.to_ascii_uppercase() == target { + let _ = write!(out, "{escaped}"); + } else { + out.push_str(&escaped); + } + } + return out; + } + let escaped_value = html_escape(value); + let escaped_allele = html_escape(allele); + escaped_value.replace( + &escaped_allele, + &format!("{escaped_allele}"), + ) +} + diff --git a/rust/bioscript-cli/src/report_matching.rs b/rust/bioscript-cli/src/report_matching.rs new file mode 100644 index 0000000..76f4f93 --- /dev/null +++ b/rust/bioscript-cli/src/report_matching.rs @@ -0,0 +1,292 @@ +fn app_finding_match_observation<'a>( + finding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let binding = finding.get("binding")?; + match binding.get("source").and_then(serde_json::Value::as_str) { + Some("variant") => app_variant_binding_match_observation(binding, observations), + _ => None, + } +} + +fn app_finding_match_analysis( + finding: &serde_json::Value, + analyses: &[serde_json::Value], +) -> Option { + let binding = finding.get("binding")?; + if binding.get("source").and_then(serde_json::Value::as_str) != Some("analysis") { + return None; + } + let analysis_id = binding + 
.get("analysis_id") + .or_else(|| binding.get("analysis")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let key = binding.get("key").and_then(serde_json::Value::as_str)?; + for analysis in analyses { + if !analysis_id.is_empty() + && analysis + .get("analysis_id") + .and_then(serde_json::Value::as_str) + != Some(analysis_id) + { + continue; + } + let Some(rows) = analysis.get("rows").and_then(serde_json::Value::as_array) else { + continue; + }; + for row in rows { + if app_binding_matches_value(row.get(key), binding) { + return Some(serde_json::json!({ + "participant_id": analysis.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "assay_id": analysis.get("assay_id").cloned().unwrap_or(serde_json::Value::Null), + "analysis_id": analysis.get("analysis_id").cloned().unwrap_or(serde_json::Value::Null), + "key": key, + "value": row.get(key).cloned().unwrap_or(serde_json::Value::Null), + "row": row, + })); + } + } + } + None +} + +fn app_variant_binding_match_observation<'a>( + binding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let operator = binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or("equals"); + if matches!(operator, "dosage_equals" | "dosage_in") { + let allele = binding + .get("allele") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + return observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .find(|observation| { + let dosage = app_observation_allele_dosage(observation, allele); + app_binding_matches_dosage(dosage, binding) + }); + } + + let key = binding + .get("key") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if key.is_empty() { + return None; + } + observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .find(|observation| app_binding_matches_value(observation.get(key), binding)) +} + +fn 
app_finding_observation_context(observation: &serde_json::Value) -> serde_json::Value { + serde_json::json!({ + "participant_id": observation.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "rsid": observation.get("rsid").cloned().unwrap_or(serde_json::Value::Null), + "ref": observation.get("ref").cloned().unwrap_or(serde_json::Value::Null), + "alt": observation.get("alt").cloned().unwrap_or(serde_json::Value::Null), + "genotype_display": observation.get("genotype_display").cloned().unwrap_or(serde_json::Value::Null), + "outcome": observation.get("outcome").cloned().unwrap_or(serde_json::Value::Null), + }) +} + +fn app_variant_ref_mismatch(binding: &serde_json::Value, observation: &serde_json::Value) -> bool { + let variant_ref = binding + .get("variant") + .or_else(|| binding.get("path")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if variant_ref.is_empty() { + return false; + } + let basename = Path::new(variant_ref) + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or(variant_ref); + let candidates = [ + observation + .get("variant_key") + .and_then(serde_json::Value::as_str), + observation + .get("variant_path") + .and_then(serde_json::Value::as_str), + observation.get("rsid").and_then(serde_json::Value::as_str), + ]; + !candidates.into_iter().flatten().any(|candidate| { + candidate == variant_ref + || Path::new(candidate) + .file_name() + .and_then(|value| value.to_str()) + .is_some_and(|value| value == basename) + }) +} + +fn app_observation_allele_dosage(observation: &serde_json::Value, allele: &str) -> Option { + if allele.is_empty() { + return None; + } + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let zygosity = observation + .get("zygosity") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele == ref_allele { + 
return match zygosity { + "hom_ref" => Some(2), + "het" => Some(1), + "hom_alt" => Some(0), + _ => None, + }; + } + if allele == alt_allele { + return match zygosity { + "hom_ref" => Some(0), + "het" => Some(1), + "hom_alt" => Some(2), + _ => None, + }; + } + let display = observation + .get("genotype_display") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele.len() == 1 { + let allele_ch = allele.chars().next()?.to_ascii_uppercase(); + return display + .chars() + .filter(|ch| ch.to_ascii_uppercase() == allele_ch) + .count() + .try_into() + .ok(); + } + None +} + +fn app_binding_matches_value( + actual: Option<&serde_json::Value>, + binding: &serde_json::Value, +) -> bool { + let actual = actual.and_then(value_as_string).unwrap_or_default(); + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or("equals") + { + "equals" => binding + .get("value") + .and_then(value_as_string) + .is_some_and(|value| value == actual), + "in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(value_as_string) + .any(|value| value == actual) + }), + _ => false, + } +} + +fn app_binding_matches_dosage(dosage: Option, binding: &serde_json::Value) -> bool { + let Some(dosage) = dosage else { + return false; + }; + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "dosage_equals" => binding + .get("value") + .and_then(serde_json::Value::as_i64) + .is_some_and(|value| value == dosage), + "dosage_in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(serde_json::Value::as_i64) + .any(|value| value == dosage) + }), + _ => false, + } +} + +fn value_as_string(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::String(value) => Some(value.clone()), + serde_json::Value::Number(value) => Some(value.to_string()), + 
serde_json::Value::Bool(value) => Some(value.to_string()), + _ => None, + } +} + +fn app_finding_dedupe_key(finding: &serde_json::Value) -> String { + let effect_key = finding + .get("matched_effect") + .and_then(|effect| { + effect + .get("id") + .or_else(|| effect.get("label")) + .or_else(|| effect.get("text")) + }) + .and_then(value_as_string) + .unwrap_or_default(); + if let Some(evidence) = finding.get("evidence") { + let source = evidence + .get("source") + .and_then(value_as_string) + .unwrap_or_default(); + let kind = evidence + .get("kind") + .and_then(value_as_string) + .unwrap_or_default(); + let id = evidence + .get("id") + .and_then(value_as_string) + .unwrap_or_default(); + if !source.is_empty() || !kind.is_empty() || !id.is_empty() { + return format!("evidence|{source}|{kind}|{id}|{effect_key}"); + } + if let Some(url) = evidence.get("url").and_then(value_as_string) { + return format!("evidence_url|{url}|{effect_key}"); + } + } + if let Some(id) = finding.get("id").and_then(value_as_string) { + return format!("id|{id}|{effect_key}"); + } + format!( + "content|{}|{}|{}|{}", + finding + .get("schema") + .and_then(value_as_string) + .unwrap_or_default(), + finding + .get("label") + .and_then(value_as_string) + .unwrap_or_default(), + finding + .get("notes") + .and_then(value_as_string) + .unwrap_or_default(), + effect_key + ) +} + diff --git a/rust/bioscript-cli/src/report_observations.rs b/rust/bioscript-cli/src/report_observations.rs new file mode 100644 index 0000000..73ddd46 --- /dev/null +++ b/rust/bioscript-cli/src/report_observations.rs @@ -0,0 +1,391 @@ +fn app_observation_from_manifest_row( + runtime_root: &Path, + row: &BTreeMap, + assay_id: &str, +) -> Result { + let row_path = row.get("path").cloned().unwrap_or_default(); + let manifest_path = if Path::new(&row_path).is_absolute() { + PathBuf::from(&row_path) + } else { + runtime_root.join(&row_path) + }; + let manifest = load_variant_manifest(&manifest_path)?; + let ref_allele = 
manifest.spec.reference.clone().unwrap_or_default(); + let genotype_display = row.get("genotype").cloned().unwrap_or_default(); + let alt_alleles = variant_alt_alleles(&manifest_path)?; + let alt_allele = observed_alt_allele(&genotype_display, &ref_allele, &alt_alleles) + .or_else(|| manifest.spec.alternate.clone()) + .unwrap_or_default(); + let (genotype, zygosity) = normalize_app_genotype(&genotype_display, &ref_allele, &alt_allele); + let depth = parse_optional_u32(row.get("depth")); + let ref_count = parse_optional_u32(row.get("ref_count")); + let alt_count = parse_optional_u32(row.get("alt_count")); + let allele_balance = match (alt_count, depth) { + (Some(alt_count), Some(depth)) if depth > 0 => { + Some(f64::from(alt_count) / f64::from(depth)) + } + _ => None, + }; + let assembly = row.get("assembly").cloned().unwrap_or_default(); + let locus = if assembly.eq_ignore_ascii_case("grch37") { + manifest.spec.grch37.as_ref() + } else { + manifest + .spec + .grch38 + .as_ref() + .or(manifest.spec.grch37.as_ref()) + }; + let outcome = if genotype == "./." 
{ + "no_call" + } else if zygosity == "hom_ref" { + "reference" + } else if zygosity == "het" || zygosity == "hom_alt" { + "variant" + } else { + "unknown" + }; + let evidence_raw = row.get("evidence").cloned().unwrap_or_default(); + Ok(serde_json::json!({ + "participant_id": row.get("participant_id").cloned().unwrap_or_default(), + "assay_id": assay_id, + "assay_version": "1.0", + "variant_key": manifest.name, + "variant_path": row_path, + "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), + "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, + "chrom": locus.map_or(String::new(), |locus| locus.chrom.clone()), + "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), + "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), + "ref": ref_allele, + "alt": alt_allele, + "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), + "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, + "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), + "call_status": if genotype == "./." 
{ "no_call" } else { "called" }, + "genotype": genotype, + "genotype_display": genotype_display, + "zygosity": zygosity, + "ref_count": ref_count, + "alt_count": alt_count, + "depth": depth, + "genotype_quality": serde_json::Value::Null, + "allele_balance": allele_balance, + "outcome": outcome, + "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, + "evidence_raw": evidence_raw, + "facets": serde_json::Value::Null, + })) +} + +fn variant_alt_alleles(path: &Path) -> Result, String> { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read variant YAML {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse variant YAML {}: {err}", path.display()))?; + let Some(items) = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("alleles".to_owned()))) + .and_then(serde_yaml::Value::as_mapping) + .and_then(|mapping| { + mapping + .get(serde_yaml::Value::String("observed_alts".to_owned())) + .or_else(|| mapping.get(serde_yaml::Value::String("alts".to_owned()))) + }) + .and_then(serde_yaml::Value::as_sequence) + else { + return Ok(Vec::new()); + }; + Ok(items + .iter() + .filter_map(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .collect()) +} + +fn observed_alt_allele( + genotype_display: &str, + ref_allele: &str, + alts: &[String], +) -> Option { + if ref_allele.len() != 1 { + return None; + } + let ref_ch = ref_allele.chars().next()?; + genotype_display + .chars() + .filter(|ch| ch.is_ascii_alphabetic() && *ch != ref_ch) + .find_map(|ch| { + alts.iter() + .find(|alt| alt.len() == 1 && alt.starts_with(ch)) + .cloned() + }) +} + +fn normalize_app_genotype(display: &str, ref_allele: &str, alt_allele: &str) -> (String, String) { + if display.is_empty() { + return ("./.".to_owned(), "unknown".to_owned()); + } + let alleles: Vec = 
display.chars().filter(char::is_ascii_alphabetic).collect(); + if alleles.len() != 2 || ref_allele.len() != 1 || alt_allele.len() != 1 { + return (display.to_owned(), "unknown".to_owned()); + } + let ref_ch = ref_allele.chars().next().unwrap_or_default(); + let alt_ch = alt_allele.chars().next().unwrap_or_default(); + let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); + let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); + match (ref_count, alt_count) { + (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), + (1, 1) => ("0/1".to_owned(), "het".to_owned()), + (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), + _ => (display.to_owned(), "unknown".to_owned()), + } +} + +fn parse_optional_u32(value: Option<&String>) -> Option { + value.and_then(|value| value.parse::().ok()) +} + +fn load_manifest_findings( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut findings = Vec::new(); + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) && let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let include = json_item + .get("include") + .and_then(serde_json::Value::as_str) + .map(str::to_owned); + if let Some(include) = include { + let include_path = resolve_manifest_path(root, manifest_path, &include)?; + let mut included = load_manifest_findings(root, &include_path)?; + let inherited_binding = json_item.get("binding").cloned(); + for included_item in &mut included { + if inherited_binding.is_some() + && included_item.get("binding").is_none() + && included_item.get("effects").is_none() + && let Some(object) = included_item.as_object_mut() + { 
+ object.insert( + "binding".to_owned(), + inherited_binding.clone().unwrap_or(serde_json::Value::Null), + ); + } + } + findings.extend(included); + continue; + } + if json_item.get("include").is_none() { + findings.push(json_item); + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") + && let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + findings.extend(load_manifest_findings(root, &member_path)?); + } + } + + Ok(findings) +} + +fn load_yaml_value(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + +fn yaml_to_json(value: serde_yaml::Value) -> Result { + serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) +} + +fn load_manifest_provenance_links( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut links = BTreeMap::::new(); + collect_manifest_provenance_entries(&value, &mut links)?; + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) && let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let Some(include) = 
json_item.get("include").and_then(serde_json::Value::as_str) else { + continue; + }; + let include_path = resolve_manifest_path(root, manifest_path, include)?; + for item in load_manifest_provenance_links(root, &include_path)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") + && let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + for item in load_manifest_provenance_links(root, &member_path)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + + Ok(links.into_values().collect()) +} + +fn collect_manifest_provenance_entries( + value: &serde_yaml::Value, + links: &mut BTreeMap, +) -> Result<(), String> { + if let Some(sources) = value + .get("provenance") + .and_then(|provenance| provenance.get("sources")) + .and_then(serde_yaml::Value::as_sequence) + { + for source in sources { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + } + if let Some(source) = value.get("source") { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + Ok(()) +} + +fn match_app_findings( + findings: &[serde_json::Value], + observations: &[serde_json::Value], + analyses: &[serde_json::Value], +) -> Vec { + let mut 
matched = Vec::new(); + let mut seen = std::collections::BTreeSet::new(); + for finding in findings { + if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { + for effect in effects { + if let Some(observation) = app_finding_match_observation(effect, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + } else if let Some(observation) = app_finding_match_observation(finding, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = 
app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + matched +} + diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs new file mode 100644 index 0000000..b988a08 --- /dev/null +++ b/rust/bioscript-cli/src/report_options.rs @@ -0,0 +1,225 @@ +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AppOutputFormat { + Tsv, + Json, + Jsonl, + Both, +} + +struct AppReportOptions { + manifest_path: PathBuf, + input_files: Vec, + output_dir: PathBuf, + root: PathBuf, + html: bool, + observations_format: AppOutputFormat, + reports_format: AppOutputFormat, + loader: GenotypeLoadOptions, + filters: Vec, +} + +fn run_app_report(args: Vec) -> Result<(), String> { + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let mut manifest_path: Option = None; + let mut input_files: Vec = Vec::new(); + let mut output_dir: Option = None; + let mut root: Option = None; + let mut html = false; + let mut observations_format = AppOutputFormat::Tsv; + let mut reports_format = AppOutputFormat::Jsonl; + let mut filters = Vec::new(); + let mut loader = GenotypeLoadOptions::default(); + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--input-file" => input_files.push(PathBuf::from( + iter.next().ok_or("--input-file requires a path")?, + )), + "--output-dir" => { + output_dir = Some(PathBuf::from( + iter.next().ok_or("--output-dir requires a path")?, + )); + } + "--root" => { + root = Some(PathBuf::from( + iter.next().ok_or("--root requires a directory")?, + )); + } + "--html" => html = true, + "--filter" => filters.push(iter.next().ok_or("--filter requires key=value")?), + "--observations-format" => { + observations_format = parse_app_output_format( + &iter + .next() + .ok_or("--observations-format requires a value")?, + )?; + } + "--reports-format" => { + reports_format = parse_app_output_format( + 
&iter.next().ok_or("--reports-format requires a value")?, + )?; + } + "--input-format" => { + let value = iter.next().ok_or("--input-format requires a value")?; + if value.eq_ignore_ascii_case("auto") { + loader.format = None; + } else { + loader.format = + Some(value.parse::().map_err(|err| { + format!("invalid --input-format value {value}: {err}") + })?); + } + } + "--input-index" => { + loader.input_index = Some(PathBuf::from( + iter.next().ok_or("--input-index requires a path")?, + )); + } + "--reference-file" => { + loader.reference_file = Some(PathBuf::from( + iter.next().ok_or("--reference-file requires a path")?, + )); + } + "--reference-index" => { + loader.reference_index = Some(PathBuf::from( + iter.next().ok_or("--reference-index requires a path")?, + )); + } + value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), + value => { + if manifest_path.is_none() { + manifest_path = Some(PathBuf::from(value)); + } else { + input_files.push(PathBuf::from(value)); + } + } + } + } + + let Some(manifest_path) = manifest_path else { + return Err("usage: bioscript report --input-file [--input-file ...] 
--output-dir [--html]".to_owned()); + }; + if input_files.is_empty() { + return Err("bioscript report requires at least one --input-file".to_owned()); + } + let output_dir = output_dir.ok_or("bioscript report requires --output-dir")?; + let root = root.unwrap_or(cwd); + normalize_loader_paths(&root, &mut loader); + + let options = AppReportOptions { + manifest_path: absolutize(&root, &manifest_path), + input_files: input_files + .iter() + .map(|path| absolutize(&root, path)) + .collect(), + output_dir: absolutize(&root, &output_dir), + root, + html, + observations_format, + reports_format, + loader, + filters, + }; + generate_app_report(&options) +} + +fn parse_app_output_format(value: &str) -> Result { + match value { + "tsv" => Ok(AppOutputFormat::Tsv), + "json" => Ok(AppOutputFormat::Json), + "jsonl" => Ok(AppOutputFormat::Jsonl), + "both" => Ok(AppOutputFormat::Both), + other => Err(format!( + "unsupported output format '{other}'; expected tsv, json, jsonl, or both" + )), + } +} + +fn absolutize(root: &Path, path: &Path) -> PathBuf { + if path.is_absolute() { + path.to_path_buf() + } else { + root.join(path) + } +} + +fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { + fs::create_dir_all(&options.output_dir).map_err(|err| { + format!( + "failed to create output dir {}: {err}", + options.output_dir.display() + ) + })?; + + let assay_id = app_assay_id(&options.manifest_path)?; + let findings = load_manifest_findings(&options.root, &options.manifest_path)?; + let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; + let mut observations = Vec::new(); + let mut analyses = Vec::new(); + let mut reports = Vec::new(); + + for input_file in &options.input_files { + let participant_id = participant_id_from_path(input_file); + let rows = run_manifest_rows_for_report( + &options.root, + &options.manifest_path, + input_file, + &participant_id, + &options.loader, + &options.filters, + )?; + let input_observations = 
rows + .iter() + .map(|row| app_observation_from_manifest_row(&options.root, row, &assay_id)) + .collect::, _>>()?; + observations.extend(input_observations.clone()); + let input_analyses = run_manifest_analyses_for_report( + &options.root, + &options.manifest_path, + input_file, + &participant_id, + &options.loader, + &options.output_dir, + )?; + analyses.extend(input_analyses.clone()); + let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); + reports.push(app_report_json( + &assay_id, + &participant_id, + input_file, + &input_observations, + &input_analyses, + &matched_findings, + &provenance, + )); + } + + write_app_observations( + &options.output_dir, + &observations, + options.observations_format, + )?; + write_app_analyses(&options.output_dir, &analyses)?; + write_app_reports(&options.output_dir, &reports, options.reports_format)?; + if options.html { + write_app_html(&options.output_dir, &observations, &reports)?; + } + + println!( + "observations: {}", + options.output_dir.join("observations.tsv").display() + ); + println!( + "analysis: {}", + options.output_dir.join("analysis.jsonl").display() + ); + println!( + "reports: {}", + options.output_dir.join("reports.jsonl").display() + ); + if options.html { + println!("html: {}", options.output_dir.join("index.html").display()); + } + Ok(()) +} diff --git a/rust/bioscript-cli/src/report_output.rs b/rust/bioscript-cli/src/report_output.rs new file mode 100644 index 0000000..63fea7c --- /dev/null +++ b/rust/bioscript-cli/src/report_output.rs @@ -0,0 +1,165 @@ +fn app_report_json( + assay_id: &str, + participant_id: &str, + input_file: &Path, + observations: &[serde_json::Value], + analyses: &[serde_json::Value], + findings: &[serde_json::Value], + provenance: &[serde_json::Value], +) -> serde_json::Value { + let called = observations + .iter() + .filter(|item| { + item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") + }) + .count(); + 
serde_json::json!({ + "schema": "bioscript:report:1.0", + "version": "1.0", + "participant_id": participant_id, + "assay_id": assay_id, + "assay_version": "1.0", + "input": { + "file_name": input_file.file_name().and_then(|value| value.to_str()).unwrap_or_default(), + "file_path": input_file.display().to_string(), + }, + "report_status": if called == observations.len() { "complete" } else { "partial" }, + "derived_from": observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), + "analyses": analyses, + "findings": findings, + "provenance": provenance, + "metrics": { + "n_sites_tested": observations.len(), + "n_sites_called": called, + "n_sites_missing": observations.len().saturating_sub(called), + "n_analyses": analyses.len(), + "n_findings_matched": findings.len(), + } + }) +} + +fn write_app_observations( + output_dir: &Path, + observations: &[serde_json::Value], + format: AppOutputFormat, +) -> Result<(), String> { + if matches!(format, AppOutputFormat::Tsv | AppOutputFormat::Both) { + let mut out = bioscript_core::OBSERVATION_TSV_HEADERS.join("\t"); + out.push('\n'); + for observation in observations { + let line = bioscript_core::OBSERVATION_TSV_HEADERS + .iter() + .map(|header| json_field_as_tsv(observation.get(*header))) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + fs::write(output_dir.join("observations.tsv"), out) + .map_err(|err| format!("failed to write observations.tsv: {err}"))?; + } + if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { + write_jsonl(&output_dir.join("observations.jsonl"), observations)?; + } + if matches!(format, AppOutputFormat::Json) { + write_json_pretty( + &output_dir.join("observations.json"), + &serde_json::json!({"observations": observations}), + )?; + } + Ok(()) +} + +fn write_app_analyses(output_dir: &Path, analyses: &[serde_json::Value]) -> Result<(), String> { + write_jsonl(&output_dir.join("analysis.jsonl"), analyses) +} + +fn 
write_app_reports( + output_dir: &Path, + reports: &[serde_json::Value], + format: AppOutputFormat, +) -> Result<(), String> { + if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { + write_jsonl(&output_dir.join("reports.jsonl"), reports)?; + } + if matches!(format, AppOutputFormat::Json | AppOutputFormat::Both) { + write_json_pretty( + &output_dir.join("reports.json"), + &serde_json::json!({ + "schema": "bioscript:report-set:1.0", + "version": "1.0", + "reports": reports, + }), + )?; + } + Ok(()) +} + +fn write_jsonl(path: &Path, rows: &[serde_json::Value]) -> Result<(), String> { + let mut out = String::new(); + for row in rows { + let line = serde_json::to_string(row).map_err(|err| err.to_string())?; + out.push_str(&line); + out.push('\n'); + } + fs::write(path, out).map_err(|err| format!("failed to write {}: {err}", path.display())) +} + +fn write_json_pretty(path: &Path, value: &serde_json::Value) -> Result<(), String> { + let text = serde_json::to_string_pretty(value).map_err(|err| err.to_string())?; + fs::write(path, text).map_err(|err| format!("failed to write {}: {err}", path.display())) +} + +fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { + match value { + Some(serde_json::Value::Null) | None => String::new(), + Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), + Some(value) => value.to_string().replace(['\t', '\n'], " "), + } +} + +fn write_app_html( + output_dir: &Path, + observations: &[serde_json::Value], + reports: &[serde_json::Value], +) -> Result<(), String> { + let mut out = String::from( + r##"BioScript report

BioScript Report

"##, + ); + let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); + let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); + let analysis_outputs = collect_report_analyses(reports); + let _ = write!( + out, + "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", + observations.len(), + analysis_outputs.len(), + label_findings.len(), + summary_findings.len() + ); + out.push_str(""); + out.push_str("

Observations

"); + render_observation_table(&mut out, observations); + out.push_str("
"); + out.push_str("

Analysis

"); + render_analysis_tables(&mut out, &analysis_outputs); + out.push_str("
"); + out.push_str("

PGx Label Annotations

"); + render_pgx_label_table(&mut out, &label_findings); + out.push_str("
"); + out.push_str("

PGx Summary Annotations

"); + render_pgx_summary_table(&mut out, &summary_findings); + out.push_str("
"); + out.push_str("

Provenance

"); + render_provenance_links(&mut out, reports); + out.push_str("
"); + out.push_str("

Raw Reports JSON

"); + for report in reports { + let text = serde_json::to_string_pretty(report).map_err(|err| err.to_string())?; + let _ = write!(out, "
{}
", html_escape(&text)); + } + out.push_str("
"); + fs::write(output_dir.join("index.html"), out) + .map_err(|err| format!("failed to write index.html: {err}")) +} + diff --git a/rust/bioscript-schema/src/validator.rs b/rust/bioscript-schema/src/validator.rs index 64d3211..0bae7f6 100644 --- a/rust/bioscript-schema/src/validator.rs +++ b/rust/bioscript-schema/src/validator.rs @@ -1,2197 +1,10 @@ -use std::{ - collections::BTreeSet, - fmt::{self, Write as _}, - fs, - path::{Path, PathBuf}, -}; - -use bioscript_core::{GenomicLocus, VariantKind, VariantSpec}; -use serde_yaml::{Mapping, Value}; -use url::Url; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Severity { - Error, - Warning, -} - -impl fmt::Display for Severity { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Error => f.write_str("error"), - Self::Warning => f.write_str("warning"), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Issue { - pub severity: Severity, - pub path: String, - pub message: String, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct FileReport { - pub file: PathBuf, - pub issues: Vec, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct ValidationReport { - pub files_scanned: usize, - pub reports: Vec, -} - -impl ValidationReport { - #[must_use] - pub fn total_issues(&self) -> usize { - self.reports.iter().map(|report| report.issues.len()).sum() - } - - #[must_use] - pub fn total_errors(&self) -> usize { - self.reports - .iter() - .flat_map(|report| &report.issues) - .filter(|issue| issue.severity == Severity::Error) - .count() - } - - #[must_use] - pub fn total_warnings(&self) -> usize { - self.reports - .iter() - .flat_map(|report| &report.issues) - .filter(|issue| issue.severity == Severity::Warning) - .count() - } - - #[must_use] - pub fn has_errors(&self) -> bool { - self.total_errors() > 0 - } - - #[must_use] - pub fn render_text(&self) -> String { - let mut out = String::new(); - let _ = write!( - out, - "files_scanned: {}\nerrors: {}\nwarnings: 
{}\n", - self.files_scanned, - self.total_errors(), - self.total_warnings() - ); - for report in &self.reports { - out.push('\n'); - let _ = writeln!(out, "file: {}", report.file.display()); - for issue in &report.issues { - let _ = writeln!( - out, - " - [{}] {}: {}", - issue.severity, issue.path, issue.message - ); - } - } - out - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct VariantManifest { - pub path: PathBuf, - pub name: String, - pub tags: Vec, - pub spec: VariantSpec, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelManifest { - pub path: PathBuf, - pub name: String, - pub tags: Vec, - pub permissions: Permissions, - pub downloads: Vec, - pub members: Vec, - pub interpretations: Vec, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct AssayManifest { - pub path: PathBuf, - pub name: String, - pub tags: Vec, - pub members: Vec, - pub interpretations: Vec, -} - -#[derive(Debug, Clone, PartialEq, Eq, Default)] -pub struct Permissions { - pub domains: Vec, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Download { - pub id: String, - pub url: String, - pub origin: String, - pub sha256: String, - pub version: String, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelMember { - pub kind: String, - pub path: Option, - pub download: Option, - pub sha256: Option, - pub version: Option, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelInterpretation { - pub id: String, - pub kind: String, - pub path: String, - pub output_format: Option, - pub derived_from: Vec, - pub emits: Vec, - pub logic: Option, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelInterpretationLogic { - pub source: Option, - pub description: Option, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelInterpretationLogicSource { - pub name: Option, - pub url: Option, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelInterpretationEmit { - pub key: String, - pub label: Option, - pub value_type: Option, 
- pub format: Option, -} - -/// Validate a variant file or directory of variant files. -/// -/// # Errors -/// -/// Returns an error when the input path cannot be read, traversed, or parsed -/// as YAML. -pub fn validate_variants_path(path: &Path) -> Result { - validate_manifest_path(path, ManifestSelector::Variant) -} - -/// Validate a panel file or directory of panel files. -/// -/// # Errors -/// -/// Returns an error when the input path cannot be read, traversed, or parsed -/// as YAML. -pub fn validate_panels_path(path: &Path) -> Result { - validate_manifest_path(path, ManifestSelector::Panel) -} - -/// Validate an assay file or directory of assay files. -/// -/// # Errors -/// -/// Returns an error when the input path cannot be read, traversed, or parsed -/// as YAML. -pub fn validate_assays_path(path: &Path) -> Result { - validate_manifest_path(path, ManifestSelector::Assay) -} - -/// Load a single variant manifest from YAML. -/// -/// # Errors -/// -/// Returns an error when the file does not parse or is not a valid variant -/// manifest. -pub fn load_variant_manifest(path: &Path) -> Result { - let value = load_yaml(path)?; - variant_manifest_from_root(path, &value) -} - -/// Load a single variant manifest from YAML text. -/// -/// # Errors -/// -/// Returns an error when the text does not parse or is not a valid variant -/// manifest. -pub fn load_variant_manifest_text(name: &str, text: &str) -> Result { - let value: Value = - serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; - variant_manifest_from_root(Path::new(name), &value) -} - -/// Compile a variant manifest from YAML text for lookup execution. -/// -/// This validates the execution-critical fields only: identity, identifiers, -/// coordinates, and alleles. Full manifest validation still reports metadata -/// issues such as missing finding schemas, but those do not block local lookup. 
-/// -/// # Errors -/// -/// Returns an error when the text does not parse or the execution-critical -/// fields are invalid. -pub fn load_variant_manifest_text_for_lookup( - name: &str, - text: &str, -) -> Result { - let value: Value = - serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; - let path = Path::new(name); - let mut issues = Vec::new(); - validate_schema_and_identity( - &value, - "bioscript:variant:1.0", - Some("bioscript:variant"), - &mut issues, - ); - validate_identifiers(&value, &mut issues); - validate_coordinates(&value, &mut issues); - validate_alleles(&value, &mut issues); - if issues.iter().any(|issue| issue.severity == Severity::Error) { - return Err(render_single_manifest_errors(path, &issues)); - } - - Ok(VariantManifest { - path: path.to_path_buf(), - name: required_non_empty_string(&value, &["name"])?, - tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), - spec: variant_spec_from_root(&value)?, - }) -} - -fn variant_manifest_from_root(path: &Path, value: &Value) -> Result { - let mut issues = Vec::new(); - validate_variant_root(value, &mut issues); - if issues.iter().any(|issue| issue.severity == Severity::Error) { - return Err(render_single_manifest_errors(path, &issues)); - } - - Ok(VariantManifest { - path: path.to_path_buf(), - name: required_non_empty_string(value, &["name"])?, - tags: seq_of_strings(value, &["tags"]).unwrap_or_default(), - spec: variant_spec_from_root(value)?, - }) -} - -/// Load a single panel manifest from YAML. -/// -/// # Errors -/// -/// Returns an error when the file does not parse or is not a valid panel -/// manifest. 
-pub fn load_panel_manifest(path: &Path) -> Result { - let value = load_yaml(path)?; - let mut issues = Vec::new(); - validate_panel_root(&value, &mut issues); - if issues.iter().any(|issue| issue.severity == Severity::Error) { - return Err(render_single_manifest_errors(path, &issues)); - } - - let permissions = Permissions { - domains: seq_of_strings(&value, &["permissions", "domains"]).unwrap_or_default(), - }; - let downloads = parse_downloads(&value)?; - let members = parse_panel_members(&value)?; - let interpretations = parse_panel_interpretations(&value)?; - - Ok(PanelManifest { - path: path.to_path_buf(), - name: required_non_empty_string(&value, &["name"])?, - tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), - permissions, - downloads, - members, - interpretations, - }) -} - -/// Load a single assay manifest from YAML. -/// -/// # Errors -/// -/// Returns an error when the file does not parse or is not a valid assay -/// manifest. -pub fn load_assay_manifest(path: &Path) -> Result { - let value = load_yaml(path)?; - let mut issues = Vec::new(); - validate_assay_root(&value, &mut issues); - if issues.iter().any(|issue| issue.severity == Severity::Error) { - return Err(render_single_manifest_errors(path, &issues)); - } - - Ok(AssayManifest { - path: path.to_path_buf(), - name: required_non_empty_string(&value, &["name"])?, - tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), - members: parse_panel_members(&value)?, - interpretations: parse_panel_interpretations(&value)?, - }) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum ManifestSelector { - Assay, - Variant, - Panel, -} - -fn validate_manifest_path( - path: &Path, - selector: ManifestSelector, -) -> Result { - let files = collect_yaml_files(path)?; - let mut reports = Vec::new(); - for file in &files { - let report = match selector { - ManifestSelector::Assay => validate_assay_file(file)?, - ManifestSelector::Variant => validate_variant_file(file)?, - ManifestSelector::Panel 
=> validate_panel_file(file)?, - }; - if !report.issues.is_empty() { - reports.push(report); - } - } - Ok(ValidationReport { - files_scanned: files.len(), - reports, - }) -} - -fn collect_yaml_files(path: &Path) -> Result, String> { - if path.is_file() { - return Ok(vec![path.to_path_buf()]); - } - - let mut files = Vec::new(); - collect_yaml_files_recursive(path, &mut files)?; - files.sort(); - Ok(files) -} - -fn collect_yaml_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { - let entries = fs::read_dir(path) - .map_err(|err| format!("failed to read directory {}: {err}", path.display()))?; - for entry in entries { - let entry = entry.map_err(|err| format!("failed to read directory entry: {err}"))?; - let entry_path = entry.path(); - if entry_path.is_dir() { - collect_yaml_files_recursive(&entry_path, files)?; - continue; - } - if entry_path.extension().is_some_and(|extension| { - ["yaml", "yml"] - .iter() - .any(|item| extension.eq_ignore_ascii_case(item)) - }) { - files.push(entry_path); - } - } - Ok(()) -} - -fn validate_assay_file(path: &Path) -> Result { - let value = load_yaml(path)?; - let Some(schema) = scalar_at(&value, &["schema"]) else { - return Ok(FileReport { - file: path.to_path_buf(), - issues: vec![Issue { - severity: Severity::Error, - path: "schema".to_owned(), - message: "missing schema".to_owned(), - }], - }); - }; - if !schema.contains("assay") { - return Ok(FileReport { - file: path.to_path_buf(), - issues: Vec::new(), - }); - } - - let mut issues = Vec::new(); - validate_assay_root(&value, &mut issues); - Ok(FileReport { - file: path.to_path_buf(), - issues, - }) -} - -fn validate_variant_file(path: &Path) -> Result { - let value = load_yaml(path)?; - let Some(schema) = scalar_at(&value, &["schema"]) else { - return Ok(FileReport { - file: path.to_path_buf(), - issues: vec![Issue { - severity: Severity::Error, - path: "schema".to_owned(), - message: "missing schema".to_owned(), - }], - }); - }; - if 
!schema.contains("variant") { - if schema == "bioscript:pgx-findings:1.0" { - let mut issues = Vec::new(); - validate_pgx_findings_root(&value, &mut issues); - return Ok(FileReport { - file: path.to_path_buf(), - issues, - }); - } - return Ok(FileReport { - file: path.to_path_buf(), - issues: Vec::new(), - }); - } - - let mut issues = Vec::new(); - validate_variant_root(&value, &mut issues); - Ok(FileReport { - file: path.to_path_buf(), - issues, - }) -} - -fn validate_panel_file(path: &Path) -> Result { - let value = load_yaml(path)?; - let Some(schema) = scalar_at(&value, &["schema"]) else { - return Ok(FileReport { - file: path.to_path_buf(), - issues: vec![Issue { - severity: Severity::Error, - path: "schema".to_owned(), - message: "missing schema".to_owned(), - }], - }); - }; - if !schema.contains("panel") { - return Ok(FileReport { - file: path.to_path_buf(), - issues: Vec::new(), - }); - } - - let mut issues = Vec::new(); - validate_panel_root(&value, &mut issues); - Ok(FileReport { - file: path.to_path_buf(), - issues, - }) -} - -fn validate_variant_root(root: &Value, issues: &mut Vec) { - validate_schema_and_identity( - root, - "bioscript:variant:1.0", - Some("bioscript:variant"), - issues, - ); - validate_optional_strings(root, &["name", "label", "gene", "summary"], issues); - validate_tags(root, issues); - validate_identifiers(root, issues); - validate_coordinates(root, issues); - validate_alleles(root, issues); - validate_findings(root, issues); - validate_provenance(root, issues); - - let has_identifiers = value_at(root, &["identifiers"]) - .and_then(Value::as_mapping) - .is_some_and(|mapping| !mapping.is_empty()); - let has_coordinates = ["grch37", "grch38"] - .iter() - .any(|assembly| value_at(root, &["coordinates", assembly]).is_some()); - if !has_identifiers && !has_coordinates { - issues.push(Issue { - severity: Severity::Error, - path: "identifiers/coordinates".to_owned(), - message: "expected at least one identifier block or one coordinate 
block".to_owned(), - }); - } -} - -fn validate_panel_root(root: &Value, issues: &mut Vec) { - validate_schema_and_identity(root, "bioscript:panel:1.0", None, issues); - validate_optional_strings(root, &["name", "label", "summary"], issues); - validate_tags(root, issues); - validate_permissions(root, issues); - validate_downloads(root, issues); - validate_panel_members(root, &["variant", "assay"], issues); - validate_panel_interpretations(root, issues); - validate_findings(root, issues); -} - -fn validate_assay_root(root: &Value, issues: &mut Vec) { - validate_schema_and_identity(root, "bioscript:assay:1.0", None, issues); - validate_optional_strings(root, &["name", "label", "summary"], issues); - validate_tags(root, issues); - validate_panel_members(root, &["variant"], issues); - validate_panel_interpretations(root, issues); - validate_findings(root, issues); -} - -fn validate_pgx_findings_root(root: &Value, issues: &mut Vec) { - require_const(root, &["schema"], "bioscript:pgx-findings:1.0", issues); - require_const(root, &["version"], "1.0", issues); - validate_optional_strings(root, &["variant", "gene", "rsid", "variant_pa_id"], issues); - if value_at(root, &["variant"]).is_none() && value_at(root, &["rsid"]).is_none() { - issues.push(Issue { - severity: Severity::Error, - path: "variant/rsid".to_owned(), - message: "expected at least one variant identifier".to_owned(), - }); - } - match value_at(root, &["findings"]) { - Some(Value::Sequence(_)) => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: "findings".to_owned(), - message: "expected a sequence of findings".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: "findings".to_owned(), - message: "missing required field".to_owned(), - }), - } - validate_findings(root, issues); -} - -fn validate_schema_and_identity( - root: &Value, - canonical_schema: &str, - legacy_schema: Option<&str>, - issues: &mut Vec, -) { - let schema = scalar_at(root, &["schema"]); 
- let valid_schema = schema - .as_deref() - .is_some_and(|value| value == canonical_schema || legacy_schema == Some(value)); - if !valid_schema { - issues.push(Issue { - severity: Severity::Error, - path: "schema".to_owned(), - message: format!("expected schema to be '{canonical_schema}'"), - }); - } - if let Some(legacy_schema) = legacy_schema - && matches!(schema.as_deref(), Some(value) if value == legacy_schema) - { - issues.push(Issue { - severity: Severity::Warning, - path: "schema".to_owned(), - message: format!("legacy schema value '{legacy_schema}'; prefer '{canonical_schema}'"), - }); - } - require_const(root, &["version"], "1.0", issues); - match scalar_at(root, &["name"]) { - Some(name) if !name.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: "name".to_owned(), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: "name".to_owned(), - message: "missing required field".to_owned(), - }), - } - if value_at(root, &["variant_id"]).is_some() { - issues.push(Issue { - severity: Severity::Warning, - path: "variant_id".to_owned(), - message: "variant_id is legacy; prefer name".to_owned(), - }); - } -} - -fn validate_optional_strings(root: &Value, fields: &[&str], issues: &mut Vec) { - for field in fields { - if let Some(value) = value_at(root, &[*field]) { - match value.as_str() { - Some(text) if !text.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Warning, - path: (*field).to_owned(), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: (*field).to_owned(), - message: "expected string".to_owned(), - }), - } - } - } -} - -fn validate_tags(root: &Value, issues: &mut Vec) { - let Some(value) = value_at(root, &["tags"]) else { - return; - }; - let Some(items) = value.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: "tags".to_owned(), - message: 
"expected a sequence of strings".to_owned(), - }); - return; - }; - - for (idx, item) in items.iter().enumerate() { - let Some(tag) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("tags[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - if tag.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("tags[{idx}]"), - message: "empty tag string".to_owned(), - }); - } - } -} - -fn validate_identifiers(root: &Value, issues: &mut Vec) { - for field in ["rsids", "aliases"] { - let Some(values) = value_at(root, &["identifiers", field]) else { - continue; - }; - let Some(items) = values.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("identifiers.{field}"), - message: "expected a sequence of strings".to_owned(), - }); - continue; - }; - let mut seen = BTreeSet::new(); - for (idx, item) in items.iter().enumerate() { - let Some(value) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("identifiers.{field}[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - if !is_rsid(value) { - issues.push(Issue { - severity: Severity::Error, - path: format!("identifiers.{field}[{idx}]"), - message: format!("expected rsid like rs123, found '{value}'"), - }); - } - if !seen.insert(value.to_owned()) { - issues.push(Issue { - severity: Severity::Warning, - path: format!("identifiers.{field}[{idx}]"), - message: format!("duplicate identifier '{value}'"), - }); - } - } - } -} - -fn validate_coordinates(root: &Value, issues: &mut Vec) { - for assembly in ["grch37", "grch38"] { - let Some(coord) = mapping_at(root, &["coordinates", assembly]) else { - continue; - }; - - let Some(chrom) = coord - .get(Value::String("chrom".to_owned())) - .and_then(Value::as_str) - else { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.chrom"), - message: "missing 
chrom".to_owned(), - }); - continue; - }; - if !is_allowed_chromosome(chrom) { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.chrom"), - message: format!("invalid chromosome '{chrom}'; expected 1-22, X, Y, or MT"), - }); - } - - let has_pos = coord.contains_key(Value::String("pos".to_owned())); - let has_start = coord.contains_key(Value::String("start".to_owned())); - let has_end = coord.contains_key(Value::String("end".to_owned())); - if has_pos && (has_start || has_end) { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}"), - message: "use either pos or start/end, not both".to_owned(), - }); - continue; - } - if !(has_pos || has_start && has_end) { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}"), - message: "expected either pos or start/end".to_owned(), - }); - continue; - } - - if has_pos { - validate_coordinate_pos(coord, assembly, issues); - } else { - validate_coordinate_range(coord, assembly, issues); - } - } -} - -fn validate_coordinate_pos(coord: &Mapping, assembly: &str, issues: &mut Vec) { - if let Some(pos) = i64_at_mapping(coord, "pos") { - if pos < 1 { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.pos"), - message: "expected integer >= 1".to_owned(), - }); - } - } else { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.pos"), - message: "expected integer".to_owned(), - }); - } -} - -fn validate_coordinate_range(coord: &Mapping, assembly: &str, issues: &mut Vec) { - let start = i64_at_mapping(coord, "start"); - let end = i64_at_mapping(coord, "end"); - match (start, end) { - (Some(start), Some(end)) => validate_coordinate_range_values(start, end, assembly, issues), - _ => issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}"), - message: "expected integer start/end".to_owned(), - }), - } -} - -fn 
validate_coordinate_range_values(start: i64, end: i64, assembly: &str, issues: &mut Vec) { - if start < 1 { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.start"), - message: "expected integer >= 1".to_owned(), - }); - } - if end < 1 { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.end"), - message: "expected integer >= 1".to_owned(), - }); - } - if end < start { - issues.push(Issue { - severity: Severity::Error, - path: format!("coordinates.{assembly}.end"), - message: "expected end >= start".to_owned(), - }); - } - if start == end { - issues.push(Issue { - severity: Severity::Warning, - path: format!("coordinates.{assembly}"), - message: "single-position coordinate uses start/end; prefer pos".to_owned(), - }); - } -} - -fn validate_alleles(root: &Value, issues: &mut Vec) { - require_path(root, &["alleles"], issues); - require_path(root, &["alleles", "kind"], issues); - require_path(root, &["alleles", "ref"], issues); - require_path(root, &["alleles", "alts"], issues); - - let Some(kind) = scalar_at(root, &["alleles", "kind"]) else { - return; - }; - if !matches!(kind.as_str(), "snv" | "deletion" | "insertion" | "indel") { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.kind".to_owned(), - message: "expected one of snv, deletion, insertion, indel".to_owned(), - }); - } - - if value_at(root, &["alleles", "canonical_alt"]).is_some() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.canonical_alt".to_owned(), - message: "canonical_alt is not part of the current schema".to_owned(), - }); - } - - let Some(reference) = scalar_at(root, &["alleles", "ref"]) else { - return; - }; - if reference.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.ref".to_owned(), - message: "empty string".to_owned(), - }); - } - - let Some(alts_value) = value_at(root, &["alleles", "alts"]) else { - return; - }; - let 
Some(alts_seq) = alts_value.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.alts".to_owned(), - message: "expected a non-empty sequence of strings".to_owned(), - }); - return; - }; - if alts_seq.is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.alts".to_owned(), - message: "expected at least one alternate allele".to_owned(), - }); - return; - } - - let mut alts = Vec::new(); - for (idx, item) in alts_seq.iter().enumerate() { - let Some(alt) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - if alt.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: "empty string".to_owned(), - }); - continue; - } - alts.push(alt.to_owned()); - } - let observed_alts = match seq_of_strings(root, &["alleles", "observed_alts"]) { - Some(items) => { - if items.is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.observed_alts".to_owned(), - message: "expected a non-empty sequence of strings when present".to_owned(), - }); - } - for alt in &alts { - if !items.iter().any(|item| item == alt) { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.observed_alts".to_owned(), - message: format!("significant alt '{alt}' is not present in observed_alts"), - }); - } - } - items - } - None => alts.clone(), - }; - validate_symbolic_alleles(&reference, &observed_alts, issues); - validate_snv_alleles(&kind, &reference, &observed_alts, issues); -} - -fn validate_symbolic_alleles(reference: &str, alts: &[String], issues: &mut Vec) { - if reference == "I" || reference == "D" { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.ref".to_owned(), - message: "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" - .to_owned(), - }); - } - for (idx, alt) in 
alts.iter().enumerate() { - if alt == "I" || alt == "D" { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: - "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" - .to_owned(), - }); - } - } -} - -fn validate_snv_alleles(kind: &str, reference: &str, alts: &[String], issues: &mut Vec) { - if kind != "snv" { - return; - } - if !is_base_allele(reference) { - issues.push(Issue { - severity: Severity::Error, - path: "alleles.ref".to_owned(), - message: "snv ref must be one of A/C/G/T".to_owned(), - }); - } - for (idx, alt) in alts.iter().enumerate() { - if !is_base_allele(alt) { - issues.push(Issue { - severity: Severity::Error, - path: format!("alleles.alts[{idx}]"), - message: "snv alt must be one of A/C/G/T".to_owned(), - }); - } - } -} - -fn validate_findings(root: &Value, issues: &mut Vec) { - let alts = seq_of_strings(root, &["alleles", "alts"]).unwrap_or_default(); - let Some(findings) = value_at(root, &["findings"]).and_then(Value::as_sequence) else { - return; - }; - - for (idx, finding) in findings.iter().enumerate() { - let Some(mapping) = finding.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - - let Some(schema) = mapping - .get(Value::String("schema".to_owned())) - .and_then(Value::as_str) - else { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}].schema"), - message: "missing schema".to_owned(), - }); - continue; - }; - if schema.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}].schema"), - message: "empty string".to_owned(), - }); - } - if schema == "bioscript:pgx:1.0" { - issues.push(Issue { - severity: Severity::Warning, - path: format!("findings[{idx}].schema"), - message: "legacy PGx finding schema; prefer bioscript:pgx-summary:1.0 or 
bioscript:pgx-label:1.0".to_owned(), - }); - } - if let Some(alt) = mapping - .get(Value::String("alt".to_owned())) - .and_then(Value::as_str) - && !alts.is_empty() - && alt != "*" - && !alts.iter().any(|item| item == alt) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}].alt"), - message: format!("finding alt '{alt}' is not present in alleles.alts {alts:?}"), - }); - } - let has_summary = mapping - .get(Value::String("summary".to_owned())) - .and_then(Value::as_str) - .is_some_and(|value| !value.trim().is_empty()); - let has_notes = mapping - .get(Value::String("notes".to_owned())) - .and_then(Value::as_str) - .is_some_and(|value| !value.trim().is_empty()); - if !has_summary && !has_notes { - issues.push(Issue { - severity: Severity::Warning, - path: format!("findings[{idx}]"), - message: "finding has neither summary nor notes".to_owned(), - }); - } - validate_finding_binding(&format!("findings[{idx}]"), mapping, issues); - validate_finding_effects(idx, mapping, issues); - } -} - -fn validate_finding_effects(idx: usize, mapping: &Mapping, issues: &mut Vec) { - let Some(effects) = mapping.get(Value::String("effects".to_owned())) else { - return; - }; - let Some(effects) = effects.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}].effects"), - message: "expected a sequence of mappings".to_owned(), - }); - return; - }; - for (effect_idx, effect) in effects.iter().enumerate() { - let Some(effect) = effect.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("findings[{idx}].effects[{effect_idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - validate_finding_binding( - &format!("findings[{idx}].effects[{effect_idx}]"), - effect, - issues, - ); - } -} - -fn validate_finding_binding(parent: &str, mapping: &Mapping, issues: &mut Vec) { - let Some(binding) = mapping.get(Value::String("binding".to_owned())) else { - return; - 
}; - let Some(binding) = binding.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding"), - message: "expected mapping".to_owned(), - }); - return; - }; - validate_required_mapping_string(binding, "source", &format!("{parent}.binding"), issues); - let source = binding - .get(Value::String("source".to_owned())) - .and_then(Value::as_str); - match source { - Some("variant") => { - if !binding.contains_key(Value::String("variant".to_owned())) - && !binding.contains_key(Value::String("path".to_owned())) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.variant"), - message: "variant findings require variant or path".to_owned(), - }); - } - } - Some("analysis") => { - validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); - validate_required_mapping_string( - binding, - "analysis_id", - &format!("{parent}.binding"), - issues, - ); - } - Some(other) => issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.source"), - message: format!("unsupported source '{other}'"), - }), - None => {} - } - - let operator = binding - .get(Value::String("operator".to_owned())) - .and_then(Value::as_str) - .unwrap_or("equals"); - match operator { - "equals" => { - validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); - if !binding.contains_key(Value::String("value".to_owned())) { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.value"), - message: "equals requires value".to_owned(), - }); - } - } - "in" => { - validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); - let values = binding - .get(Value::String("values".to_owned())) - .and_then(Value::as_sequence); - if values.is_none_or(Vec::is_empty) { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.values"), - message: "in requires non-empty 
values".to_owned(), - }); - } - } - "dosage_equals" => { - if binding - .get(Value::String("allele".to_owned())) - .and_then(Value::as_str) - .is_none_or(|value| value.trim().is_empty()) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.allele"), - message: "dosage_equals requires allele".to_owned(), - }); - } - if binding - .get(Value::String("value".to_owned())) - .and_then(Value::as_i64) - .is_none_or(|value| !(0..=2).contains(&value)) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.value"), - message: "dosage_equals requires integer value 0, 1, or 2".to_owned(), - }); - } - } - "dosage_in" => { - if binding - .get(Value::String("allele".to_owned())) - .and_then(Value::as_str) - .is_none_or(|value| value.trim().is_empty()) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.allele"), - message: "dosage_in requires allele".to_owned(), - }); - } - let values = binding - .get(Value::String("values".to_owned())) - .and_then(Value::as_sequence); - let invalid_values = match values { - Some(items) if !items.is_empty() => items - .iter() - .any(|value| value.as_i64().is_none_or(|n| !(0..=2).contains(&n))), - _ => true, - }; - if invalid_values { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.values"), - message: "dosage_in requires integer values from 0 to 2".to_owned(), - }); - } - } - other => issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.operator"), - message: format!( - "unsupported operator '{other}'; expected 'equals', 'in', 'dosage_equals', or 'dosage_in'" - ), - }), - } -} - -fn validate_provenance(root: &Value, issues: &mut Vec) { - let Some(sources) = value_at(root, &["provenance", "sources"]).and_then(Value::as_sequence) - else { - return; - }; - for (idx, source) in sources.iter().enumerate() { - let Some(mapping) = source.as_mapping() else { - issues.push(Issue { - 
severity: Severity::Error, - path: format!("provenance.sources[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - for field in ["kind", "label", "url"] { - match mapping - .get(Value::String(field.to_owned())) - .and_then(Value::as_str) - { - Some(text) if !text.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: format!("provenance.sources[{idx}].{field}"), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("provenance.sources[{idx}].{field}"), - message: "missing required field".to_owned(), - }), - } - } - if let Some(url) = mapping - .get(Value::String("url".to_owned())) - .and_then(Value::as_str) - { - validate_url_string( - url, - &format!("provenance.sources[{idx}].url"), - false, - issues, - ); - } - } -} - -fn validate_permissions(root: &Value, issues: &mut Vec) { - let Some(domains) = value_at(root, &["permissions", "domains"]) else { - return; - }; - let Some(items) = domains.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: "permissions.domains".to_owned(), - message: "expected a sequence of origins".to_owned(), - }); - return; - }; - let mut seen = BTreeSet::new(); - for (idx, item) in items.iter().enumerate() { - let Some(value) = item.as_str() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("permissions.domains[{idx}]"), - message: "expected string".to_owned(), - }); - continue; - }; - match normalize_origin(value) { - Ok(origin) => { - if !seen.insert(origin.clone()) { - issues.push(Issue { - severity: Severity::Warning, - path: format!("permissions.domains[{idx}]"), - message: format!("duplicate origin '{origin}'"), - }); - } - } - Err(message) => issues.push(Issue { - severity: Severity::Error, - path: format!("permissions.domains[{idx}]"), - message, - }), - } - } -} - -fn validate_downloads(root: &Value, issues: &mut Vec) { - let allowed_origins: BTreeSet = 
seq_of_strings(root, &["permissions", "domains"]) - .unwrap_or_default() - .into_iter() - .filter_map(|domain| normalize_origin(&domain).ok()) - .collect(); - let Some(downloads) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { - return; - }; - let mut ids = BTreeSet::new(); - for (idx, item) in downloads.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - for field in ["id", "url", "sha256", "version"] { - match mapping - .get(Value::String(field.to_owned())) - .and_then(Value::as_str) - { - Some(text) if !text.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].{field}"), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].{field}"), - message: "missing required field".to_owned(), - }), - } - } - - if let Some(id) = mapping - .get(Value::String("id".to_owned())) - .and_then(Value::as_str) - && !ids.insert(id.to_owned()) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].id"), - message: format!("duplicate download id '{id}'"), - }); - } - if let Some(sha) = mapping - .get(Value::String("sha256".to_owned())) - .and_then(Value::as_str) - && !is_sha256(sha) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].sha256"), - message: "expected 64 lowercase hex characters".to_owned(), - }); - } - if let Some(url) = mapping - .get(Value::String("url".to_owned())) - .and_then(Value::as_str) - { - match normalize_download_url(url) { - Ok(origin) => { - if !allowed_origins.is_empty() && !allowed_origins.contains(&origin) { - issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].url"), - message: format!( - "download origin '{origin}' is 
not listed in permissions.domains" - ), - }); - } - } - Err(message) => issues.push(Issue { - severity: Severity::Error, - path: format!("downloads[{idx}].url"), - message, - }), - } - } - } -} - -fn validate_panel_members(root: &Value, allowed_kinds: &[&str], issues: &mut Vec) { - let Some(members) = value_at(root, &["members"]).and_then(Value::as_sequence) else { - issues.push(Issue { - severity: Severity::Error, - path: "members".to_owned(), - message: "missing required field".to_owned(), - }); - return; - }; - if members.is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: "members".to_owned(), - message: "expected at least one member".to_owned(), - }); - return; - } - - let download_ids = panel_download_ids(root); - - for (idx, item) in members.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - validate_panel_member(idx, mapping, allowed_kinds, &download_ids, issues); - } -} - -fn panel_download_ids(root: &Value) -> BTreeSet { - value_at(root, &["downloads"]) - .and_then(Value::as_sequence) - .into_iter() - .flatten() - .filter_map(|item| { - item.as_mapping()? 
- .get(Value::String("id".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned) - }) - .collect() -} - -fn validate_panel_member( - idx: usize, - mapping: &Mapping, - allowed_kinds: &[&str], - download_ids: &BTreeSet, - issues: &mut Vec, -) { - let kind = mapping - .get(Value::String("kind".to_owned())) - .and_then(Value::as_str); - match kind { - Some(kind) if allowed_kinds.contains(&kind) => {} - Some(other) => issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].kind"), - message: format!("unsupported member kind '{other}'"), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].kind"), - message: "missing required field".to_owned(), - }), - } - - let path_value = mapping - .get(Value::String("path".to_owned())) - .and_then(Value::as_str); - let download_value = mapping - .get(Value::String("download".to_owned())) - .and_then(Value::as_str); - if path_value.is_some() == download_value.is_some() { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}]"), - message: "expected exactly one of path or download".to_owned(), - }); - } - validate_panel_member_path(idx, path_value, issues); - validate_panel_member_download(idx, download_value, download_ids, issues); - validate_panel_member_metadata(idx, mapping, issues); -} - -fn validate_panel_member_path(idx: usize, path_value: Option<&str>, issues: &mut Vec) { - if let Some(path) = path_value - && path.trim().is_empty() - { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].path"), - message: "empty string".to_owned(), - }); - } -} - -fn validate_panel_member_download( - idx: usize, - download_value: Option<&str>, - download_ids: &BTreeSet, - issues: &mut Vec, -) { - let Some(download) = download_value else { - return; - }; - if download.trim().is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].download"), - message: "empty 
string".to_owned(), - }); - } else if !download_ids.contains(download) { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].download"), - message: format!("unknown download id '{download}'"), - }); - } -} - -fn validate_panel_member_metadata(idx: usize, mapping: &Mapping, issues: &mut Vec) { - if let Some(version) = mapping - .get(Value::String("version".to_owned())) - .and_then(Value::as_str) - && version.trim().is_empty() - { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].version"), - message: "empty string".to_owned(), - }); - } - if let Some(sha) = mapping - .get(Value::String("sha256".to_owned())) - .and_then(Value::as_str) - && !is_sha256(sha) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("members[{idx}].sha256"), - message: "expected 64 lowercase hex characters".to_owned(), - }); - } -} - -fn validate_panel_interpretations(root: &Value, issues: &mut Vec) { - if value_at(root, &["analyses"]).is_some() && value_at(root, &["interpretations"]).is_some() { - issues.push(Issue { - severity: Severity::Warning, - path: "interpretations".to_owned(), - message: "use analyses instead of interpretations; do not define both".to_owned(), - }); - } - let key = if value_at(root, &["analyses"]).is_some() { - "analyses" - } else { - "interpretations" - }; - let Some(items) = value_at(root, &[key]) else { - return; - }; - let Some(items) = items.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: key.to_owned(), - message: "expected a sequence of mappings".to_owned(), - }); - return; - }; - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - validate_panel_interpretation(key, idx, mapping, issues); - } -} - -fn validate_panel_interpretation( - key: &str, - idx: usize, - 
mapping: &Mapping, - issues: &mut Vec, -) { - for field in ["id", "kind", "path"] { - validate_required_mapping_string(mapping, field, &format!("{key}[{idx}]"), issues); - } - if let Some(kind) = mapping - .get(Value::String("kind".to_owned())) - .and_then(Value::as_str) - && kind != "bioscript" - { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].kind"), - message: "expected 'bioscript'".to_owned(), - }); - } - if let Some(output_format) = mapping - .get(Value::String("output_format".to_owned())) - .and_then(Value::as_str) - && !matches!(output_format, "tsv" | "json" | "jsonl") - { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].output_format"), - message: "expected 'tsv', 'json', or 'jsonl'".to_owned(), - }); - } - let Some(derived_from) = mapping - .get(Value::String("derived_from".to_owned())) - .and_then(Value::as_sequence) - else { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].derived_from"), - message: "expected a non-empty sequence of strings".to_owned(), - }); - return; - }; - if derived_from.is_empty() { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].derived_from"), - message: "expected at least one source variant".to_owned(), - }); - } - for (source_idx, source) in derived_from.iter().enumerate() { - match source.as_str() { - Some(value) if !value.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].derived_from[{source_idx}]"), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].derived_from[{source_idx}]"), - message: "expected string".to_owned(), - }), - } - } - validate_panel_interpretation_logic(key, idx, mapping, issues); - validate_panel_interpretation_emits(key, idx, mapping, issues); -} - -fn validate_panel_interpretation_logic( - key: &str, - idx: usize, - mapping: 
&Mapping, - issues: &mut Vec, -) { - let Some(logic) = mapping.get(Value::String("logic".to_owned())) else { - return; - }; - let Some(logic) = logic.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].logic"), - message: "expected mapping".to_owned(), - }); - return; - }; - validate_optional_mapping_string(logic, "description", &format!("{key}[{idx}].logic"), issues); - let Some(source) = logic.get(Value::String("source".to_owned())) else { - return; - }; - let Some(source) = source.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].logic.source"), - message: "expected mapping".to_owned(), - }); - return; - }; - validate_optional_mapping_string( - source, - "name", - &format!("{key}[{idx}].logic.source"), - issues, - ); - validate_optional_mapping_string(source, "url", &format!("{key}[{idx}].logic.source"), issues); - if let Some(url) = source - .get(Value::String("url".to_owned())) - .and_then(Value::as_str) - { - validate_url_string( - url, - &format!("{key}[{idx}].logic.source.url"), - false, - issues, - ); - } -} - -fn validate_panel_interpretation_emits( - key: &str, - idx: usize, - mapping: &Mapping, - issues: &mut Vec, -) { - let Some(emits) = mapping.get(Value::String("emits".to_owned())) else { - return; - }; - let Some(emits) = emits.as_sequence() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].emits"), - message: "expected a sequence of mappings".to_owned(), - }); - return; - }; - for (emit_idx, item) in emits.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - issues.push(Issue { - severity: Severity::Error, - path: format!("{key}[{idx}].emits[{emit_idx}]"), - message: "expected mapping".to_owned(), - }); - continue; - }; - validate_required_mapping_string( - mapping, - "key", - &format!("{key}[{idx}].emits[{emit_idx}]"), - issues, - ); - for field in ["label", "value_type", "format"] { - 
validate_optional_mapping_string( - mapping, - field, - &format!("{key}[{idx}].emits[{emit_idx}]"), - issues, - ); - } - } -} - -fn validate_required_mapping_string( - mapping: &Mapping, - field: &str, - parent: &str, - issues: &mut Vec, -) { - match mapping - .get(Value::String(field.to_owned())) - .and_then(Value::as_str) - { - Some(value) if !value.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.{field}"), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.{field}"), - message: "missing required field".to_owned(), - }), - } -} - -fn validate_optional_mapping_string( - mapping: &Mapping, - field: &str, - parent: &str, - issues: &mut Vec, -) { - if let Some(value) = mapping.get(Value::String(field.to_owned())) { - match value.as_str() { - Some(text) if !text.trim().is_empty() => {} - Some(_) => issues.push(Issue { - severity: Severity::Warning, - path: format!("{parent}.{field}"), - message: "empty string".to_owned(), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.{field}"), - message: "expected string".to_owned(), - }), - } - } -} - -fn variant_spec_from_root(root: &Value) -> Result { - let rsids = seq_of_strings(root, &["identifiers", "rsids"]).unwrap_or_default(); - let grch37 = locus_from_root(root, "grch37")?; - let grch38 = locus_from_root(root, "grch38")?; - let reference = scalar_at(root, &["alleles", "ref"]); - let alternate = seq_of_strings(root, &["alleles", "observed_alts"]) - .or_else(|| seq_of_strings(root, &["alleles", "alts"])) - .and_then(|alts| alts.first().cloned()); - let deletion_length = value_at(root, &["alleles", "deletion_length"]) - .and_then(Value::as_u64) - .and_then(|value| usize::try_from(value).ok()); - let motifs = seq_of_strings(root, &["alleles", "motifs"]).unwrap_or_default(); - let kind = scalar_at(root, &["alleles", "kind"]).map(|kind| match 
kind.as_str() { - "snv" => VariantKind::Snp, - "deletion" => VariantKind::Deletion, - "insertion" => VariantKind::Insertion, - "indel" => VariantKind::Indel, - _ => VariantKind::Other, - }); - - Ok(VariantSpec { - rsids, - grch37, - grch38, - reference, - alternate, - kind, - deletion_length, - motifs, - }) -} - -fn locus_from_root(root: &Value, assembly: &str) -> Result, String> { - let Some(mapping) = mapping_at(root, &["coordinates", assembly]) else { - return Ok(None); - }; - let chrom = mapping - .get(Value::String("chrom".to_owned())) - .and_then(Value::as_str) - .ok_or_else(|| format!("coordinates.{assembly}.chrom missing"))?; - let (start, end) = if let Some(pos) = i64_at_mapping(mapping, "pos") { - (pos, pos) - } else { - let start = i64_at_mapping(mapping, "start") - .ok_or_else(|| format!("coordinates.{assembly}.start missing"))?; - let end = i64_at_mapping(mapping, "end") - .ok_or_else(|| format!("coordinates.{assembly}.end missing"))?; - (start, end) - }; - Ok(Some(GenomicLocus { - chrom: chrom.to_owned(), - start, - end, - })) -} - -fn parse_downloads(root: &Value) -> Result, String> { - let mut downloads = Vec::new(); - let Some(items) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { - return Ok(downloads); - }; - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - return Err(format!("downloads[{idx}] must be a mapping")); - }; - let id = mapping_required_string(mapping, "id", idx, "downloads")?; - let url = mapping_required_string(mapping, "url", idx, "downloads")?; - let sha256 = mapping_required_string(mapping, "sha256", idx, "downloads")?; - let version = mapping_required_string(mapping, "version", idx, "downloads")?; - let origin = normalize_download_url(&url)?; - downloads.push(Download { - id, - url, - origin, - sha256, - version, - }); - } - Ok(downloads) -} - -fn parse_panel_members(root: &Value) -> Result, String> { - let mut members = Vec::new(); - let Some(items) = 
value_at(root, &["members"]).and_then(Value::as_sequence) else { - return Ok(members); - }; - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - return Err(format!("members[{idx}] must be a mapping")); - }; - members.push(PanelMember { - kind: mapping_required_string(mapping, "kind", idx, "members")?, - path: mapping - .get(Value::String("path".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - download: mapping - .get(Value::String("download".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - sha256: mapping - .get(Value::String("sha256".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - version: mapping - .get(Value::String("version".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - }); - } - Ok(members) -} - -fn parse_panel_interpretations(root: &Value) -> Result, String> { - let mut interpretations = Vec::new(); - let key = if value_at(root, &["analyses"]).is_some() { - "analyses" - } else { - "interpretations" - }; - let Some(items) = value_at(root, &[key]).and_then(Value::as_sequence) else { - return Ok(interpretations); - }; - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - return Err(format!("{key}[{idx}] must be a mapping")); - }; - interpretations.push(PanelInterpretation { - id: mapping_required_string(mapping, "id", idx, key)?, - kind: mapping_required_string(mapping, "kind", idx, key)?, - path: mapping_required_string(mapping, "path", idx, key)?, - output_format: mapping - .get(Value::String("output_format".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - derived_from: mapping_sequence_of_strings(mapping, "derived_from", idx, key)?, - emits: parse_panel_interpretation_emits(mapping, idx)?, - logic: parse_panel_interpretation_logic(mapping)?, - }); - } - Ok(interpretations) -} - -fn parse_panel_interpretation_logic( - mapping: &Mapping, -) -> Result, String> { - let 
Some(logic) = mapping.get(Value::String("logic".to_owned())) else { - return Ok(None); - }; - let Some(logic_mapping) = logic.as_mapping() else { - return Err("analysis logic must be a mapping".to_owned()); - }; - let source = match logic_mapping.get(Value::String("source".to_owned())) { - Some(source) => { - let Some(source_mapping) = source.as_mapping() else { - return Err("analysis logic.source must be a mapping".to_owned()); - }; - Some(PanelInterpretationLogicSource { - name: source_mapping - .get(Value::String("name".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - url: source_mapping - .get(Value::String("url".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - }) - } - None => None, - }; - Ok(Some(PanelInterpretationLogic { - source, - description: logic_mapping - .get(Value::String("description".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - })) -} - -fn parse_panel_interpretation_emits( - mapping: &Mapping, - interpretation_idx: usize, -) -> Result, String> { - let Some(items) = mapping - .get(Value::String("emits".to_owned())) - .and_then(Value::as_sequence) - else { - return Ok(Vec::new()); - }; - let mut emits = Vec::new(); - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { - return Err(format!( - "interpretations[{interpretation_idx}].emits[{idx}] must be a mapping" - )); - }; - emits.push(PanelInterpretationEmit { - key: mapping_required_string( - mapping, - "key", - idx, - &format!("interpretations[{interpretation_idx}].emits"), - )?, - label: mapping - .get(Value::String("label".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - value_type: mapping - .get(Value::String("value_type".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - format: mapping - .get(Value::String("format".to_owned())) - .and_then(Value::as_str) - .map(ToOwned::to_owned), - }); - } - Ok(emits) -} - -fn mapping_sequence_of_strings( - mapping: 
&Mapping, - field: &str, - idx: usize, - parent: &str, -) -> Result, String> { - let value = mapping - .get(Value::String(field.to_owned())) - .ok_or_else(|| format!("{parent}[{idx}].{field} is required"))?; - let items = value - .as_sequence() - .ok_or_else(|| format!("{parent}[{idx}].{field} must be a sequence"))?; - items - .iter() - .enumerate() - .map(|(item_idx, item)| { - item.as_str() - .map(ToOwned::to_owned) - .ok_or_else(|| format!("{parent}[{idx}].{field}[{item_idx}] must be a string")) - }) - .collect() -} - -fn mapping_required_string( - mapping: &Mapping, - field: &str, - idx: usize, - parent: &str, -) -> Result { - mapping - .get(Value::String(field.to_owned())) - .and_then(Value::as_str) - .filter(|value| !value.trim().is_empty()) - .map(ToOwned::to_owned) - .ok_or_else(|| format!("{parent}[{idx}].{field} missing or empty")) -} - -fn validate_url_string( - value: &str, - path: &str, - require_origin_only: bool, - issues: &mut Vec, -) { - let normalized = if require_origin_only { - normalize_origin(value) - } else { - normalize_download_url(value) - }; - if let Err(message) = normalized { - issues.push(Issue { - severity: Severity::Error, - path: path.to_owned(), - message, - }); - } -} - -fn normalize_origin(value: &str) -> Result { - let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; - if !matches!(url.scheme(), "http" | "https") { - return Err("expected http or https origin".to_owned()); - } - if url.host_str().is_none() { - return Err("origin is missing host".to_owned()); - } - if url.path() != "/" || url.query().is_some() || url.fragment().is_some() { - return Err("expected origin only, without path, query, or fragment".to_owned()); - } - let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); - if let Some(port) = url.port() { - let _ = write!(origin, ":{port}"); - } - Ok(origin) -} - -fn normalize_download_url(value: &str) -> Result { - let url = Url::parse(value).map_err(|err| 
format!("invalid URL: {err}"))?; - if !matches!(url.scheme(), "http" | "https") { - return Err("expected http or https URL".to_owned()); - } - if url.host_str().is_none() { - return Err("URL is missing host".to_owned()); - } - let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); - if let Some(port) = url.port() { - let _ = write!(origin, ":{port}"); - } - Ok(origin) -} - -fn is_allowed_chromosome(value: &str) -> bool { - matches!(value, "X" | "Y" | "MT") - || value - .parse::() - .is_ok_and(|chrom| (1..=22).contains(&chrom)) -} - -fn is_base_allele(value: &str) -> bool { - matches!(value, "A" | "C" | "G" | "T") -} - -fn is_rsid(value: &str) -> bool { - value.starts_with("rs") && value[2..].chars().all(|ch| ch.is_ascii_digit()) -} - -fn is_sha256(value: &str) -> bool { - value.len() == 64 - && value - .chars() - .all(|ch| ch.is_ascii_hexdigit() && !ch.is_ascii_uppercase()) -} - -fn i64_at_mapping(mapping: &Mapping, key: &str) -> Option { - mapping - .get(Value::String(key.to_owned())) - .and_then(Value::as_i64) -} - -fn required_non_empty_string(root: &Value, path: &[&str]) -> Result { - scalar_at(root, path) - .filter(|value| !value.trim().is_empty()) - .ok_or_else(|| format!("{} missing or empty", path.join("."))) -} - -fn render_single_manifest_errors(path: &Path, issues: &[Issue]) -> String { - let mut out = format!("invalid manifest {}:\n", path.display()); - for issue in issues { - let _ = writeln!( - out, - " - [{}] {}: {}", - issue.severity, issue.path, issue.message - ); - } - out -} - -fn load_yaml(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read {}: {err}", path.display()))?; - serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) -} - -fn require_const(root: &Value, path: &[&str], expected: &str, issues: &mut Vec) { - match scalar_at(root, path) { - Some(actual) if actual == expected => {} - Some(actual) => 
issues.push(Issue { - severity: Severity::Error, - path: path.join("."), - message: format!("expected '{expected}', found '{actual}'"), - }), - None => issues.push(Issue { - severity: Severity::Error, - path: path.join("."), - message: "missing required field".to_owned(), - }), - } -} - -fn require_path(root: &Value, path: &[&str], issues: &mut Vec) { - if value_at(root, path).is_none() { - issues.push(Issue { - severity: Severity::Error, - path: path.join("."), - message: "missing required field".to_owned(), - }); - } -} - -fn value_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Value> { - let mut current = root; - for key in path { - let mapping = current.as_mapping()?; - current = mapping.get(Value::String((*key).to_owned()))?; - } - Some(current) -} - -fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Mapping> { - value_at(root, path)?.as_mapping() -} - -fn scalar_at(root: &Value, path: &[&str]) -> Option { - value_at(root, path).and_then(|value| match value { - Value::String(text) => Some(text.clone()), - Value::Number(number) => Some(number.to_string()), - _ => None, - }) -} - -fn seq_of_strings(root: &Value, path: &[&str]) -> Option> { - value_at(root, path)?.as_sequence().map(|items| { - items - .iter() - .filter_map(|item| item.as_str().map(ToOwned::to_owned)) - .collect() - }) -} +// Keep validator source files small and grouped by schema responsibility. +// If a file approaches 500 lines, split it by validation domain rather than +// creating arbitrary numbered chunks. 
+include!("validator_types.rs"); +include!("validator_load.rs"); +include!("validator_roots.rs"); +include!("validator_alleles_findings.rs"); +include!("validator_panel.rs"); +include!("validator_parse.rs"); +include!("validator_helpers.rs"); diff --git a/rust/bioscript-schema/src/validator_alleles.rs b/rust/bioscript-schema/src/validator_alleles.rs new file mode 100644 index 0000000..12b2759 --- /dev/null +++ b/rust/bioscript-schema/src/validator_alleles.rs @@ -0,0 +1,146 @@ +fn validate_alleles(root: &Value, issues: &mut Vec) { + require_path(root, &["alleles"], issues); + require_path(root, &["alleles", "kind"], issues); + require_path(root, &["alleles", "ref"], issues); + require_path(root, &["alleles", "alts"], issues); + + let Some(kind) = scalar_at(root, &["alleles", "kind"]) else { + return; + }; + if !matches!(kind.as_str(), "snv" | "deletion" | "insertion" | "indel") { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.kind".to_owned(), + message: "expected one of snv, deletion, insertion, indel".to_owned(), + }); + } + + if value_at(root, &["alleles", "canonical_alt"]).is_some() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.canonical_alt".to_owned(), + message: "canonical_alt is not part of the current schema".to_owned(), + }); + } + + let Some(reference) = scalar_at(root, &["alleles", "ref"]) else { + return; + }; + if reference.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "empty string".to_owned(), + }); + } + + let Some(alts_value) = value_at(root, &["alleles", "alts"]) else { + return; + }; + let Some(alts_seq) = alts_value.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: "expected a non-empty sequence of strings".to_owned(), + }); + return; + }; + if alts_seq.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: 
"expected at least one alternate allele".to_owned(), + }); + return; + } + + let mut alts = Vec::new(); + for (idx, item) in alts_seq.iter().enumerate() { + let Some(alt) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if alt.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "empty string".to_owned(), + }); + continue; + } + alts.push(alt.to_owned()); + } + let observed_alts = match seq_of_strings(root, &["alleles", "observed_alts"]) { + Some(items) => { + if items.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.observed_alts".to_owned(), + message: "expected a non-empty sequence of strings when present".to_owned(), + }); + } + for alt in &alts { + if !items.iter().any(|item| item == alt) { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.observed_alts".to_owned(), + message: format!("significant alt '{alt}' is not present in observed_alts"), + }); + } + } + items + } + None => alts.clone(), + }; + validate_symbolic_alleles(&reference, &observed_alts, issues); + validate_snv_alleles(&kind, &reference, &observed_alts, issues); +} + +fn validate_symbolic_alleles(reference: &str, alts: &[String], issues: &mut Vec) { + if reference == "I" || reference == "D" { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" + .to_owned(), + }); + } + for (idx, alt) in alts.iter().enumerate() { + if alt == "I" || alt == "D" { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: + "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" + .to_owned(), + }); + } + } +} + +fn validate_snv_alleles(kind: &str, reference: &str, alts: &[String], 
issues: &mut Vec) { + if kind != "snv" { + return; + } + if !is_base_allele(reference) { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "snv ref must be one of A/C/G/T".to_owned(), + }); + } + for (idx, alt) in alts.iter().enumerate() { + if !is_base_allele(alt) { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "snv alt must be one of A/C/G/T".to_owned(), + }); + } + } +} + diff --git a/rust/bioscript-schema/src/validator_alleles_findings.rs b/rust/bioscript-schema/src/validator_alleles_findings.rs new file mode 100644 index 0000000..94368b1 --- /dev/null +++ b/rust/bioscript-schema/src/validator_alleles_findings.rs @@ -0,0 +1,3 @@ +include!("validator_alleles.rs"); +include!("validator_findings.rs"); +include!("validator_resources.rs"); diff --git a/rust/bioscript-schema/src/validator_findings.rs b/rust/bioscript-schema/src/validator_findings.rs new file mode 100644 index 0000000..185649f --- /dev/null +++ b/rust/bioscript-schema/src/validator_findings.rs @@ -0,0 +1,246 @@ +fn validate_findings(root: &Value, issues: &mut Vec) { + let alts = seq_of_strings(root, &["alleles", "alts"]).unwrap_or_default(); + let Some(findings) = value_at(root, &["findings"]).and_then(Value::as_sequence) else { + return; + }; + + for (idx, finding) in findings.iter().enumerate() { + let Some(mapping) = finding.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + + let Some(schema) = mapping + .get(Value::String("schema".to_owned())) + .and_then(Value::as_str) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "missing schema".to_owned(), + }); + continue; + }; + if schema.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "empty 
string".to_owned(), + }); + } + if schema == "bioscript:pgx:1.0" { + issues.push(Issue { + severity: Severity::Warning, + path: format!("findings[{idx}].schema"), + message: "legacy PGx finding schema; prefer bioscript:pgx-summary:1.0 or bioscript:pgx-label:1.0".to_owned(), + }); + } + if let Some(alt) = mapping + .get(Value::String("alt".to_owned())) + .and_then(Value::as_str) + && !alts.is_empty() + && alt != "*" + && !alts.iter().any(|item| item == alt) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].alt"), + message: format!("finding alt '{alt}' is not present in alleles.alts {alts:?}"), + }); + } + let has_summary = mapping + .get(Value::String("summary".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + let has_notes = mapping + .get(Value::String("notes".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + if !has_summary && !has_notes { + issues.push(Issue { + severity: Severity::Warning, + path: format!("findings[{idx}]"), + message: "finding has neither summary nor notes".to_owned(), + }); + } + validate_finding_binding(&format!("findings[{idx}]"), mapping, issues); + validate_finding_effects(idx, mapping, issues); + } +} + +fn validate_finding_effects(idx: usize, mapping: &Mapping, issues: &mut Vec) { + let Some(effects) = mapping.get(Value::String("effects".to_owned())) else { + return; + }; + let Some(effects) = effects.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].effects"), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (effect_idx, effect) in effects.iter().enumerate() { + let Some(effect) = effect.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].effects[{effect_idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_finding_binding( + 
&format!("findings[{idx}].effects[{effect_idx}]"), + effect, + issues, + ); + } +} + +fn validate_finding_binding(parent: &str, mapping: &Mapping, issues: &mut Vec) { + let Some(binding) = mapping.get(Value::String("binding".to_owned())) else { + return; + }; + let Some(binding) = binding.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_required_mapping_string(binding, "source", &format!("{parent}.binding"), issues); + validate_finding_binding_source(parent, binding, issues); + validate_finding_binding_operator(parent, binding, issues); +} + +fn validate_finding_binding_source(parent: &str, binding: &Mapping, issues: &mut Vec) { + let source = binding + .get(Value::String("source".to_owned())) + .and_then(Value::as_str); + match source { + Some("variant") => { + if !binding.contains_key(Value::String("variant".to_owned())) + && !binding.contains_key(Value::String("path".to_owned())) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.variant"), + message: "variant findings require variant or path".to_owned(), + }); + } + } + Some("analysis") => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + validate_required_mapping_string( + binding, + "analysis_id", + &format!("{parent}.binding"), + issues, + ); + } + Some(other) => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.source"), + message: format!("unsupported source '{other}'"), + }), + None => {} + } +} + +fn validate_finding_binding_operator(parent: &str, binding: &Mapping, issues: &mut Vec) { + let operator = binding + .get(Value::String("operator".to_owned())) + .and_then(Value::as_str) + .unwrap_or("equals"); + match operator { + "equals" => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + if 
!binding.contains_key(Value::String("value".to_owned())) { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.value"), + message: "equals requires value".to_owned(), + }); + } + } + "in" => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + let values = binding + .get(Value::String("values".to_owned())) + .and_then(Value::as_sequence); + if values.is_none_or(Vec::is_empty) { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.values"), + message: "in requires non-empty values".to_owned(), + }); + } + } + "dosage_equals" => { + if binding + .get(Value::String("allele".to_owned())) + .and_then(Value::as_str) + .is_none_or(|value| value.trim().is_empty()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.allele"), + message: "dosage_equals requires allele".to_owned(), + }); + } + if binding + .get(Value::String("value".to_owned())) + .and_then(Value::as_i64) + .is_none_or(|value| !(0..=2).contains(&value)) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.value"), + message: "dosage_equals requires integer value 0, 1, or 2".to_owned(), + }); + } + } + "dosage_in" => { + if binding + .get(Value::String("allele".to_owned())) + .and_then(Value::as_str) + .is_none_or(|value| value.trim().is_empty()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.allele"), + message: "dosage_in requires allele".to_owned(), + }); + } + let values = binding + .get(Value::String("values".to_owned())) + .and_then(Value::as_sequence); + let invalid_values = match values { + Some(items) if !items.is_empty() => items + .iter() + .any(|value| value.as_i64().is_none_or(|n| !(0..=2).contains(&n))), + _ => true, + }; + if invalid_values { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.values"), + message: "dosage_in requires integer 
values from 0 to 2".to_owned(), + }); + } + } + other => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.operator"), + message: format!( + "unsupported operator '{other}'; expected 'equals', 'in', 'dosage_equals', or 'dosage_in'" + ), + }), + } +} + diff --git a/rust/bioscript-schema/src/validator_helpers.rs b/rust/bioscript-schema/src/validator_helpers.rs new file mode 100644 index 0000000..c3d8492 --- /dev/null +++ b/rust/bioscript-schema/src/validator_helpers.rs @@ -0,0 +1,161 @@ +fn validate_url_string( + value: &str, + path: &str, + require_origin_only: bool, + issues: &mut Vec, +) { + let normalized = if require_origin_only { + normalize_origin(value) + } else { + normalize_download_url(value) + }; + if let Err(message) = normalized { + issues.push(Issue { + severity: Severity::Error, + path: path.to_owned(), + message, + }); + } +} + +fn normalize_origin(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https origin".to_owned()); + } + if url.host_str().is_none() { + return Err("origin is missing host".to_owned()); + } + if url.path() != "/" || url.query().is_some() || url.fragment().is_some() { + return Err("expected origin only, without path, query, or fragment".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() { + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +fn normalize_download_url(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https URL".to_owned()); + } + if url.host_str().is_none() { + return Err("URL is missing host".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() 
{ + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +fn is_allowed_chromosome(value: &str) -> bool { + matches!(value, "X" | "Y" | "MT") + || value + .parse::() + .is_ok_and(|chrom| (1..=22).contains(&chrom)) +} + +fn is_base_allele(value: &str) -> bool { + matches!(value, "A" | "C" | "G" | "T") +} + +fn is_rsid(value: &str) -> bool { + value.starts_with("rs") && value[2..].chars().all(|ch| ch.is_ascii_digit()) +} + +fn is_sha256(value: &str) -> bool { + value.len() == 64 + && value + .chars() + .all(|ch| ch.is_ascii_hexdigit() && !ch.is_ascii_uppercase()) +} + +fn i64_at_mapping(mapping: &Mapping, key: &str) -> Option { + mapping + .get(Value::String(key.to_owned())) + .and_then(Value::as_i64) +} + +fn required_non_empty_string(root: &Value, path: &[&str]) -> Result { + scalar_at(root, path) + .filter(|value| !value.trim().is_empty()) + .ok_or_else(|| format!("{} missing or empty", path.join("."))) +} + +fn render_single_manifest_errors(path: &Path, issues: &[Issue]) -> String { + let mut out = format!("invalid manifest {}:\n", path.display()); + for issue in issues { + let _ = writeln!( + out, + " - [{}] {}: {}", + issue.severity, issue.path, issue.message + ); + } + out +} + +fn load_yaml(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + +fn require_const(root: &Value, path: &[&str], expected: &str, issues: &mut Vec) { + match scalar_at(root, path) { + Some(actual) if actual == expected => {} + Some(actual) => issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: format!("expected '{expected}', found '{actual}'"), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: "missing required field".to_owned(), + }), + } +} + +fn require_path(root: &Value, path: &[&str], issues: &mut Vec) { + 
if value_at(root, path).is_none() { + issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: "missing required field".to_owned(), + }); + } +} + +fn value_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Value> { + let mut current = root; + for key in path { + let mapping = current.as_mapping()?; + current = mapping.get(Value::String((*key).to_owned()))?; + } + Some(current) +} + +fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Mapping> { + value_at(root, path)?.as_mapping() +} + +fn scalar_at(root: &Value, path: &[&str]) -> Option { + value_at(root, path).and_then(|value| match value { + Value::String(text) => Some(text.clone()), + Value::Number(number) => Some(number.to_string()), + _ => None, + }) +} + +fn seq_of_strings(root: &Value, path: &[&str]) -> Option> { + value_at(root, path)?.as_sequence().map(|items| { + items + .iter() + .filter_map(|item| item.as_str().map(ToOwned::to_owned)) + .collect() + }) +} diff --git a/rust/bioscript-schema/src/validator_load.rs b/rust/bioscript-schema/src/validator_load.rs new file mode 100644 index 0000000..f3532ca --- /dev/null +++ b/rust/bioscript-schema/src/validator_load.rs @@ -0,0 +1,347 @@ +/// Validate a variant file or directory of variant files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +pub fn validate_variants_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Variant) +} + +/// Validate a panel file or directory of panel files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +/// Validate a panel file or directory of panel files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. 
+pub fn validate_panels_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Panel) +} + +/// Validate an assay file or directory of assay files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +/// Validate an assay file or directory of assay files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +pub fn validate_assays_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Assay) +} + +/// Load a single variant manifest from YAML. +/// +/// # Errors +/// +/// Returns an error when the file does not parse or is not a valid variant +/// manifest. +/// Load a variant manifest from a YAML file. +/// +/// # Errors +/// +/// Returns an error when the file cannot be read, parsed, or converted into a +/// valid variant manifest shape. +pub fn load_variant_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + variant_manifest_from_root(path, &value) +} + +/// Load a single variant manifest from YAML text. +/// +/// # Errors +/// +/// Returns an error when the text does not parse or is not a valid variant +/// manifest. +/// Load a variant manifest from YAML text. +/// +/// # Errors +/// +/// Returns an error when the text cannot be parsed or converted into a valid +/// variant manifest shape. +pub fn load_variant_manifest_text(name: &str, text: &str) -> Result { + let value: Value = + serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; + variant_manifest_from_root(Path::new(name), &value) +} + +/// Compile a variant manifest from YAML text for lookup execution. +/// +/// This validates the execution-critical fields only: identity, identifiers, +/// coordinates, and alleles. Full manifest validation still reports metadata +/// issues such as missing finding schemas, but those do not block local lookup. 
+/// +/// # Errors +/// +/// Returns an error when the text does not parse or the execution-critical +/// fields are invalid. +pub fn load_variant_manifest_text_for_lookup( + name: &str, + text: &str, +) -> Result { + let value: Value = + serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; + let path = Path::new(name); + let mut issues = Vec::new(); + validate_schema_and_identity( + &value, + "bioscript:variant:1.0", + Some("bioscript:variant"), + &mut issues, + ); + validate_identifiers(&value, &mut issues); + validate_coordinates(&value, &mut issues); + validate_alleles(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(VariantManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + spec: variant_spec_from_root(&value)?, + }) +} + +fn variant_manifest_from_root(path: &Path, value: &Value) -> Result { + let mut issues = Vec::new(); + validate_variant_root(value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(VariantManifest { + path: path.to_path_buf(), + name: required_non_empty_string(value, &["name"])?, + tags: seq_of_strings(value, &["tags"]).unwrap_or_default(), + spec: variant_spec_from_root(value)?, + }) +} + +/// Load a single panel manifest from YAML. +/// +/// # Errors +/// +/// Returns an error when the file does not parse or is not a valid panel +/// manifest. +/// Load a panel manifest from a YAML file. +/// +/// # Errors +/// +/// Returns an error when the file cannot be read, parsed, or converted into a +/// valid panel manifest shape. 
+pub fn load_panel_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + let mut issues = Vec::new(); + validate_panel_root(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + let permissions = Permissions { + domains: seq_of_strings(&value, &["permissions", "domains"]).unwrap_or_default(), + }; + let downloads = parse_downloads(&value)?; + let members = parse_panel_members(&value)?; + let interpretations = parse_panel_interpretations(&value)?; + + Ok(PanelManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + permissions, + downloads, + members, + interpretations, + }) +} + +/// Load a single assay manifest from YAML. +/// +/// # Errors +/// +/// Returns an error when the file does not parse or is not a valid assay +/// manifest. +/// Load an assay manifest from a YAML file. +/// +/// # Errors +/// +/// Returns an error when the file cannot be read, parsed, or converted into a +/// valid assay manifest shape. 
+pub fn load_assay_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + let mut issues = Vec::new(); + validate_assay_root(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(AssayManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + members: parse_panel_members(&value)?, + interpretations: parse_panel_interpretations(&value)?, + }) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ManifestSelector { + Assay, + Variant, + Panel, +} + +fn validate_manifest_path( + path: &Path, + selector: ManifestSelector, +) -> Result { + let files = collect_yaml_files(path)?; + let mut reports = Vec::new(); + for file in &files { + let report = match selector { + ManifestSelector::Assay => validate_assay_file(file)?, + ManifestSelector::Variant => validate_variant_file(file)?, + ManifestSelector::Panel => validate_panel_file(file)?, + }; + if !report.issues.is_empty() { + reports.push(report); + } + } + Ok(ValidationReport { + files_scanned: files.len(), + reports, + }) +} + +fn collect_yaml_files(path: &Path) -> Result, String> { + if path.is_file() { + return Ok(vec![path.to_path_buf()]); + } + + let mut files = Vec::new(); + collect_yaml_files_recursive(path, &mut files)?; + files.sort(); + Ok(files) +} + +fn collect_yaml_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { + let entries = fs::read_dir(path) + .map_err(|err| format!("failed to read directory {}: {err}", path.display()))?; + for entry in entries { + let entry = entry.map_err(|err| format!("failed to read directory entry: {err}"))?; + let entry_path = entry.path(); + if entry_path.is_dir() { + collect_yaml_files_recursive(&entry_path, files)?; + continue; + } + if entry_path.extension().is_some_and(|extension| { + ["yaml", "yml"] + .iter() + .any(|item| 
extension.eq_ignore_ascii_case(item)) + }) { + files.push(entry_path); + } + } + Ok(()) +} + +fn validate_assay_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("assay") { + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } + + let mut issues = Vec::new(); + validate_assay_root(&value, &mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} + +fn validate_variant_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("variant") { + if schema == "bioscript:pgx-findings:1.0" { + let mut issues = Vec::new(); + validate_pgx_findings_root(&value, &mut issues); + return Ok(FileReport { + file: path.to_path_buf(), + issues, + }); + } + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } + + let mut issues = Vec::new(); + validate_variant_root(&value, &mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} + +fn validate_panel_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("panel") { + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } + + let mut issues = Vec::new(); + validate_panel_root(&value, 
&mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} diff --git a/rust/bioscript-schema/src/validator_panel.rs b/rust/bioscript-schema/src/validator_panel.rs new file mode 100644 index 0000000..4ef3c43 --- /dev/null +++ b/rust/bioscript-schema/src/validator_panel.rs @@ -0,0 +1,450 @@ +fn validate_panel_members(root: &Value, allowed_kinds: &[&str], issues: &mut Vec) { + let Some(members) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "missing required field".to_owned(), + }); + return; + }; + if members.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "expected at least one member".to_owned(), + }); + return; + } + + let download_ids = panel_download_ids(root); + + for (idx, item) in members.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_panel_member(idx, mapping, allowed_kinds, &download_ids, issues); + } +} + +fn panel_download_ids(root: &Value) -> BTreeSet { + value_at(root, &["downloads"]) + .and_then(Value::as_sequence) + .into_iter() + .flatten() + .filter_map(|item| { + item.as_mapping()? 
+ .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .collect() +} + +fn validate_panel_member( + idx: usize, + mapping: &Mapping, + allowed_kinds: &[&str], + download_ids: &BTreeSet, + issues: &mut Vec, +) { + let kind = mapping + .get(Value::String("kind".to_owned())) + .and_then(Value::as_str); + match kind { + Some(kind) if allowed_kinds.contains(&kind) => {} + Some(other) => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: format!("unsupported member kind '{other}'"), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: "missing required field".to_owned(), + }), + } + + let path_value = mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str); + let download_value = mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str); + if path_value.is_some() == download_value.is_some() { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected exactly one of path or download".to_owned(), + }); + } + validate_panel_member_path(idx, path_value, issues); + validate_panel_member_download(idx, download_value, download_ids, issues); + validate_panel_member_metadata(idx, mapping, issues); +} + +fn validate_panel_member_path(idx: usize, path_value: Option<&str>, issues: &mut Vec) { + if let Some(path) = path_value + && path.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].path"), + message: "empty string".to_owned(), + }); + } +} + +fn validate_panel_member_download( + idx: usize, + download_value: Option<&str>, + download_ids: &BTreeSet, + issues: &mut Vec, +) { + let Some(download) = download_value else { + return; + }; + if download.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: "empty 
string".to_owned(), + }); + } else if !download_ids.contains(download) { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: format!("unknown download id '{download}'"), + }); + } +} + +fn validate_panel_member_metadata(idx: usize, mapping: &Mapping, issues: &mut Vec) { + if let Some(version) = mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + && version.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].version"), + message: "empty string".to_owned(), + }); + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } +} + +fn validate_panel_interpretations(root: &Value, issues: &mut Vec) { + if value_at(root, &["analyses"]).is_some() && value_at(root, &["interpretations"]).is_some() { + issues.push(Issue { + severity: Severity::Warning, + path: "interpretations".to_owned(), + message: "use analyses instead of interpretations; do not define both".to_owned(), + }); + } + let key = if value_at(root, &["analyses"]).is_some() { + "analyses" + } else { + "interpretations" + }; + let Some(items) = value_at(root, &[key]) else { + return; + }; + let Some(items) = items.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: key.to_owned(), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_panel_interpretation(key, idx, mapping, issues); + } +} + +fn validate_panel_interpretation( + key: &str, + idx: usize, + 
mapping: &Mapping, + issues: &mut Vec, +) { + for field in ["id", "kind", "path"] { + validate_required_mapping_string(mapping, field, &format!("{key}[{idx}]"), issues); + } + if let Some(kind) = mapping + .get(Value::String("kind".to_owned())) + .and_then(Value::as_str) + && kind != "bioscript" + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].kind"), + message: "expected 'bioscript'".to_owned(), + }); + } + if let Some(output_format) = mapping + .get(Value::String("output_format".to_owned())) + .and_then(Value::as_str) + && !matches!(output_format, "tsv" | "json" | "jsonl") + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].output_format"), + message: "expected 'tsv', 'json', or 'jsonl'".to_owned(), + }); + } + let Some(derived_from) = mapping + .get(Value::String("derived_from".to_owned())) + .and_then(Value::as_sequence) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from"), + message: "expected a non-empty sequence of strings".to_owned(), + }); + return; + }; + if derived_from.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from"), + message: "expected at least one source variant".to_owned(), + }); + } + for (source_idx, source) in derived_from.iter().enumerate() { + match source.as_str() { + Some(value) if !value.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from[{source_idx}]"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from[{source_idx}]"), + message: "expected string".to_owned(), + }), + } + } + validate_panel_interpretation_logic(key, idx, mapping, issues); + validate_panel_interpretation_emits(key, idx, mapping, issues); +} + +fn validate_panel_interpretation_logic( + key: &str, + idx: usize, + mapping: 
&Mapping, + issues: &mut Vec, +) { + let Some(logic) = mapping.get(Value::String("logic".to_owned())) else { + return; + }; + let Some(logic) = logic.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].logic"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_optional_mapping_string(logic, "description", &format!("{key}[{idx}].logic"), issues); + let Some(source) = logic.get(Value::String("source".to_owned())) else { + return; + }; + let Some(source) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].logic.source"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_optional_mapping_string( + source, + "name", + &format!("{key}[{idx}].logic.source"), + issues, + ); + validate_optional_mapping_string(source, "url", &format!("{key}[{idx}].logic.source"), issues); + if let Some(url) = source + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + validate_url_string( + url, + &format!("{key}[{idx}].logic.source.url"), + false, + issues, + ); + } +} + +fn validate_panel_interpretation_emits( + key: &str, + idx: usize, + mapping: &Mapping, + issues: &mut Vec, +) { + let Some(emits) = mapping.get(Value::String("emits".to_owned())) else { + return; + }; + let Some(emits) = emits.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].emits"), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (emit_idx, item) in emits.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].emits[{emit_idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_required_mapping_string( + mapping, + "key", + &format!("{key}[{idx}].emits[{emit_idx}]"), + issues, + ); + for field in ["label", "value_type", "format"] { + 
validate_optional_mapping_string( + mapping, + field, + &format!("{key}[{idx}].emits[{emit_idx}]"), + issues, + ); + } + } +} + +fn validate_required_mapping_string( + mapping: &Mapping, + field: &str, + parent: &str, + issues: &mut Vec, +) { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(value) if !value.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "missing required field".to_owned(), + }), + } +} + +fn validate_optional_mapping_string( + mapping: &Mapping, + field: &str, + parent: &str, + issues: &mut Vec, +) { + if let Some(value) = mapping.get(Value::String(field.to_owned())) { + match value.as_str() { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Warning, + path: format!("{parent}.{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "expected string".to_owned(), + }), + } + } +} + +fn variant_spec_from_root(root: &Value) -> Result { + let rsids = seq_of_strings(root, &["identifiers", "rsids"]).unwrap_or_default(); + let grch37 = locus_from_root(root, "grch37")?; + let grch38 = locus_from_root(root, "grch38")?; + let reference = scalar_at(root, &["alleles", "ref"]); + let alternate = seq_of_strings(root, &["alleles", "observed_alts"]) + .or_else(|| seq_of_strings(root, &["alleles", "alts"])) + .and_then(|alts| alts.first().cloned()); + let deletion_length = value_at(root, &["alleles", "deletion_length"]) + .and_then(Value::as_u64) + .and_then(|value| usize::try_from(value).ok()); + let motifs = seq_of_strings(root, &["alleles", "motifs"]).unwrap_or_default(); + let kind = scalar_at(root, &["alleles", "kind"]).map(|kind| match 
kind.as_str() { + "snv" => VariantKind::Snp, + "deletion" => VariantKind::Deletion, + "insertion" => VariantKind::Insertion, + "indel" => VariantKind::Indel, + _ => VariantKind::Other, + }); + + Ok(VariantSpec { + rsids, + grch37, + grch38, + reference, + alternate, + kind, + deletion_length, + motifs, + }) +} + +fn locus_from_root(root: &Value, assembly: &str) -> Result, String> { + let Some(mapping) = mapping_at(root, &["coordinates", assembly]) else { + return Ok(None); + }; + let chrom = mapping + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + .ok_or_else(|| format!("coordinates.{assembly}.chrom missing"))?; + let (start, end) = if let Some(pos) = i64_at_mapping(mapping, "pos") { + (pos, pos) + } else { + let start = i64_at_mapping(mapping, "start") + .ok_or_else(|| format!("coordinates.{assembly}.start missing"))?; + let end = i64_at_mapping(mapping, "end") + .ok_or_else(|| format!("coordinates.{assembly}.end missing"))?; + (start, end) + }; + Ok(Some(GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + })) +} + diff --git a/rust/bioscript-schema/src/validator_parse.rs b/rust/bioscript-schema/src/validator_parse.rs new file mode 100644 index 0000000..f31f28f --- /dev/null +++ b/rust/bioscript-schema/src/validator_parse.rs @@ -0,0 +1,201 @@ +fn parse_downloads(root: &Value) -> Result, String> { + let mut downloads = Vec::new(); + let Some(items) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return Ok(downloads); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("downloads[{idx}] must be a mapping")); + }; + let id = mapping_required_string(mapping, "id", idx, "downloads")?; + let url = mapping_required_string(mapping, "url", idx, "downloads")?; + let sha256 = mapping_required_string(mapping, "sha256", idx, "downloads")?; + let version = mapping_required_string(mapping, "version", idx, "downloads")?; + let origin = 
normalize_download_url(&url)?; + downloads.push(Download { + id, + url, + origin, + sha256, + version, + }); + } + Ok(downloads) +} + +fn parse_panel_members(root: &Value) -> Result, String> { + let mut members = Vec::new(); + let Some(items) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + return Ok(members); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("members[{idx}] must be a mapping")); + }; + members.push(PanelMember { + kind: mapping_required_string(mapping, "kind", idx, "members")?, + path: mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + download: mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + sha256: mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + version: mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }); + } + Ok(members) +} + +fn parse_panel_interpretations(root: &Value) -> Result, String> { + let mut interpretations = Vec::new(); + let key = if value_at(root, &["analyses"]).is_some() { + "analyses" + } else { + "interpretations" + }; + let Some(items) = value_at(root, &[key]).and_then(Value::as_sequence) else { + return Ok(interpretations); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("{key}[{idx}] must be a mapping")); + }; + interpretations.push(PanelInterpretation { + id: mapping_required_string(mapping, "id", idx, key)?, + kind: mapping_required_string(mapping, "kind", idx, key)?, + path: mapping_required_string(mapping, "path", idx, key)?, + output_format: mapping + .get(Value::String("output_format".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + derived_from: mapping_sequence_of_strings(mapping, "derived_from", idx, 
key)?, + emits: parse_panel_interpretation_emits(mapping, idx)?, + logic: parse_panel_interpretation_logic(mapping)?, + }); + } + Ok(interpretations) +} + +fn parse_panel_interpretation_logic( + mapping: &Mapping, +) -> Result, String> { + let Some(logic) = mapping.get(Value::String("logic".to_owned())) else { + return Ok(None); + }; + let Some(logic_mapping) = logic.as_mapping() else { + return Err("analysis logic must be a mapping".to_owned()); + }; + let source = match logic_mapping.get(Value::String("source".to_owned())) { + Some(source) => { + let Some(source_mapping) = source.as_mapping() else { + return Err("analysis logic.source must be a mapping".to_owned()); + }; + Some(PanelInterpretationLogicSource { + name: source_mapping + .get(Value::String("name".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + url: source_mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }) + } + None => None, + }; + Ok(Some(PanelInterpretationLogic { + source, + description: logic_mapping + .get(Value::String("description".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + })) +} + +fn parse_panel_interpretation_emits( + mapping: &Mapping, + interpretation_idx: usize, +) -> Result, String> { + let Some(items) = mapping + .get(Value::String("emits".to_owned())) + .and_then(Value::as_sequence) + else { + return Ok(Vec::new()); + }; + let mut emits = Vec::new(); + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!( + "interpretations[{interpretation_idx}].emits[{idx}] must be a mapping" + )); + }; + emits.push(PanelInterpretationEmit { + key: mapping_required_string( + mapping, + "key", + idx, + &format!("interpretations[{interpretation_idx}].emits"), + )?, + label: mapping + .get(Value::String("label".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + value_type: mapping + 
.get(Value::String("value_type".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + format: mapping + .get(Value::String("format".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }); + } + Ok(emits) +} + +fn mapping_sequence_of_strings( + mapping: &Mapping, + field: &str, + idx: usize, + parent: &str, +) -> Result, String> { + let value = mapping + .get(Value::String(field.to_owned())) + .ok_or_else(|| format!("{parent}[{idx}].{field} is required"))?; + let items = value + .as_sequence() + .ok_or_else(|| format!("{parent}[{idx}].{field} must be a sequence"))?; + items + .iter() + .enumerate() + .map(|(item_idx, item)| { + item.as_str() + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{parent}[{idx}].{field}[{item_idx}] must be a string")) + }) + .collect() +} + +fn mapping_required_string( + mapping: &Mapping, + field: &str, + idx: usize, + parent: &str, +) -> Result { + mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + .filter(|value| !value.trim().is_empty()) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{parent}[{idx}].{field} missing or empty")) +} + diff --git a/rust/bioscript-schema/src/validator_resources.rs b/rust/bioscript-schema/src/validator_resources.rs new file mode 100644 index 0000000..9f5ff22 --- /dev/null +++ b/rust/bioscript-schema/src/validator_resources.rs @@ -0,0 +1,173 @@ +fn validate_provenance(root: &Value, issues: &mut Vec) { + let Some(sources) = value_at(root, &["provenance", "sources"]).and_then(Value::as_sequence) + else { + return; + }; + for (idx, source) in sources.iter().enumerate() { + let Some(mapping) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["kind", "label", "url"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => 
{} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + validate_url_string( + url, + &format!("provenance.sources[{idx}].url"), + false, + issues, + ); + } + } +} + +fn validate_permissions(root: &Value, issues: &mut Vec) { + let Some(domains) = value_at(root, &["permissions", "domains"]) else { + return; + }; + let Some(items) = domains.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "permissions.domains".to_owned(), + message: "expected a sequence of origins".to_owned(), + }); + return; + }; + let mut seen = BTreeSet::new(); + for (idx, item) in items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + match normalize_origin(value) { + Ok(origin) => { + if !seen.insert(origin.clone()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("permissions.domains[{idx}]"), + message: format!("duplicate origin '{origin}'"), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message, + }), + } + } +} + +fn validate_downloads(root: &Value, issues: &mut Vec) { + let allowed_origins: BTreeSet = seq_of_strings(root, &["permissions", "domains"]) + .unwrap_or_default() + .into_iter() + .filter_map(|domain| normalize_origin(&domain).ok()) + .collect(); + let Some(downloads) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return; + }; + let mut ids = BTreeSet::new(); + for (idx, 
item) in downloads.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["id", "url", "sha256", "version"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + + if let Some(id) = mapping + .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + && !ids.insert(id.to_owned()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].id"), + message: format!("duplicate download id '{id}'"), + }); + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + match normalize_download_url(url) { + Ok(origin) => { + if !allowed_origins.is_empty() && !allowed_origins.contains(&origin) { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message: format!( + "download origin '{origin}' is not listed in permissions.domains" + ), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message, + }), + } + } + } +} + diff --git a/rust/bioscript-schema/src/validator_roots.rs b/rust/bioscript-schema/src/validator_roots.rs new 
file mode 100644 index 0000000..9600eb7 --- /dev/null +++ b/rust/bioscript-schema/src/validator_roots.rs @@ -0,0 +1,334 @@ +fn validate_variant_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity( + root, + "bioscript:variant:1.0", + Some("bioscript:variant"), + issues, + ); + validate_optional_strings(root, &["name", "label", "gene", "summary"], issues); + validate_tags(root, issues); + validate_identifiers(root, issues); + validate_coordinates(root, issues); + validate_alleles(root, issues); + validate_findings(root, issues); + validate_provenance(root, issues); + + let has_identifiers = value_at(root, &["identifiers"]) + .and_then(Value::as_mapping) + .is_some_and(|mapping| !mapping.is_empty()); + let has_coordinates = ["grch37", "grch38"] + .iter() + .any(|assembly| value_at(root, &["coordinates", assembly]).is_some()); + if !has_identifiers && !has_coordinates { + issues.push(Issue { + severity: Severity::Error, + path: "identifiers/coordinates".to_owned(), + message: "expected at least one identifier block or one coordinate block".to_owned(), + }); + } +} + +fn validate_panel_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity(root, "bioscript:panel:1.0", None, issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + validate_tags(root, issues); + validate_permissions(root, issues); + validate_downloads(root, issues); + validate_panel_members(root, &["variant", "assay"], issues); + validate_panel_interpretations(root, issues); + validate_findings(root, issues); +} + +fn validate_assay_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity(root, "bioscript:assay:1.0", None, issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + validate_tags(root, issues); + validate_panel_members(root, &["variant"], issues); + validate_panel_interpretations(root, issues); + validate_findings(root, issues); +} + +fn validate_pgx_findings_root(root: &Value, issues: &mut 
Vec) { + require_const(root, &["schema"], "bioscript:pgx-findings:1.0", issues); + require_const(root, &["version"], "1.0", issues); + validate_optional_strings(root, &["variant", "gene", "rsid", "variant_pa_id"], issues); + if value_at(root, &["variant"]).is_none() && value_at(root, &["rsid"]).is_none() { + issues.push(Issue { + severity: Severity::Error, + path: "variant/rsid".to_owned(), + message: "expected at least one variant identifier".to_owned(), + }); + } + match value_at(root, &["findings"]) { + Some(Value::Sequence(_)) => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: "findings".to_owned(), + message: "expected a sequence of findings".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: "findings".to_owned(), + message: "missing required field".to_owned(), + }), + } + validate_findings(root, issues); +} + +fn validate_schema_and_identity( + root: &Value, + canonical_schema: &str, + legacy_schema: Option<&str>, + issues: &mut Vec, +) { + let schema = scalar_at(root, &["schema"]); + let valid_schema = schema + .as_deref() + .is_some_and(|value| value == canonical_schema || legacy_schema == Some(value)); + if !valid_schema { + issues.push(Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: format!("expected schema to be '{canonical_schema}'"), + }); + } + if let Some(legacy_schema) = legacy_schema + && matches!(schema.as_deref(), Some(value) if value == legacy_schema) + { + issues.push(Issue { + severity: Severity::Warning, + path: "schema".to_owned(), + message: format!("legacy schema value '{legacy_schema}'; prefer '{canonical_schema}'"), + }); + } + require_const(root, &["version"], "1.0", issues); + match scalar_at(root, &["name"]) { + Some(name) if !name.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: "name".to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + 
path: "name".to_owned(), + message: "missing required field".to_owned(), + }), + } + if value_at(root, &["variant_id"]).is_some() { + issues.push(Issue { + severity: Severity::Warning, + path: "variant_id".to_owned(), + message: "variant_id is legacy; prefer name".to_owned(), + }); + } +} + +fn validate_optional_strings(root: &Value, fields: &[&str], issues: &mut Vec) { + for field in fields { + if let Some(value) = value_at(root, &[*field]) { + match value.as_str() { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Warning, + path: (*field).to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: (*field).to_owned(), + message: "expected string".to_owned(), + }), + } + } + } +} + +fn validate_tags(root: &Value, issues: &mut Vec) { + let Some(value) = value_at(root, &["tags"]) else { + return; + }; + let Some(items) = value.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "tags".to_owned(), + message: "expected a sequence of strings".to_owned(), + }); + return; + }; + + for (idx, item) in items.iter().enumerate() { + let Some(tag) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("tags[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if tag.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("tags[{idx}]"), + message: "empty tag string".to_owned(), + }); + } + } +} + +fn validate_identifiers(root: &Value, issues: &mut Vec) { + for field in ["rsids", "aliases"] { + let Some(values) = value_at(root, &["identifiers", field]) else { + continue; + }; + let Some(items) = values.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}"), + message: "expected a sequence of strings".to_owned(), + }); + continue; + }; + let mut seen = BTreeSet::new(); + for (idx, item) in 
items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if !is_rsid(value) { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: format!("expected rsid like rs123, found '{value}'"), + }); + } + if !seen.insert(value.to_owned()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("identifiers.{field}[{idx}]"), + message: format!("duplicate identifier '{value}'"), + }); + } + } + } +} + +fn validate_coordinates(root: &Value, issues: &mut Vec) { + for assembly in ["grch37", "grch38"] { + let Some(coord) = mapping_at(root, &["coordinates", assembly]) else { + continue; + }; + + let Some(chrom) = coord + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: "missing chrom".to_owned(), + }); + continue; + }; + if !is_allowed_chromosome(chrom) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: format!("invalid chromosome '{chrom}'; expected 1-22, X, Y, or MT"), + }); + } + + let has_pos = coord.contains_key(Value::String("pos".to_owned())); + let has_start = coord.contains_key(Value::String("start".to_owned())); + let has_end = coord.contains_key(Value::String("end".to_owned())); + if has_pos && (has_start || has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "use either pos or start/end, not both".to_owned(), + }); + continue; + } + if !(has_pos || has_start && has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected either pos or start/end".to_owned(), + }); + continue; + } + + if has_pos { + 
validate_coordinate_pos(coord, assembly, issues); + } else { + validate_coordinate_range(coord, assembly, issues); + } + } +} + +fn validate_coordinate_pos(coord: &Mapping, assembly: &str, issues: &mut Vec) { + if let Some(pos) = i64_at_mapping(coord, "pos") { + if pos < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer >= 1".to_owned(), + }); + } + } else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer".to_owned(), + }); + } +} + +fn validate_coordinate_range(coord: &Mapping, assembly: &str, issues: &mut Vec) { + let start = i64_at_mapping(coord, "start"); + let end = i64_at_mapping(coord, "end"); + match (start, end) { + (Some(start), Some(end)) => validate_coordinate_range_values(start, end, assembly, issues), + _ => issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected integer start/end".to_owned(), + }), + } +} + +fn validate_coordinate_range_values(start: i64, end: i64, assembly: &str, issues: &mut Vec) { + if start < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.start"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < start { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected end >= start".to_owned(), + }); + } + if start == end { + issues.push(Issue { + severity: Severity::Warning, + path: format!("coordinates.{assembly}"), + message: "single-position coordinate uses start/end; prefer pos".to_owned(), + }); + } +} + diff --git a/rust/bioscript-schema/src/validator_types.rs b/rust/bioscript-schema/src/validator_types.rs new file mode 100644 
index 0000000..13f7a96 --- /dev/null +++ b/rust/bioscript-schema/src/validator_types.rs @@ -0,0 +1,180 @@ +use std::{ + collections::BTreeSet, + fmt::{self, Write as _}, + fs, + path::{Path, PathBuf}, +}; + +use bioscript_core::{GenomicLocus, VariantKind, VariantSpec}; +use serde_yaml::{Mapping, Value}; +use url::Url; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Severity { + Error, + Warning, +} + +impl fmt::Display for Severity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Error => f.write_str("error"), + Self::Warning => f.write_str("warning"), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Issue { + pub severity: Severity, + pub path: String, + pub message: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FileReport { + pub file: PathBuf, + pub issues: Vec<Issue>, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ValidationReport { + pub files_scanned: usize, + pub reports: Vec<FileReport>, +} + +impl ValidationReport { + #[must_use] + pub fn total_issues(&self) -> usize { + self.reports.iter().map(|report| report.issues.len()).sum() + } + + #[must_use] + pub fn total_errors(&self) -> usize { + self.reports + .iter() + .flat_map(|report| &report.issues) + .filter(|issue| issue.severity == Severity::Error) + .count() + } + + #[must_use] + pub fn total_warnings(&self) -> usize { + self.reports + .iter() + .flat_map(|report| &report.issues) + .filter(|issue| issue.severity == Severity::Warning) + .count() + } + + #[must_use] + pub fn has_errors(&self) -> bool { + self.total_errors() > 0 + } + + #[must_use] + pub fn render_text(&self) -> String { + let mut out = String::new(); + let _ = write!( + out, + "files_scanned: {}\nerrors: {}\nwarnings: {}\n", + self.files_scanned, + self.total_errors(), + self.total_warnings() + ); + for report in &self.reports { + out.push('\n'); + let _ = writeln!(out, "file: {}", report.file.display()); + for issue in &report.issues { + let _ = writeln!( + out, +
" - [{}] {}: {}", + issue.severity, issue.path, issue.message + ); + } + } + out + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct VariantManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub spec: VariantSpec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub permissions: Permissions, + pub downloads: Vec, + pub members: Vec, + pub interpretations: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct AssayManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub members: Vec, + pub interpretations: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct Permissions { + pub domains: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Download { + pub id: String, + pub url: String, + pub origin: String, + pub sha256: String, + pub version: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelMember { + pub kind: String, + pub path: Option, + pub download: Option, + pub sha256: Option, + pub version: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretation { + pub id: String, + pub kind: String, + pub path: String, + pub output_format: Option, + pub derived_from: Vec, + pub emits: Vec, + pub logic: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationLogic { + pub source: Option, + pub description: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationLogicSource { + pub name: Option, + pub url: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationEmit { + pub key: String, + pub label: Option, + pub value_type: Option, + pub format: Option, +} From ada4d02f1a5983ebae32e46f34dfb6eb90b7f477 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 7 May 2026 09:23:27 +1000 Subject: [PATCH 3/4] Show clippy diagnostics in lint script --- lint.sh | 23 ++++++++++++++++++++++- 
1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/lint.sh b/lint.sh index 9e37793..cbc390a 100755 --- a/lint.sh +++ b/lint.sh @@ -24,6 +24,27 @@ filter_vendored() { awk 'BEGIN{RS=""; ORS="\n\n"} !/\/noodles\/|\/vendor\/|`noodles-|`lexical-/' } -cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings 2> >(filter_vendored >&2) +CLIPPY_STDERR="$(mktemp)" +FILTERED_STDERR="$(mktemp)" +cleanup() { + rm -f "$CLIPPY_STDERR" "$FILTERED_STDERR" +} +trap cleanup EXIT + +set +e +cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings 2> "$CLIPPY_STDERR" +CLIPPY_STATUS=$? +set -e + +filter_vendored < "$CLIPPY_STDERR" > "$FILTERED_STDERR" +if [[ -s "$FILTERED_STDERR" ]]; then + cat "$FILTERED_STDERR" >&2 +elif [[ "$CLIPPY_STATUS" -ne 0 ]]; then + cat "$CLIPPY_STDERR" >&2 +fi + +if [[ "$CLIPPY_STATUS" -ne 0 ]]; then + exit "$CLIPPY_STATUS" +fi cargo test -p bioscript-core --test source_size -- --nocapture From 7b955544099d1b500596680ae7b6019146cbc10d Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 7 May 2026 09:33:04 +1000 Subject: [PATCH 4/4] Fix lint on current stable Rust --- lint.sh | 18 ++++++++--------- rust/bioscript-cli/src/report_execution.rs | 3 +-- .../src/validator_findings.rs | 20 +++++++++---------- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/lint.sh b/lint.sh index cbc390a..74b8876 100755 --- a/lint.sh +++ b/lint.sh @@ -24,23 +24,23 @@ filter_vendored() { awk 'BEGIN{RS=""; ORS="\n\n"} !/\/noodles\/|\/vendor\/|`noodles-|`lexical-/' } -CLIPPY_STDERR="$(mktemp)" -FILTERED_STDERR="$(mktemp)" +CLIPPY_OUTPUT="$(mktemp)" +FILTERED_OUTPUT="$(mktemp)" cleanup() { - rm -f "$CLIPPY_STDERR" "$FILTERED_STDERR" + rm -f "$CLIPPY_OUTPUT" "$FILTERED_OUTPUT" } trap cleanup EXIT set +e -cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings 2> "$CLIPPY_STDERR" +cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings > "$CLIPPY_OUTPUT" 2>&1 CLIPPY_STATUS=$? 
set -e -filter_vendored < "$CLIPPY_STDERR" > "$FILTERED_STDERR" -if [[ -s "$FILTERED_STDERR" ]]; then - cat "$FILTERED_STDERR" >&2 -elif [[ "$CLIPPY_STATUS" -ne 0 ]]; then - cat "$CLIPPY_STDERR" >&2 +filter_vendored < "$CLIPPY_OUTPUT" > "$FILTERED_OUTPUT" +if [[ "$CLIPPY_STATUS" -ne 0 ]]; then + cat "$CLIPPY_OUTPUT" >&2 +elif [[ -s "$FILTERED_OUTPUT" ]]; then + cat "$FILTERED_OUTPUT" >&2 fi if [[ "$CLIPPY_STATUS" -ne 0 ]]; then diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs index 622ea5b..d657f6c 100644 --- a/rust/bioscript-cli/src/report_execution.rs +++ b/rust/bioscript-cli/src/report_execution.rs @@ -191,7 +191,7 @@ fn run_bioscript_analysis_script( loader: &GenotypeLoadOptions, ) -> Result<(), String> { let limits = ResourceLimits::new() - .max_duration(Duration::from_millis(1000)) + .max_duration(Duration::from_secs(1)) .max_memory(16 * 1024 * 1024) .max_allocations(400_000) .gc_interval(1000) @@ -306,4 +306,3 @@ fn participant_id_from_path(path: &Path) -> String { .trim_end_matches(".csv") .to_owned() } - diff --git a/rust/bioscript-schema/src/validator_findings.rs b/rust/bioscript-schema/src/validator_findings.rs index 185649f..065c55c 100644 --- a/rust/bioscript-schema/src/validator_findings.rs +++ b/rust/bioscript-schema/src/validator_findings.rs @@ -123,17 +123,17 @@ fn validate_finding_binding_source(parent: &str, binding: &Mapping, issues: &mut .get(Value::String("source".to_owned())) .and_then(Value::as_str); match source { - Some("variant") => { + Some("variant") if !binding.contains_key(Value::String("variant".to_owned())) - && !binding.contains_key(Value::String("path".to_owned())) - { - issues.push(Issue { - severity: Severity::Error, - path: format!("{parent}.binding.variant"), - message: "variant findings require variant or path".to_owned(), - }); - } + && !binding.contains_key(Value::String("path".to_owned())) => + { + issues.push(Issue { + severity: Severity::Error, + path: 
format!("{parent}.binding.variant"), + message: "variant findings require variant or path".to_owned(), + }); } + Some("variant") | None => {} Some("analysis") => { validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); validate_required_mapping_string( @@ -148,7 +148,6 @@ fn validate_finding_binding_source(parent: &str, binding: &Mapping, issues: &mut path: format!("{parent}.binding.source"), message: format!("unsupported source '{other}'"), }), - None => {} } } @@ -243,4 +242,3 @@ fn validate_finding_binding_operator(parent: &str, binding: &Mapping, issues: &m }), } } -