diff --git a/AGENTS.md b/AGENTS.md index d3fc05f..ac2c44f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,6 +5,11 @@ Keep first-party production Rust source files at or below 500 lines. This applies to files under `rust/bioscript-*/src/**/*.rs`. +When editing BioScript Rust, prefer adding behavior to a small, named module +whose filename describes the responsibility. If a file is approaching 500 lines, +split it along a real domain boundary before adding more code. Do not satisfy +the guard by creating arbitrary numbered chunks or `*_part_*` files. + The 500-line rule does not apply to: - integration tests and unit-test modules @@ -16,6 +21,5 @@ production limit measures production code, not test scaffolding. Test files should still be split when they mix unrelated behavior or become hard to scan. When a production file grows past 500 lines, split it before adding more -behavior. Temporary exceptions must be listed in this file under -`Current Refactor Backlog`; the source-size guard reads that list and fails when -it drifts from the code. +behavior. Keep the include list in the parent file short and logical, and leave +file names meaningful enough that future agents can find the right place to edit. diff --git a/docs/assay-schema.md b/docs/assay-schema.md new file mode 100644 index 0000000..d087f4e --- /dev/null +++ b/docs/assay-schema.md @@ -0,0 +1,138 @@ +# Assay Schema + +Use an assay when a named test observes one or more variants and emits custom derived report fields. + +An assay is different from a panel: a panel is a collection of mostly independent observations, while an assay has its own interpretation logic. APOL1 is an assay because it observes G1/G2 sites and reports one derived APOL1 status. 
+ +## Schema Identity + +```yaml +schema: "bioscript:assay:1.0" +version: "1.0" +``` + +## Minimal Shape + +```yaml +schema: "bioscript:assay:1.0" +version: "1.0" +name: "APOL1" +label: "APOL1 Risk Assay" +tags: + - "type:risk" + - "gene:APOL1" + +members: + - kind: "variant" + path: "g1-site-1.yaml" + version: "1.0" + - kind: "variant" + path: "g1-site-2.yaml" + version: "1.0" + - kind: "variant" + path: "g2-site.yaml" + version: "1.0" + +analyses: + - id: "apol1_status" + kind: "bioscript" + path: "apol1.py" + output_format: "tsv" + label: "APOL1 risk genotype" + derived_from: + - "g1-site-1.yaml" + - "g1-site-2.yaml" + - "g2-site.yaml" + emits: + - key: "apol1_status" + label: "APOL1 status" + value_type: "string" + format: "badge" + logic: + source: + name: "Example derivation source" + url: "https://example.org/assay-logic" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. +``` + +## Members + +Assay members are currently local variant YAML files: + +```yaml +- kind: "variant" + path: "g1-site-1.yaml" + version: "1.0" +``` + +Rules: + +- `kind` is required and currently must be `variant` +- `path` is required +- `version` is recommended +- keep variant identity, coordinates, alleles, findings, and provenance in the variant YAML files + +## Analyses + +Use `analyses` for custom output derived from the member variants. The older `interpretations` key is accepted for compatibility, but new manifests should use `analyses`. 
+ +Rules: + +- `id`, `kind`, `path`, and `derived_from` are required +- `kind` is currently `bioscript` +- `path` points to a BioScript-compatible Python file +- `output_format` is optional and defaults to `tsv`; use `json` or `jsonl` when the script writes structured JSON output +- `derived_from` lists the variant YAML files used by the interpretation +- `emits` is optional but recommended so report generators know which output columns to display and how to label them +- `logic` is optional; use `logic.description` and `logic.source.url` to document where the script's derivation rules came from + +## Findings + +Use `findings` for evidence that binds either to a variant observation or an emitted analysis value. Keep the executable logic in `analyses`; keep PGx evidence and reporting semantics in YAML. + +```yaml +findings: + - schema: "bioscript:pgx-label:1.0" + id: "clinpgx_PA166313401" + label: "ClinPGx drug label annotation PA166313401" + authority_type: "regulatory_label" + binding: + source: "analysis" + analysis_id: "apoe_epsilon" + key: "apoe_status" + operator: "equals" + value: "e4/e4" + drugs: + - name: "lecanemab" + aliases: + - "LEQEMBI" + evidence: + source: "ClinPGx" + kind: "label_annotation" + id: "PA166313401" + url: "https://www.clinpgx.org/labelAnnotation/PA166313401" + notes: "Drug label annotation applies when APOE status is e4/e4." 
+``` + +Binding rules: + +- `source` is `analysis` or `variant` +- `analysis` bindings require `analysis_id`, `key`, and either `operator: equals` with `value` or `operator: in` with `values` +- `variant` bindings require `variant` or `path`, `key`, and either `equals`/`value` or `in`/`values` +- PGx label findings use `schema: "bioscript:pgx-label:1.0"` and should include `regulatory_sources`, `pgx_action_level` or `prescribing_actions` when known +- PGx summary findings use `schema: "bioscript:pgx-summary:1.0"` and should include `evidence_level`, `phenotype_categories`, and genotype-specific `effects` when known +- PGx findings should include `drugs` and should link to the exact ClinPGx/PharmGKB/ClinVar evidence page + +## Inclusion In Panels + +A larger panel may include an assay as a member: + +```yaml +members: + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" +``` + +When a panel includes an assay, the assay's variant observations can be expanded into the panel output, while report tooling can also run the assay's interpretation and include its emitted fields. diff --git a/docs/assay-schema.yaml b/docs/assay-schema.yaml new file mode 100644 index 0000000..7019e2f --- /dev/null +++ b/docs/assay-schema.yaml @@ -0,0 +1,66 @@ +schema: "bioscript:assay:1.0" +version: "1.0" +name: "APOL1" +label: "APOL1 Risk Assay" +summary: "APOL1 assay that observes G1 and G2 sites and emits the derived APOL1 risk genotype." 
+tags: + - "type:risk" + - "gene:APOL1" + +members: + - kind: "variant" + path: "g1-site-1.yaml" + version: "1.0" + - kind: "variant" + path: "g1-site-2.yaml" + version: "1.0" + - kind: "variant" + path: "g2-site.yaml" + version: "1.0" + +analyses: + - id: "apol1_status" + kind: "bioscript" + path: "apol1.py" + output_format: "tsv" + label: "APOL1 risk genotype" + derived_from: + - "g1-site-1.yaml" + - "g1-site-2.yaml" + - "g2-site.yaml" + emits: + - key: "apol1_status" + label: "APOL1 status" + value_type: "string" + format: "badge" + logic: + source: + name: "Example derivation source" + url: "https://example.org/assay-logic" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. + +findings: + - schema: "bioscript:pgx-label:1.0" + id: "example_analysis_bound_pgx_finding" + label: "Example analysis-bound PGx finding" + authority_type: "regulatory_label" + binding: + source: "analysis" + analysis_id: "apol1_status" + key: "apol1_status" + operator: "equals" + value: "G2/G2" + drugs: + - name: "example drug" + aliases: + - "example brand" + regulatory_sources: + - "FDA" + pgx_action_level: "Actionable PGx" + evidence: + source: "ClinPGx" + kind: "label_annotation" + id: "PA..." + url: "https://www.clinpgx.org/labelAnnotation/PA..." + notes: "Findings can bind to emitted analysis keys using equals or in." diff --git a/docs/panel-schema.md b/docs/panel-schema.md index 708daee..ac155a1 100644 --- a/docs/panel-schema.md +++ b/docs/panel-schema.md @@ -1,8 +1,8 @@ # Panel Schema -Use a panel when you want one manifest that points to a curated set of runnable variant records. +Use a panel when you want one manifest that points to a curated set of runnable variant records, assay manifests, and optional interpretation scripts derived from those records. -Right now the Rust runner supports variant members directly. Keep the shape simple. +The Rust runner supports variant members directly. 
Test tooling can also run declared interpretation scripts and add their emitted fields to the generated report. ## Schema Identity @@ -25,9 +25,26 @@ members: - kind: "variant" path: "variants/rs671.yaml" version: "1.0" + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" - kind: "variant" path: "variants/rs713598.yaml" version: "1.0" + +analyses: + - id: "taste_status" + kind: "bioscript" + path: "interpretations/taste.py" + output_format: "tsv" + label: "Taste status" + derived_from: + - "variants/rs713598.yaml" + emits: + - key: "taste_status" + label: "Taste status" + value_type: "string" + format: "badge" ``` ## Purpose @@ -37,30 +54,74 @@ A panel is: - a selection manifest - a stable name for a bundle of variants - something the Rust `bioscript` command can run directly +- a way to include smaller assay manifests in a broader bundle +- a place to declare interpretation chunks that derive custom report fields from member variants It is not: - a full remote package manager - a replacement for richer assay manifests +- a place to hide variant metadata inside Python when YAML can describe it ## Members -Each member must currently be: +Each member must currently be a local variant or assay: ```yaml - kind: "variant" path: "variants/rs671.yaml" version: "1.0" +- kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" ``` Rules: - `kind` is required - exactly one of `path` or `download` is required -- current runner support is `variant` members only +- current runner support is local `variant` and `assay` members - `version` is recommended for local members - `sha256` is optional for local members +## Analyses + +Use `analyses` when a panel needs custom derived output that is not the same thing as a single variant observation. Examples include APOE epsilon genotype from rs429358/rs7412 or APOL1 G0/G1/G2 status from three sites. The older `interpretations` key is accepted for compatibility, but new manifests should use `analyses`. 
+ +```yaml +analyses: + - id: "apoe_epsilon" + kind: "bioscript" + path: "variants/APOE/apoe.py" + output_format: "tsv" + label: "APOE epsilon genotype" + derived_from: + - "variants/APOE/rs429358.yaml" + - "variants/APOE/rs7412.yaml" + emits: + - key: "apoe_status" + label: "APOE status" + value_type: "string" + format: "badge" + logic: + source: + name: "ClinPGx / PharmGKB" + url: "https://www.clinpgx.org/variant/PA166155341/overview" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. +``` + +Rules: + +- `id`, `kind`, `path`, and `derived_from` are required +- `kind` is currently `bioscript` +- `path` points to a BioScript-compatible Python file +- `output_format` is optional and defaults to `tsv`; use `json` or `jsonl` when the script writes structured JSON output +- `derived_from` lists the variant YAML files used by the interpretation +- `emits` is optional but recommended so report generators know which output columns to display and how to label them +- `logic` is optional; use `logic.description` and `logic.source.url` to document where the script's derivation rules came from +- keep variant identity, coordinates, alleles, findings, and provenance in YAML; keep cross-variant logic in the interpretation script + ## Permissions And Downloads Panels may declare remote downloads up front even if the current runner only executes local members. 
diff --git a/docs/panel-schema.yaml b/docs/panel-schema.yaml index a35dcd3..d9d7fa1 100644 --- a/docs/panel-schema.yaml +++ b/docs/panel-schema.yaml @@ -9,6 +9,29 @@ members: - kind: "variant" path: "variants/rs671.yaml" version: "1.0" + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" - kind: "variant" path: "variants/rs713598.yaml" version: "1.0" + +analyses: + - id: "taste_status" + kind: "bioscript" + path: "interpretations/taste.py" + output_format: "tsv" + label: "Taste status" + derived_from: + - "variants/rs713598.yaml" + emits: + - key: "taste_status" + label: "Taste status" + value_type: "string" + format: "badge" + logic: + source: + name: "Example derivation source" + url: "https://example.org/panel-analysis-logic" + description: > + Optional human-readable description of the derivation logic implemented by the analysis script. diff --git a/docs/variant-schema.md b/docs/variant-schema.md index 84e5ac8..f6c9de6 100644 --- a/docs/variant-schema.md +++ b/docs/variant-schema.md @@ -166,6 +166,7 @@ Fields: - `kind`: `snv | deletion | insertion | indel` - `ref` - `alts` +- `observed_alts` optional - `deletion_length` optional - `insertion_sequence` optional - `motifs` optional @@ -174,6 +175,10 @@ Fields: Stored YAML should describe the biological allele. Do not use symbolic `I` / `D` allele values in this schema. +`alts` is the curated set of alternate alleles that matter for this catalogue entry and should drive app flagging. `observed_alts` is the full set of source-reported alternate alleles observed at the same locus, usually from dbSNP. If `observed_alts` is omitted, tools treat `alts` as both curated and observed. + +When a source such as dbSNP reports multiple alternates but only one has the clinical or PGx evidence being catalogued, keep that allele in `alts` and put the full dbSNP set in `observed_alts`. 
+ Example SNV: ```yaml @@ -181,6 +186,8 @@ alleles: kind: "snv" ref: "G" alts: + - "T" + observed_alts: - "A" - "C" - "T" @@ -205,10 +212,90 @@ Envelope fields: - `schema` required - `alt` optional, but required for allele-specific findings; use `"*"` when the finding applies to any alternate allele at a multiallelic locus +- `binding` optional; use it when report logic should match a specific variant observation field instead of relying on `alt` - `label` optional - `summary` optional - `notes` optional +Variant-bound PGx sidecar include: + +```yaml +findings: + - schema: "bioscript:pgx-summary:1.0" + id: "rs123_pgx_sidecar" + include: "rs123-pgx.yaml" + notes: "Detailed PGx findings are stored in the sidecar file." +``` + +Sidecar files use `schema: "bioscript:pgx-findings:1.0"` and contain dense PGx evidence for one variant. This keeps large ClinPGx/PharmGKB annotation tables out of the core variant identity file. A sidecar may contain both summary annotations and drug label annotations. + +Summary annotations are variant/drug evidence interpretations. They use `schema: "bioscript:pgx-summary:1.0"` and normally include evidence levels, phenotype categories, and genotype-specific effects. + +```yaml +findings: + - schema: "bioscript:pgx-summary:1.0" + id: "example_summary_annotation" + authority_type: "evidence_summary" + drugs: + - name: "example drug" + phenotype_categories: + - "Toxicity" + evidence_level: "3" + evidence: + source: "ClinPGx" + kind: "summary_annotation" + id: "1448427005" + url: "https://www.clinpgx.org/variant/PA.../summaryAnnotation" + effects: + - id: "example_variant_bound_pgx_finding_alt_carrier" + label: "C carrier" + binding: + source: "variant" + variant: "rs123.yaml" + allele: "C" + operator: "dosage_in" + values: [1, 2] + description: "Applies when the participant carries one or two copies of C." + text: "Reportable text for this allele dosage." +``` + +Drug label annotations are regulatory label statements. 
They use `schema: "bioscript:pgx-label:1.0"` and should carry regulatory/action fields instead of summary evidence levels. + +```yaml +findings: + - schema: "bioscript:pgx-label:1.0" + id: "example_label_annotation" + authority_type: "regulatory_label" + genes: + - "ABCG2" + drugs: + - name: "rosuvastatin" + aliases: + - "Crestor" + regulatory_sources: + - "FDA" + pgx_action_level: "testing_recommended" + prescribing_actions: + - "dose_adjustment" + evidence: + source: "ClinPGx" + kind: "label_annotation" + id: "PA..." + url: "https://www.clinpgx.org/variant/PA.../labelAnnotation" + notes: "Regulatory drug label annotation." +``` + +Supported binding operators are: + +- `equals` and `in` for matching literal analysis outputs or observation fields +- `dosage_equals` and `dosage_in` for variant allele dosage, where `allele` is the reference allele or one of the alternate alleles and dosage values are `0`, `1`, or `2` + +Known PGx finding schemas are: + +- `bioscript:pgx-summary:1.0` for ClinPGx/PharmGKB summary annotations +- `bioscript:pgx-label:1.0` for ClinPGx/PharmGKB drug label annotations +- `bioscript:pgx:1.0` is legacy; prefer one of the two specific schemas above + Unknown finding schemas are allowed. ## Optional Metadata diff --git a/docs/variant-schema.yaml b/docs/variant-schema.yaml index e99f8a6..3c2a26d 100644 --- a/docs/variant-schema.yaml +++ b/docs/variant-schema.yaml @@ -28,6 +28,9 @@ alleles: ref: "A" alts: - "G" + observed_alts: + - "G" + - "T" deletion_length: 1 insertion_sequence: "AT" motifs: @@ -46,9 +49,28 @@ findings: label: "string" summary: "string" notes: "string" - - schema: "bioscript:pgx:1.0" - alt: "*" - notes: "Finding applies to any alternate allele at this multiallelic locus." 
+ - schema: "bioscript:pgx-summary:1.0" + id: "example_pgx_summary" + label: "Example summary annotation" + authority_type: "evidence_summary" + binding: + source: "variant" + variant: "rs123.yaml" + allele: "G" + operator: "dosage_in" + values: + - 1 + - 2 + description: "Applies when the participant carries at least one G allele." + drugs: + - name: "example drug" + evidence_level: "3" + evidence: + source: "ClinPGx" + kind: "summary_annotation" + id: "144..." + url: "https://www.clinpgx.org/variant/PA.../summaryAnnotation" + notes: "Summary annotations capture variant-drug evidence and can be stored directly or through a pgx-findings sidecar." provenance: sources: diff --git a/lint.sh b/lint.sh index 9e37793..74b8876 100755 --- a/lint.sh +++ b/lint.sh @@ -24,6 +24,27 @@ filter_vendored() { awk 'BEGIN{RS=""; ORS="\n\n"} !/\/noodles\/|\/vendor\/|`noodles-|`lexical-/' } -cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings 2> >(filter_vendored >&2) +CLIPPY_OUTPUT="$(mktemp)" +FILTERED_OUTPUT="$(mktemp)" +cleanup() { + rm -f "$CLIPPY_OUTPUT" "$FILTERED_OUTPUT" +} +trap cleanup EXIT + +set +e +cargo clippy "${PKG_ARGS[@]}" --all-targets --color=never -- -D warnings > "$CLIPPY_OUTPUT" 2>&1 +CLIPPY_STATUS=$? 
+set -e + +filter_vendored < "$CLIPPY_OUTPUT" > "$FILTERED_OUTPUT" +if [[ "$CLIPPY_STATUS" -ne 0 ]]; then + cat "$CLIPPY_OUTPUT" >&2 +elif [[ -s "$FILTERED_OUTPUT" ]]; then + cat "$FILTERED_OUTPUT" >&2 +fi + +if [[ "$CLIPPY_STATUS" -ne 0 ]]; then + exit "$CLIPPY_STATUS" +fi cargo test -p bioscript-core --test source_size -- --nocapture diff --git a/rust/Cargo.lock b/rust/Cargo.lock index bf9a5dc..fbf0dee 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -109,6 +109,7 @@ dependencies = [ "bioscript-runtime", "bioscript-schema", "monty", + "serde_json", "serde_yaml", "zip", ] diff --git a/rust/bioscript-cli/Cargo.toml b/rust/bioscript-cli/Cargo.toml index 374329d..cb78301 100644 --- a/rust/bioscript-cli/Cargo.toml +++ b/rust/bioscript-cli/Cargo.toml @@ -13,6 +13,7 @@ bioscript-formats = { path = "../bioscript-formats" } bioscript-runtime = { path = "../bioscript-runtime" } bioscript-schema = { path = "../bioscript-schema" } monty = { path = "../../monty/crates/monty" } +serde_json = "1.0.133" serde_yaml = "0.9.34" [dev-dependencies] diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs b/rust/bioscript-cli/src/cli_bootstrap.rs new file mode 100644 index 0000000..5337229 --- /dev/null +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -0,0 +1,414 @@ +use std::{ + collections::BTreeMap, + env, + fmt::Write as _, + fs, + path::{Path, PathBuf}, + process::ExitCode, + time::{Duration, Instant}, +}; + +use bioscript_formats::{ + GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InspectOptions, PrepareRequest, + inspect_file, prepare_indexes, shell_flags, +}; +use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; +use bioscript_schema::{ + AssayManifest, PanelInterpretation, PanelManifest, VariantManifest, load_assay_manifest, + load_panel_manifest, load_variant_manifest, validate_assays_path, validate_panels_path, + validate_variants_path, +}; +use monty::ResourceLimits; + +fn main() -> ExitCode { + match run_cli() { + Ok(()) => ExitCode::SUCCESS, 
+ Err(err) => { + eprintln!("bioscript: {err}"); + ExitCode::FAILURE + } + } +} + +fn run_cli() -> Result<(), String> { + let args: Vec = env::args().skip(1).collect(); + if dispatch_subcommand(&args)? { + return Ok(()); + } + + let mut options = parse_cli_options(args)?; + let script_path = options.script_path.clone().ok_or_else(|| USAGE.to_owned())?; + let runtime_root = options + .root + .clone() + .map_or_else(env::current_dir, Ok) + .map_err(|err| format!("failed to get current directory: {err}"))?; + normalize_loader_paths(&runtime_root, &mut options.loader); + let mut cli_timings = prepare_cli_indexes(&runtime_root, &mut options)?; + + if is_yaml_manifest(&script_path) { + run_cli_manifest(&runtime_root, &script_path, &options, &mut cli_timings)?; + } else { + run_cli_script(&script_path, options, cli_timings)?; + } + Ok(()) +} + +const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] 
--output-dir [--html] [--root ] [--input-format auto|text|zip|vcf|cram]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]"; + +struct CliOptions { + script_path: Option, + root: Option, + input_file: Option, + output_file: Option, + participant_id: Option, + trace_report: Option, + timing_report: Option, + filters: Vec, + auto_index: bool, + cache_dir: Option, + loader: GenotypeLoadOptions, + limits: ResourceLimits, +} + +fn dispatch_subcommand(args: &[String]) -> Result { + let Some((first, rest)) = args.split_first() else { + return Ok(false); + }; + let rest = rest.to_vec(); + match first.as_str() { + "report" => run_app_report(rest).map(|()| true), + "validate-variants" => run_validate_variants(rest).map(|()| true), + "validate-panels" => run_validate_panels(rest).map(|()| true), + "validate-assays" => run_validate_assays(rest).map(|()| true), + "prepare" => run_prepare(rest).map(|()| true), + "inspect" => run_inspect(rest).map(|()| true), + _ => Ok(false), + } +} + +fn parse_cli_options(args: Vec) -> Result { + let mut args = args.into_iter(); + let mut options = default_cli_options(); + while let Some(arg) = args.next() { + parse_cli_arg(arg, &mut args, &mut options)?; + } + Ok(options) +} + +fn default_cli_options() -> CliOptions { + CliOptions { + script_path: None, + root: None, + input_file: None, + output_file: None, + participant_id: None, + trace_report: None, + timing_report: None, + filters: Vec::new(), + auto_index: false, + cache_dir: None, + loader: GenotypeLoadOptions::default(), + limits: ResourceLimits::new() + .max_duration(Duration::from_millis(100)) + .max_memory(8 * 1024 * 1024) + .max_allocations(200_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)), + } +} + +fn 
parse_cli_arg( + arg: String, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result<(), String> { + if parse_cli_path_arg(&arg, args, options)? { + return Ok(()); + } + if parse_cli_loader_arg(&arg, args, options)? { + return Ok(()); + } + if parse_cli_limit_arg(&arg, args, options)? { + return Ok(()); + } + if arg == "--auto-index" { + options.auto_index = true; + } else if options.script_path.is_none() { + options.script_path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + Ok(()) +} + +fn parse_cli_path_arg( + arg: &str, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result { + if arg == "--root" { + let Some(value) = args.next() else { + return Err("--root requires a directory".to_owned()); + }; + options.root = Some(PathBuf::from(value)); + } else if arg == "--input-file" { + let Some(value) = args.next() else { + return Err("--input-file requires a path".to_owned()); + }; + options.input_file = Some(value); + } else if arg == "--output-file" { + let Some(value) = args.next() else { + return Err("--output-file requires a path".to_owned()); + }; + options.output_file = Some(value); + } else if arg == "--participant-id" { + let Some(value) = args.next() else { + return Err("--participant-id requires a value".to_owned()); + }; + options.participant_id = Some(value); + } else if arg == "--trace-report" { + let Some(value) = args.next() else { + return Err("--trace-report requires a path".to_owned()); + }; + options.trace_report = Some(PathBuf::from(value)); + } else if arg == "--timing-report" { + let Some(value) = args.next() else { + return Err("--timing-report requires a path".to_owned()); + }; + options.timing_report = Some(PathBuf::from(value)); + } else if arg == "--filter" { + let Some(value) = args.next() else { + return Err("--filter requires key=value".to_owned()); + }; + options.filters.push(value); + } else if arg == "--cache-dir" { + let Some(value) = args.next() else { + 
return Err("--cache-dir requires a path".to_owned()); + }; + options.cache_dir = Some(PathBuf::from(value)); + } else { + return Ok(false); + } + Ok(true) +} + +fn parse_cli_loader_arg( + arg: &str, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result { + if arg == "--input-format" { + let Some(value) = args.next() else { + return Err("--input-format requires a value".to_owned()); + }; + if value.eq_ignore_ascii_case("auto") { + options.loader.format = None; + } else { + let parsed = value + .parse::() + .map_err(|err| format!("invalid --input-format value {value}: {err}"))?; + options.loader.format = Some(parsed); + } + } else if arg == "--input-index" { + let Some(value) = args.next() else { + return Err("--input-index requires a path".to_owned()); + }; + options.loader.input_index = Some(PathBuf::from(value)); + } else if arg == "--reference-file" { + let Some(value) = args.next() else { + return Err("--reference-file requires a path".to_owned()); + }; + options.loader.reference_file = Some(PathBuf::from(value)); + } else if arg == "--reference-index" { + let Some(value) = args.next() else { + return Err("--reference-index requires a path".to_owned()); + }; + options.loader.reference_index = Some(PathBuf::from(value)); + } else { + return Ok(false); + } + Ok(true) +} + +fn parse_cli_limit_arg( + arg: &str, + args: &mut impl Iterator, + options: &mut CliOptions, +) -> Result { + if arg == "--max-duration-ms" { + let Some(value) = args.next() else { + return Err("--max-duration-ms requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid --max-duration-ms value {value}: {err}"))?; + options.limits = options.limits.clone().max_duration(Duration::from_millis(parsed)); + } else if arg == "--max-memory-bytes" { + let Some(value) = args.next() else { + return Err("--max-memory-bytes requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid 
--max-memory-bytes value {value}: {err}"))?; + options.limits = options.limits.clone().max_memory(parsed); + } else if arg == "--max-allocations" { + let Some(value) = args.next() else { + return Err("--max-allocations requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid --max-allocations value {value}: {err}"))?; + options.limits = options.limits.clone().max_allocations(parsed); + } else if arg == "--max-recursion-depth" { + let Some(value) = args.next() else { + return Err("--max-recursion-depth requires an integer".to_owned()); + }; + let parsed = value + .parse::() + .map_err(|err| format!("invalid --max-recursion-depth value {value}: {err}"))?; + options.limits = options.limits.clone().max_recursion_depth(Some(parsed)); + } else { + return Ok(false); + } + Ok(true) +} + +fn prepare_cli_indexes( + runtime_root: &Path, + options: &mut CliOptions, +) -> Result, String> { + let mut cli_timings: Vec = Vec::new(); + if options.auto_index { + let auto_index_started = Instant::now(); + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let effective_cache = options + .cache_dir + .clone() + .unwrap_or_else(|| cwd.join(".bioscript-cache")); + let request = PrepareRequest { + root: runtime_root.to_path_buf(), + cwd, + cache_dir: effective_cache, + input_file: options.input_file.clone(), + input_format: options.loader.format, + reference_file: options + .loader + .reference_file + .as_ref() + .map(|p| p.to_string_lossy().to_string()), + }; + let prepared = prepare_indexes(&request)?; + if let Some(idx) = prepared.input_index + && options.loader.input_index.is_none() + { + eprintln!("bioscript: auto-indexed input -> {}", idx.display()); + options.loader.input_index = Some(idx); + } + if let Some(ref_file) = prepared.reference_file { + options.loader.reference_file = Some(ref_file); + } + if let Some(ref_idx) = prepared.reference_index + && options.loader.reference_index.is_none() + { + 
eprintln!("bioscript: auto-indexed reference -> {}", ref_idx.display()); + options.loader.reference_index = Some(ref_idx); + } + cli_timings.push(StageTiming { + stage: "auto_index".to_owned(), + duration_ms: auto_index_started.elapsed().as_millis(), + detail: "prepare_indexes".to_owned(), + }); + } + Ok(cli_timings) +} + +fn run_cli_manifest( + runtime_root: &Path, + script_path: &Path, + options: &CliOptions, + cli_timings: &mut Vec, +) -> Result<(), String> { + let manifest_started = Instant::now(); + let manifest_options = ManifestRunOptions { + input_file: options.input_file.as_deref(), + output_file: options.output_file.as_deref(), + participant_id: options.participant_id.as_deref(), + trace_report: options.trace_report.as_deref(), + loader: &options.loader, + filters: &options.filters, + }; + run_manifest(runtime_root, script_path, &manifest_options)?; + cli_timings.push(StageTiming { + stage: "manifest_run".to_owned(), + duration_ms: manifest_started.elapsed().as_millis(), + detail: script_path.display().to_string(), + }); + if let Some(timing_path) = &options.timing_report { + write_timing_report(timing_path, cli_timings)?; + } + Ok(()) +} + +fn run_cli_script( + script_path: &Path, + options: CliOptions, + cli_timings: Vec, +) -> Result<(), String> { + let runtime_root = options + .root + .map_or_else(env::current_dir, Ok) + .map_err(|err| format!("failed to get current directory: {err}"))?; + let runtime = BioscriptRuntime::with_config( + runtime_root, + RuntimeConfig { + limits: options.limits, + loader: options.loader, + }, + ) + .map_err(|err| err.to_string())?; + let mut inputs = Vec::new(); + if let Some(input_file) = options.input_file { + inputs.push(("input_file", monty::MontyObject::String(input_file))); + } + if let Some(output_file) = options.output_file { + inputs.push(("output_file", monty::MontyObject::String(output_file))); + } + if let Some(participant_id) = options.participant_id { + inputs.push(("participant_id", 
monty::MontyObject::String(participant_id))); + } + + runtime + .run_file(script_path, options.trace_report.as_deref(), inputs) + .map_err(|err| err.to_string())?; + if let Some(timing_path) = options.timing_report { + let mut all_timings = cli_timings; + all_timings.extend(runtime.timing_snapshot()); + write_timing_report(&timing_path, &all_timings)?; + } + Ok(()) +} + +fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), String> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!( + "failed to create timing report dir {}: {err}", + parent.display() + ) + })?; + } + let mut output = String::from("stage\tduration_ms\tdetail\n"); + for timing in timings { + let _ = writeln!( + output, + "{}\t{}\t{}", + timing.stage, + timing.duration_ms, + timing.detail.replace('\t', " ") + ); + } + fs::write(path, output) + .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) +} + diff --git a/rust/bioscript-cli/src/cli_commands.rs b/rust/bioscript-cli/src/cli_commands.rs new file mode 100644 index 0000000..2b5e577 --- /dev/null +++ b/rust/bioscript-cli/src/cli_commands.rs @@ -0,0 +1,257 @@ +fn run_prepare(args: Vec) -> Result<(), String> { + let mut root: Option = None; + let mut input_file: Option = None; + let mut reference_file: Option = None; + let mut input_format: Option = None; + let mut cache_dir: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--root" => { + root = Some(PathBuf::from( + iter.next().ok_or("--root requires a directory")?, + )); + } + "--input-file" => { + input_file = Some(iter.next().ok_or("--input-file requires a path")?); + } + "--reference-file" => { + reference_file = Some(iter.next().ok_or("--reference-file requires a path")?); + } + "--input-format" => { + let value = iter.next().ok_or("--input-format requires a value")?; + if !value.eq_ignore_ascii_case("auto") { + input_format = 
Some( + value + .parse::() + .map_err(|err| format!("invalid --input-format: {err}"))?, + ); + } + } + "--cache-dir" => { + cache_dir = Some(PathBuf::from( + iter.next().ok_or("--cache-dir requires a path")?, + )); + } + other => { + return Err(format!("unexpected argument: {other}")); + } + } + } + + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let effective_root = root.unwrap_or_else(|| cwd.clone()); + let effective_cache = cache_dir.unwrap_or_else(|| cwd.join(".bioscript-cache")); + + let request = PrepareRequest { + root: effective_root, + cwd, + cache_dir: effective_cache, + input_file, + input_format, + reference_file, + }; + + let prepared = prepare_indexes(&request)?; + + // print the flags that should be passed to a subsequent bioscript run + let flags = shell_flags(&prepared); + if flags.is_empty() { + eprintln!("bioscript prepare: nothing to index"); + } else { + println!("{flags}"); + } + + Ok(()) +} + +fn run_inspect(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut options = InspectOptions::default(); + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--input-index" => { + options.input_index = Some(PathBuf::from( + iter.next().ok_or("--input-index requires a path")?, + )); + } + "--reference-file" => { + options.reference_file = Some(PathBuf::from( + iter.next().ok_or("--reference-file requires a path")?, + )); + } + "--reference-index" => { + options.reference_index = Some(PathBuf::from( + iter.next().ok_or("--reference-index requires a path")?, + )); + } + other if path.is_none() => { + path = Some(PathBuf::from(other)); + } + other => { + return Err(format!("unexpected argument: {other}")); + } + } + } + + let Some(path) = path else { + return Err( + "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + .to_owned(), + ); + }; + + let inspection = inspect_file(&path, &options).map_err(|err| 
err.to_string())?; + println!("{}", inspection.render_text()); + Ok(()) +} + +fn run_validate_variants(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-variants [--report ]".to_owned()); + }; + + let report = validate_variants_path(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} + +fn run_validate_panels(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-panels [--report ]".to_owned()); + }; + + let report = validate_panels_path(&path)?; + let text = 
report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} + +fn is_yaml_manifest(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| matches!(ext, "yaml" | "yml")) +} + +fn run_validate_assays(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-assays [--report ]".to_owned()); + }; + + let report = validate_assays_path(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} diff --git a/rust/bioscript-cli/src/main.rs 
b/rust/bioscript-cli/src/main.rs index 3c0a73c..7e0e20a 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -1,515 +1,12 @@ -use std::{ - env, - path::PathBuf, - process::ExitCode, - time::{Duration, Instant}, -}; - -use bioscript_formats::{ - GenotypeLoadOptions, GenotypeSourceFormat, PrepareRequest, prepare_indexes, -}; -use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; -use monty::ResourceLimits; - -mod commands; -mod manifest; -mod paths; - -use commands::{run_inspect, run_prepare, run_validate_panels, run_validate_variants}; -use manifest::{ManifestRunOptions, is_yaml_manifest, run_manifest}; -use paths::{normalize_loader_paths, write_timing_report}; - -fn main() -> ExitCode { - match run_cli() { - Ok(()) => ExitCode::SUCCESS, - Err(err) => { - eprintln!("bioscript: {err}"); - ExitCode::FAILURE - } - } -} - -#[allow(clippy::too_many_lines)] -fn run_cli() -> Result<(), String> { - run_cli_args(env::args().skip(1).collect()) -} - -#[allow(clippy::too_many_lines)] -fn run_cli_args(raw_args: Vec) -> Result<(), String> { - let mut args = raw_args.clone().into_iter(); - if let Some(first) = args.next() { - if first == "validate-variants" { - return run_validate_variants(args.collect()); - } - if first == "validate-panels" { - return run_validate_panels(args.collect()); - } - if first == "prepare" { - return run_prepare(args.collect()); - } - if first == "inspect" { - return run_inspect(args.collect()); - } - } - - let mut args = raw_args.into_iter(); - let mut script_path: Option = None; - let mut root: Option = None; - let mut input_file: Option = None; - let mut output_file: Option = None; - let mut participant_id: Option = None; - let mut trace_report: Option = None; - let mut timing_report: Option = None; - let mut filters: Vec = Vec::new(); - let mut auto_index = false; - let mut cache_dir: Option = None; - let mut loader = GenotypeLoadOptions::default(); - let mut limits = ResourceLimits::new() - 
.max_duration(Duration::from_millis(100)) - .max_memory(8 * 1024 * 1024) - .max_allocations(200_000) - .gc_interval(1000) - .max_recursion_depth(Some(200)); - - while let Some(arg) = args.next() { - if arg == "--root" { - let Some(value) = args.next() else { - return Err("--root requires a directory".to_owned()); - }; - root = Some(PathBuf::from(value)); - } else if arg == "--input-file" { - let Some(value) = args.next() else { - return Err("--input-file requires a path".to_owned()); - }; - input_file = Some(value); - } else if arg == "--output-file" { - let Some(value) = args.next() else { - return Err("--output-file requires a path".to_owned()); - }; - output_file = Some(value); - } else if arg == "--participant-id" { - let Some(value) = args.next() else { - return Err("--participant-id requires a value".to_owned()); - }; - participant_id = Some(value); - } else if arg == "--trace-report" { - let Some(value) = args.next() else { - return Err("--trace-report requires a path".to_owned()); - }; - trace_report = Some(PathBuf::from(value)); - } else if arg == "--timing-report" { - let Some(value) = args.next() else { - return Err("--timing-report requires a path".to_owned()); - }; - timing_report = Some(PathBuf::from(value)); - } else if arg == "--filter" { - let Some(value) = args.next() else { - return Err("--filter requires key=value".to_owned()); - }; - filters.push(value); - } else if arg == "--input-format" { - let Some(value) = args.next() else { - return Err("--input-format requires a value".to_owned()); - }; - if value.eq_ignore_ascii_case("auto") { - loader.format = None; - } else { - let parsed = value - .parse::() - .map_err(|err| format!("invalid --input-format value {value}: {err}"))?; - loader.format = Some(parsed); - } - } else if arg == "--input-index" { - let Some(value) = args.next() else { - return Err("--input-index requires a path".to_owned()); - }; - loader.input_index = Some(PathBuf::from(value)); - } else if arg == "--reference-file" { - let 
Some(value) = args.next() else { - return Err("--reference-file requires a path".to_owned()); - }; - loader.reference_file = Some(PathBuf::from(value)); - } else if arg == "--reference-index" { - let Some(value) = args.next() else { - return Err("--reference-index requires a path".to_owned()); - }; - loader.reference_index = Some(PathBuf::from(value)); - } else if arg == "--allow-md5-mismatch" { - loader.allow_reference_md5_mismatch = true; - } else if arg == "--max-duration-ms" { - let Some(value) = args.next() else { - return Err("--max-duration-ms requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-duration-ms value {value}: {err}"))?; - limits = limits.max_duration(Duration::from_millis(parsed)); - } else if arg == "--max-memory-bytes" { - let Some(value) = args.next() else { - return Err("--max-memory-bytes requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-memory-bytes value {value}: {err}"))?; - limits = limits.max_memory(parsed); - } else if arg == "--max-allocations" { - let Some(value) = args.next() else { - return Err("--max-allocations requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-allocations value {value}: {err}"))?; - limits = limits.max_allocations(parsed); - } else if arg == "--auto-index" { - auto_index = true; - } else if arg == "--cache-dir" { - let Some(value) = args.next() else { - return Err("--cache-dir requires a path".to_owned()); - }; - cache_dir = Some(PathBuf::from(value)); - } else if arg == "--max-recursion-depth" { - let Some(value) = args.next() else { - return Err("--max-recursion-depth requires an integer".to_owned()); - }; - let parsed = value - .parse::() - .map_err(|err| format!("invalid --max-recursion-depth value {value}: {err}"))?; - limits = limits.max_recursion_depth(Some(parsed)); - } else if script_path.is_none() { - script_path = 
Some(PathBuf::from(arg)); - } else { - return Err(format!("unexpected argument: {arg}")); - } - } - - let Some(script_path) = script_path else { - return Err( - "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--allow-md5-mismatch] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" - .to_owned(), - ); - }; - - let runtime_root = match root { - Some(dir) => dir, - None => { - env::current_dir().map_err(|err| format!("failed to get current directory: {err}"))? 
- } - }; - normalize_loader_paths(&runtime_root, &mut loader); - - // auto-index: detect and build missing indexes for CRAM/BAM/FASTA - let mut cli_timings: Vec = Vec::new(); - if auto_index { - let auto_index_started = Instant::now(); - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let effective_cache = cache_dir - .clone() - .unwrap_or_else(|| cwd.join(".bioscript-cache")); - let request = PrepareRequest { - root: runtime_root.clone(), - cwd: cwd.clone(), - cache_dir: effective_cache, - input_file: input_file.clone(), - input_format: loader.format, - reference_file: loader - .reference_file - .as_ref() - .map(|p| p.to_string_lossy().to_string()), - }; - let prepared = prepare_indexes(&request)?; - if let Some(idx) = prepared.input_index - && loader.input_index.is_none() - { - eprintln!("bioscript: auto-indexed input -> {}", idx.display()); - loader.input_index = Some(idx); - } - if let Some(ref_file) = prepared.reference_file { - loader.reference_file = Some(ref_file); - } - if let Some(ref_idx) = prepared.reference_index - && loader.reference_index.is_none() - { - eprintln!("bioscript: auto-indexed reference -> {}", ref_idx.display()); - loader.reference_index = Some(ref_idx); - } - cli_timings.push(StageTiming { - stage: "auto_index".to_owned(), - duration_ms: auto_index_started.elapsed().as_millis(), - detail: "prepare_indexes".to_owned(), - }); - } - - if is_yaml_manifest(&script_path) { - let manifest_started = Instant::now(); - let manifest_options = ManifestRunOptions { - input_file: input_file.as_deref(), - output_file: output_file.as_deref(), - participant_id: participant_id.as_deref(), - trace_report: trace_report.as_deref(), - loader: &loader, - filters: &filters, - }; - run_manifest(&runtime_root, &script_path, &manifest_options)?; - cli_timings.push(StageTiming { - stage: "manifest_run".to_owned(), - duration_ms: manifest_started.elapsed().as_millis(), - detail: script_path.display().to_string(), - }); - if let 
Some(timing_path) = timing_report { - write_timing_report(&timing_path, &cli_timings)?; - } - return Ok(()); - } - - let runtime = BioscriptRuntime::with_config(runtime_root, RuntimeConfig { limits, loader }) - .map_err(|err| err.to_string())?; - let mut inputs = Vec::new(); - if let Some(input_file) = input_file { - inputs.push(("input_file", monty::MontyObject::String(input_file))); - } - if let Some(output_file) = output_file { - inputs.push(("output_file", monty::MontyObject::String(output_file))); - } - if let Some(participant_id) = participant_id { - inputs.push(("participant_id", monty::MontyObject::String(participant_id))); - } - - runtime - .run_file(&script_path, trace_report.as_deref(), inputs) - .map_err(|err| err.to_string())?; - if let Some(timing_path) = timing_report { - let mut all_timings = cli_timings; - all_timings.extend(runtime.timing_snapshot()); - write_timing_report(&timing_path, &all_timings)?; - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::manifest::{ - manifest_schema, matches_filters, render_rows_as_tsv, resolve_manifest_path, variant_row, - }; - use crate::paths::{normalize_loader_paths, resolve_cli_path, resolve_cli_path_buf}; - use bioscript_core::{Assembly, VariantObservation}; - use bioscript_schema::VariantManifest; - use std::fs; - use std::path::Path; - use std::time::{SystemTime, UNIX_EPOCH}; - - fn temp_dir(label: &str) -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("clock drift") - .as_nanos(); - let dir = std::env::temp_dir().join(format!( - "bioscript-cli-unit-{label}-{}-{nanos}", - std::process::id() - )); - fs::create_dir_all(&dir).unwrap(); - dir - } - - #[test] - fn cli_private_helpers_render_rows_filters_paths_and_loader_paths() { - let root = temp_dir("helpers-root"); - let manifest_path = root.join("panels/panel.yaml"); - let member_dir = root.join("panels/members"); - fs::create_dir_all(&member_dir).unwrap(); - let variant_path = 
member_dir.join("apol1.yaml"); - fs::write(&manifest_path, "schema: bioscript:panel:1.0\n").unwrap(); - fs::write(&variant_path, "schema: bioscript:variant:1.0\n").unwrap(); - - let manifest = VariantManifest { - name: "APOL1 G1".to_owned(), - path: variant_path.clone(), - tags: vec!["kidney".to_owned(), "apol1".to_owned()], - spec: bioscript_core::VariantSpec::default(), - }; - - assert!(matches_filters( - &manifest, - &variant_path, - &[ - "kind=variant".to_owned(), - "name=APOL1".to_owned(), - "tag=kidney".to_owned(), - "path=apol1".to_owned(), - ], - )); - assert!(!matches_filters( - &manifest, - &variant_path, - &["kind=panel".to_owned()] - )); - assert!(!matches_filters( - &manifest, - &variant_path, - &["bad".to_owned()] - )); - - assert_eq!( - resolve_manifest_path(&root, &manifest_path, "members/apol1.yaml").unwrap(), - variant_path.canonicalize().unwrap() - ); - let outside = temp_dir("helpers-outside").join("escape.yaml"); - fs::write(&outside, "schema: bioscript:variant:1.0\n").unwrap(); - let err = - resolve_manifest_path(&root, &manifest_path, &outside.to_string_lossy()).unwrap_err(); - assert!(err.contains("escapes bioscript root"), "{err}"); - - let observation = VariantObservation { - backend: "vcf".to_owned(), - matched_rsid: Some("rs1".to_owned()), - assembly: Some(Assembly::Grch38), - genotype: Some("AG".to_owned()), - ref_count: Some(7), - alt_count: Some(3), - depth: Some(10), - evidence: vec!["one\twith tab".to_owned(), "two".to_owned()], - ..VariantObservation::default() - }; - let row = variant_row( - &root, - &variant_path, - "APOL1 G1", - &["kidney".to_owned()], - &observation, - Some("p1"), - ); - let tsv = render_rows_as_tsv(&[row]); - assert!(tsv.contains("participant_id\tbackend"), "{tsv}"); - assert!(tsv.contains("p1\tvcf\trs1\tgrch38\tAG\t7\t3\t10"), "{tsv}"); - assert!(tsv.contains("one with tab | two"), "{tsv}"); - - assert_eq!( - resolve_cli_path(&root, "sample.txt"), - root.join("sample.txt").display().to_string() - ); - 
assert_eq!( - resolve_cli_path_buf(&root, Path::new("/tmp/abs")), - PathBuf::from("/tmp/abs") - ); - - let mut loader = GenotypeLoadOptions { - input_index: Some(PathBuf::from("input.crai")), - reference_file: Some(PathBuf::from("ref.fa")), - reference_index: Some(PathBuf::from("ref.fa.fai")), - ..GenotypeLoadOptions::default() - }; - normalize_loader_paths(&root, &mut loader); - assert_eq!( - loader.input_index.as_deref(), - Some(root.join("input.crai").as_path()) - ); - assert_eq!( - loader.reference_file.as_deref(), - Some(root.join("ref.fa").as_path()) - ); - assert_eq!( - loader.reference_index.as_deref(), - Some(root.join("ref.fa.fai").as_path()) - ); - } - - #[test] - fn cli_private_helpers_cover_manifest_schema_and_timing_errors() { - let dir = temp_dir("schema-timing"); - let valid = dir.join("valid.yaml"); - let missing_schema = dir.join("missing.yaml"); - let invalid_yaml = dir.join("invalid.yaml"); - fs::write(&valid, "schema: bioscript:variant:1.0\n").unwrap(); - fs::write(&missing_schema, "name: no schema\n").unwrap(); - fs::write(&invalid_yaml, "schema: [").unwrap(); - - assert_eq!(manifest_schema(&valid).unwrap(), "bioscript:variant:1.0"); - assert!( - manifest_schema(&missing_schema) - .unwrap_err() - .contains("missing schema") - ); - assert!( - manifest_schema(&invalid_yaml) - .unwrap_err() - .contains("failed to parse YAML") - ); - assert!( - manifest_schema(&dir.join("absent.yaml")) - .unwrap_err() - .contains("failed to read") - ); - - let timing_path = dir.join("nested/timing.tsv"); - write_timing_report( - &timing_path, - &[ - StageTiming { - stage: "one".to_owned(), - duration_ms: 2, - detail: "contains\ttab".to_owned(), - }, - StageTiming { - stage: "two".to_owned(), - duration_ms: 3, - detail: "plain".to_owned(), - }, - ], - ) - .unwrap(); - let report = fs::read_to_string(&timing_path).unwrap(); - assert!(report.contains("stage\tduration_ms\tdetail")); - assert!(report.contains("one\t2\tcontains tab")); - } - - #[test] - fn 
cli_arg_parser_reports_missing_and_invalid_values_without_spawning() { - for (flag, expected) in [ - ("--input-format", "--input-format requires a value"), - ("--max-duration-ms", "--max-duration-ms requires an integer"), - ( - "--max-memory-bytes", - "--max-memory-bytes requires an integer", - ), - ("--max-allocations", "--max-allocations requires an integer"), - ( - "--max-recursion-depth", - "--max-recursion-depth requires an integer", - ), - ] { - let err = run_cli_args(vec![flag.to_owned()]).unwrap_err(); - assert!(err.contains(expected), "{flag}: {err}"); - } - - for (flag, value, expected) in [ - ( - "--input-format", - "unknown", - "invalid --input-format value unknown", - ), - ( - "--max-duration-ms", - "nan", - "invalid --max-duration-ms value nan", - ), - ( - "--max-memory-bytes", - "nan", - "invalid --max-memory-bytes value nan", - ), - ( - "--max-allocations", - "nan", - "invalid --max-allocations value nan", - ), - ( - "--max-recursion-depth", - "nan", - "invalid --max-recursion-depth value nan", - ), - ] { - let err = run_cli_args(vec![flag.to_owned(), value.to_owned()]).unwrap_err(); - assert!(err.contains(expected), "{flag}: {err}"); - } - } -} +// Keep included source files small and named by responsibility. +// If a file approaches 500 lines, split it by domain behavior rather than +// creating arbitrary numbered chunks. 
+include!("cli_bootstrap.rs"); +include!("cli_commands.rs"); +include!("report_options.rs"); +include!("report_execution.rs"); +include!("report_observations.rs"); +include!("report_matching.rs"); +include!("report_output.rs"); +include!("report_html.rs"); +include!("manifest_runner.rs"); diff --git a/rust/bioscript-cli/src/manifest_runner.rs b/rust/bioscript-cli/src/manifest_runner.rs new file mode 100644 index 0000000..bc4e917 --- /dev/null +++ b/rust/bioscript-cli/src/manifest_runner.rs @@ -0,0 +1,424 @@ +struct ManifestRunOptions<'a> { + input_file: Option<&'a str>, + output_file: Option<&'a str>, + participant_id: Option<&'a str>, + trace_report: Option<&'a Path>, + loader: &'a GenotypeLoadOptions, + filters: &'a [String], +} + +fn run_manifest( + runtime_root: &Path, + manifest_path: &Path, + options: &ManifestRunOptions<'_>, +) -> Result<(), String> { + let schema = manifest_schema(manifest_path)?; + let resolved_input = options + .input_file + .map(|value| resolve_cli_path(runtime_root, value)); + let resolved_output = options + .output_file + .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); + let resolved_trace = options + .trace_report + .map(|value| resolve_cli_path_buf(runtime_root, value)); + match schema.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + let row = run_variant_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + )?; + write_manifest_outputs( + std::slice::from_ref(&row), + resolved_output.as_deref(), + resolved_trace.as_deref(), + )?; + Ok(()) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let rows = run_panel_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + options.filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + 
"bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + let rows = run_assay_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + options.participant_id, + options.loader, + options.filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_variant_manifest( + runtime_root: &Path, + manifest: &VariantManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, +) -> Result, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + Ok(variant_row( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )) +} + +fn run_panel_manifest( + runtime_root: &Path, + panel: &PanelManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let mut rows = Vec::new(); + + for member in &panel.members { + let Some(path) = &member.path else { + return Err("remote panel members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + if member.kind == "variant" { + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + rows.push(variant_row( + runtime_root, 
+ &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } else if member.kind == "assay" { + let assay = load_assay_manifest(&resolved)?; + rows.extend(run_assay_manifest_with_store( + runtime_root, + &assay, + &store, + participant_id, + filters, + )?); + } else { + return Err(format!( + "panel member kind '{}' is not executable", + member.kind + )); + } + } + + Ok(rows) +} + +fn run_assay_manifest( + runtime_root: &Path, + assay: &AssayManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + run_assay_manifest_with_store(runtime_root, assay, &store, participant_id, filters) +} + +fn run_assay_manifest_with_store( + runtime_root: &Path, + assay: &AssayManifest, + store: &GenotypeStore, + participant_id: Option<&str>, + filters: &[String], +) -> Result>, String> { + let mut rows = Vec::new(); + + for member in &assay.members { + if member.kind != "variant" { + return Err(format!( + "assay member kind '{}' is not executable", + member.kind + )); + } + let Some(path) = &member.path else { + return Err("remote assay members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &assay.path, path)?; + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + rows.push(variant_row( + runtime_root, + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } + + Ok(rows) +} + +fn variant_row( + runtime_root: &Path, + path: &Path, + name: &str, + tags: &[String], + observation: &bioscript_core::VariantObservation, + 
participant_id: Option<&str>, +) -> BTreeMap { + let mut row = BTreeMap::new(); + row.insert("kind".to_owned(), "variant".to_owned()); + row.insert("name".to_owned(), name.to_owned()); + row.insert( + "path".to_owned(), + path.strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string(), + ); + row.insert("tags".to_owned(), tags.join(",")); + row.insert("backend".to_owned(), observation.backend.clone()); + row.insert( + "participant_id".to_owned(), + participant_id.unwrap_or_default().to_owned(), + ); + row.insert( + "matched_rsid".to_owned(), + observation.matched_rsid.clone().unwrap_or_default(), + ); + row.insert( + "assembly".to_owned(), + observation + .assembly + .map(|value| match value { + bioscript_core::Assembly::Grch37 => "grch37".to_owned(), + bioscript_core::Assembly::Grch38 => "grch38".to_owned(), + }) + .unwrap_or_default(), + ); + row.insert( + "genotype".to_owned(), + observation.genotype.clone().unwrap_or_default(), + ); + row.insert( + "ref_count".to_owned(), + observation + .ref_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "alt_count".to_owned(), + observation + .alt_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "depth".to_owned(), + observation + .depth + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert("evidence".to_owned(), observation.evidence.join(" | ")); + row +} + +fn write_manifest_outputs( + rows: &[BTreeMap], + output_file: Option<&Path>, + trace_report: Option<&Path>, +) -> Result<(), String> { + let text = render_rows_as_tsv(rows); + if let Some(output_file) = output_file { + if let Some(parent) = output_file.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!("failed to create output dir {}: {err}", parent.display()) + })?; + } + fs::write(output_file, &text) + .map_err(|err| format!("failed to write output {}: {err}", output_file.display()))?; + } else { + print!("{text}"); + } + + if let Some(trace_report) 
= trace_report { + if let Some(parent) = trace_report.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create trace dir {}: {err}", parent.display()))?; + } + let mut trace = String::from("step\tline\tcode\n"); + for (idx, row) in rows.iter().enumerate() { + let _ = writeln!( + trace, + "{}\t{}\t{}", + idx + 1, + idx + 1, + row.get("path").cloned().unwrap_or_default() + ); + } + fs::write(trace_report, trace) + .map_err(|err| format!("failed to write trace {}: {err}", trace_report.display()))?; + } + + Ok(()) +} + +fn resolve_cli_path(root: &Path, value: &str) -> String { + resolve_cli_path_buf(root, Path::new(value)) + .display() + .to_string() +} + +fn resolve_cli_path_buf(root: &Path, value: &Path) -> PathBuf { + if value.is_absolute() { + value.to_path_buf() + } else { + root.join(value) + } +} + +fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { + let headers = [ + "kind", + "name", + "path", + "tags", + "participant_id", + "backend", + "matched_rsid", + "assembly", + "genotype", + "ref_count", + "alt_count", + "depth", + "evidence", + ]; + let mut out = headers.join("\t"); + out.push('\n'); + for row in rows { + let line = headers + .iter() + .map(|header| { + row.get(*header) + .cloned() + .unwrap_or_default() + .replace('\t', " ") + }) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + out +} + +fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { + filters.iter().all(|filter| match filter.split_once('=') { + Some(("kind", value)) => value == "variant", + Some(("name", value)) => manifest.name.contains(value), + Some(("path", value)) => path.display().to_string().contains(value), + Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), + Some(_) | None => false, + }) +} + +fn resolve_manifest_path( + runtime_root: &Path, + manifest_path: &Path, + relative: &str, +) -> Result { + let base_dir = manifest_path + .parent() + .ok_or_else(|| 
format!("manifest has no parent: {}", manifest_path.display()))?; + let joined = base_dir.join(relative); + let canonical_root = runtime_root + .canonicalize() + .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; + let canonical_base = base_dir.canonicalize().map_err(|err| { + format!( + "failed to resolve manifest dir {}: {err}", + base_dir.display() + ) + })?; + let canonical_joined = joined + .canonicalize() + .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; + let boundary = if canonical_base.starts_with(&canonical_root) { + &canonical_root + } else { + &canonical_base + }; + if !canonical_joined.starts_with(boundary) { + return Err(format!( + "manifest member path escapes bioscript root: {}", + canonical_joined.display() + )); + } + Ok(canonical_joined) +} + +fn manifest_schema(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; + value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{} is missing schema", path.display())) +} + +fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { + if let Some(path) = loader.input_index.take() { + loader.input_index = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_file.take() { + loader.reference_file = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_index.take() { + loader.reference_index = Some(resolve_cli_path_buf(root, &path)); + } +} diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs new file mode 100644 index 0000000..d657f6c --- /dev/null +++ 
b/rust/bioscript-cli/src/report_execution.rs @@ -0,0 +1,308 @@ +fn run_manifest_rows_for_report( + runtime_root: &Path, + manifest_path: &Path, + input_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_text = input_file.display().to_string(); + match manifest_schema(manifest_path)?.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + Ok(vec![run_variant_manifest( + runtime_root, + &manifest, + Some(&input_text), + Some(participant_id), + loader, + )?]) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + run_panel_manifest( + runtime_root, + &manifest, + Some(&input_text), + Some(participant_id), + loader, + filters, + ) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + run_assay_manifest( + runtime_root, + &manifest, + Some(&input_text), + Some(participant_id), + loader, + filters, + ) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_manifest_analyses_for_report( + runtime_root: &Path, + manifest_path: &Path, + input_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, + output_dir: &Path, +) -> Result, String> { + match manifest_schema(manifest_path)?.as_str() { + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let mut analyses = Vec::new(); + analyses.extend(run_interpretations_for_report( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.interpretations, + input_file, + participant_id, + loader, + output_dir, + )?); + for member in &manifest.members { + if member.kind != "assay" { + continue; + } + let Some(path) = &member.path else { + continue; + }; + let resolved = resolve_manifest_path(runtime_root, &manifest.path, path)?; + analyses.extend(run_manifest_analyses_for_report( + runtime_root, + &resolved, + input_file, + participant_id, + 
loader, + output_dir, + )?); + } + Ok(analyses) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + run_interpretations_for_report( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.interpretations, + input_file, + participant_id, + loader, + output_dir, + ) + } + "bioscript:variant:1.0" | "bioscript:variant" => Ok(Vec::new()), + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +#[allow(clippy::too_many_arguments)] +fn run_interpretations_for_report( + runtime_root: &Path, + manifest_path: &Path, + manifest_name: &str, + interpretations: &[PanelInterpretation], + input_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, + output_dir: &Path, +) -> Result, String> { + let mut outputs = Vec::new(); + for interpretation in interpretations { + if interpretation.kind != "bioscript" { + return Err(format!( + "analysis '{}' uses unsupported kind '{}'", + interpretation.id, interpretation.kind + )); + } + let script_path = resolve_manifest_path(runtime_root, manifest_path, &interpretation.path)?; + let format = interpretation + .output_format + .as_deref() + .unwrap_or("json") + .to_ascii_lowercase(); + let analysis_dir = output_dir.join("analysis").join(participant_id); + fs::create_dir_all(&analysis_dir).map_err(|err| { + format!( + "failed to create analysis output dir {}: {err}", + analysis_dir.display() + ) + })?; + let extension = match format.as_str() { + "tsv" => "tsv", + "json" => "json", + "jsonl" => "jsonl", + other => return Err(format!("unsupported analysis output_format '{other}'")), + }; + let output_file = analysis_dir.join(format!("{}.{}", interpretation.id, extension)); + run_bioscript_analysis_script( + runtime_root, + &script_path, + input_file, + &output_file, + participant_id, + loader, + )?; + let rows = parse_analysis_output(&output_file, &format)?; + outputs.push(serde_json::json!({ + "schema": "bioscript:analysis-output:1.0", + "version": "1.0", + 
"participant_id": participant_id, + "assay_id": manifest_name, + "analysis_id": interpretation.id, + "kind": interpretation.kind, + "output_format": format, + "manifest_path": manifest_path.strip_prefix(runtime_root).unwrap_or(manifest_path).display().to_string(), + "script_path": script_path.strip_prefix(runtime_root).unwrap_or(&script_path).display().to_string(), + "output_file": output_file.strip_prefix(runtime_root).unwrap_or(&output_file).display().to_string(), + "derived_from": interpretation.derived_from.clone(), + "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ + "key": emit.key.clone(), + "label": emit.label.clone(), + "value_type": emit.value_type.clone(), + "format": emit.format.clone(), + })).collect::>(), + "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ + "description": logic.description.clone(), + "source": logic.source.as_ref().map(|source| serde_json::json!({ + "name": source.name.clone(), + "url": source.url.clone(), + })), + })), + "rows": rows, + })); + } + Ok(outputs) +} + +fn run_bioscript_analysis_script( + runtime_root: &Path, + script_path: &Path, + input_file: &Path, + output_file: &Path, + participant_id: &str, + loader: &GenotypeLoadOptions, +) -> Result<(), String> { + let limits = ResourceLimits::new() + .max_duration(Duration::from_secs(1)) + .max_memory(16 * 1024 * 1024) + .max_allocations(400_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + let runtime = BioscriptRuntime::with_config( + runtime_root.to_path_buf(), + RuntimeConfig { + limits, + loader: loader.clone(), + }, + ) + .map_err(|err| err.to_string())?; + runtime + .run_file( + script_path, + None, + vec![ + ( + "input_file", + monty::MontyObject::String(runtime_path_string(runtime_root, input_file)), + ), + ( + "output_file", + monty::MontyObject::String(runtime_path_string(runtime_root, output_file)), + ), + ( + "participant_id", + monty::MontyObject::String(participant_id.to_owned()), + ), + ], + ) + .map(|_| ()) + 
.map_err(|err| err.to_string()) +} + +fn runtime_path_string(runtime_root: &Path, path: &Path) -> String { + path.strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string() +} + +fn parse_analysis_output(path: &Path, format: &str) -> Result, String> { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read analysis output {}: {err}", path.display()))?; + match format { + "tsv" => Ok(parse_analysis_tsv(&text)), + "json" => { + let value: serde_json::Value = serde_json::from_str(&text).map_err(|err| { + format!("failed to parse analysis JSON {}: {err}", path.display()) + })?; + Ok(match value { + serde_json::Value::Array(rows) => rows, + serde_json::Value::Object(mut object) => object + .remove("rows") + .and_then(|rows| rows.as_array().cloned()) + .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), + other => vec![other], + }) + } + "jsonl" => text + .lines() + .filter(|line| !line.trim().is_empty()) + .map(|line| serde_json::from_str(line).map_err(|err| err.to_string())) + .collect(), + other => Err(format!("unsupported analysis output_format '{other}'")), + } +} + +fn parse_analysis_tsv(text: &str) -> Vec { + let mut lines = text.lines().filter(|line| !line.trim().is_empty()); + let Some(header_line) = lines.next() else { + return Vec::new(); + }; + let headers: Vec<&str> = header_line.split('\t').collect(); + let mut rows = Vec::new(); + for line in lines { + let values: Vec<&str> = line.split('\t').collect(); + let mut object = serde_json::Map::new(); + for (idx, header) in headers.iter().enumerate() { + object.insert( + (*header).to_owned(), + serde_json::Value::String(values.get(idx).copied().unwrap_or_default().to_owned()), + ); + } + rows.push(serde_json::Value::Object(object)); + } + rows +} + +fn app_assay_id(path: &Path) -> Result { + match manifest_schema(path)?.as_str() { + "bioscript:panel:1.0" => Ok(load_panel_manifest(path)?.name), + "bioscript:assay:1.0" => Ok(load_assay_manifest(path)?.name), + 
"bioscript:variant:1.0" | "bioscript:variant" => Ok(load_variant_manifest(path)?.name), + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn participant_id_from_path(path: &Path) -> String { + let file_name = path + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or("participant"); + file_name + .trim_end_matches(".txt.zip") + .trim_end_matches(".csv.zip") + .trim_end_matches(".vcf.gz") + .trim_end_matches(".cram") + .trim_end_matches(".zip") + .trim_end_matches(".txt") + .trim_end_matches(".csv") + .to_owned() +} diff --git a/rust/bioscript-cli/src/report_html.rs b/rust/bioscript-cli/src/report_html.rs new file mode 100644 index 0000000..caed479 --- /dev/null +++ b/rust/bioscript-cli/src/report_html.rs @@ -0,0 +1,3 @@ +include!("report_html_sections.rs"); +include!("report_html_pgx.rs"); +include!("report_html_helpers.rs"); diff --git a/rust/bioscript-cli/src/report_html_helpers.rs b/rust/bioscript-cli/src/report_html_helpers.rs new file mode 100644 index 0000000..d43ef46 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_helpers.rs @@ -0,0 +1,98 @@ +fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { + let escaped_id = html_escape(table_id); + let refs_control = if table_id == "observations-table" { + "" + } else { + "" + }; + let _ = write!( + out, + "
{refs_control}
" + ); + for (index, header) in headers.iter().enumerate() { + let _ = write!( + out, + "", + escaped_id, + index, + html_escape(header) + ); + } + out.push_str(""); +} + +fn render_table_end(out: &mut String) { + out.push_str("
{}
"); +} + +fn table_cell(out: &mut String, value: &str) { + class_cell(out, value, ""); +} + +fn class_cell(out: &mut String, value: &str, class_name: &str) { + if class_name.is_empty() { + let _ = write!(out, "{}", html_escape(value)); + } else { + let _ = write!( + out, + "{}", + class_name, + html_escape(value) + ); + } +} + +fn link_cell(out: &mut String, url: &str) { + if url.is_empty() { + out.push_str(""); + } else { + let escaped = html_escape(url); + let _ = write!( + out, + "source" + ); + } +} + +fn value_str<'a>(value: &'a serde_json::Value, key: &str) -> &'a str { + value + .get(key) + .and_then(serde_json::Value::as_str) + .unwrap_or_default() +} + +fn join_string_array(value: Option<&serde_json::Value>) -> String { + value + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(serde_json::Value::as_str) + .collect::>() + .join(", ") + }) + .unwrap_or_default() +} + +fn join_drugs(finding: &serde_json::Value) -> String { + finding + .get("drugs") + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(|drug| drug.get("name").and_then(serde_json::Value::as_str)) + .collect::>() + .join(", ") + }) + .unwrap_or_default() +} + +fn html_escape(value: &str) -> String { + value + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) +} + diff --git a/rust/bioscript-cli/src/report_html_pgx.rs b/rust/bioscript-cli/src/report_html_pgx.rs new file mode 100644 index 0000000..bbf0065 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_pgx.rs @@ -0,0 +1,218 @@ +fn render_pgx_label_table(out: &mut String, findings: &[serde_json::Value]) { + let headers = [ + "Variant", + "Ref/Alt", + "Genes", + "Drug(s)", + "Regulator", + "Action", + "Label", + "Evidence", + ]; + render_pgx_label_filters(out); + render_table_start(out, "labels-table", &headers); + for finding in findings { + let evidence = finding.get("evidence"); + let url = evidence + .and_then(|value| 
value.get("url")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let pgx_level = value_str(finding, "pgx_action_level"); + let _ = write!( + out, + "", + html_escape(&pgx_level_slug(pgx_level)) + ); + table_cell(out, value_str(finding, "variant")); + class_cell(out, &matched_ref_alt(finding), "mono"); + table_cell(out, &join_string_array(finding.get("genes"))); + table_cell(out, &join_drugs(finding)); + table_cell(out, &join_string_array(finding.get("regulatory_sources"))); + pgx_level_cell(out, pgx_level); + table_cell(out, value_str(finding, "label")); + link_cell(out, url); + out.push_str(""); + } + render_table_end(out); +} + +fn render_pgx_summary_table(out: &mut String, findings: &[serde_json::Value]) { + let headers = [ + "Variant", + "Ref/Alt", + "Genotype", + "Drug(s)", + "Category", + "Level", + "Phenotype", + "Effect", + "Evidence", + ]; + render_evidence_level_filters(out); + render_table_start(out, "summaries-table", &headers); + for finding in findings { + let effect = finding + .get("matched_effect") + .unwrap_or(&serde_json::Value::Null); + let evidence = finding.get("evidence"); + let url = evidence + .and_then(|value| value.get("url")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let evidence_level = value_str(finding, "evidence_level"); + let _ = write!( + out, + "", + html_escape(&evidence_level_group(evidence_level)) + ); + table_cell(out, value_str(finding, "variant")); + class_cell(out, &matched_ref_alt(finding), "mono"); + table_cell(out, value_str(effect, "label")); + table_cell(out, &join_drugs(finding)); + table_cell(out, &join_string_array(finding.get("phenotype_categories"))); + evidence_level_cell(out, evidence_level); + table_cell(out, &join_string_array(finding.get("phenotypes"))); + class_cell(out, value_str(effect, "text"), "effect"); + link_cell(out, url); + out.push_str(""); + } + render_table_end(out); +} + +fn render_evidence_level_filters(out: &mut String) { + out.push_str("
Evidence:"); + for (level, label) in [ + ("1", "Level 1"), + ("1a", "Level 1A"), + ("1b", "Level 1B"), + ("2", "Level 2"), + ("2a", "Level 2A"), + ("2b", "Level 2B"), + ("3", "Level 3"), + ("4", "Level 4"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str(""); + out.push_str("i
"); +} + +fn render_pgx_label_filters(out: &mut String) { + out.push_str("
PGx level:"); + for (level, label) in [ + ("required", "Testing Required"), + ("recommended", "Testing Recommended"), + ("actionable", "Actionable PGx"), + ("informative", "Informative PGx"), + ("no-clinical", "No Clinical PGx"), + ("criteria", "Criteria Not Met"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str(""); + out.push_str("i
"); +} + +fn matched_ref_alt(finding: &serde_json::Value) -> String { + let Some(observation) = finding.get("matched_observation") else { + return String::new(); + }; + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if ref_allele.is_empty() && alt_allele.is_empty() { + String::new() + } else { + let alt_display = alt_allele.replace(',', "/"); + format!("{ref_allele}->{alt_display}") + } +} + +fn evidence_level_group(level: &str) -> String { + let normalized = level.trim().to_ascii_lowercase(); + if normalized.starts_with("1a") { + "1a".to_owned() + } else if normalized.starts_with("1b") { + "1b".to_owned() + } else if normalized.starts_with('1') { + "1".to_owned() + } else if normalized.starts_with("2a") { + "2a".to_owned() + } else if normalized.starts_with("2b") { + "2b".to_owned() + } else if normalized.starts_with('2') { + "2".to_owned() + } else if normalized.starts_with('3') { + "3".to_owned() + } else if normalized.starts_with('4') { + "4".to_owned() + } else { + "unknown".to_owned() + } +} + +fn evidence_level_color_group(level: &str) -> String { + level + .chars() + .find(char::is_ascii_digit) + .map_or_else(|| "unknown".to_owned(), |ch| ch.to_string()) +} + +fn evidence_level_cell(out: &mut String, level: &str) { + if level.is_empty() { + out.push_str(""); + return; + } + let group = evidence_level_color_group(level); + let _ = write!( + out, + "{}", + html_escape(&group), + html_escape(level) + ); +} + +fn pgx_level_slug(level: &str) -> String { + let normalized = level.to_ascii_lowercase(); + if normalized.contains("required") { + "required".to_owned() + } else if normalized.contains("recommended") { + "recommended".to_owned() + } else if normalized.contains("actionable") { + "actionable".to_owned() + } else if normalized.contains("informative") { + "informative".to_owned() + } else if 
normalized.contains("no clinical") { + "no-clinical".to_owned() + } else if normalized.contains("criteria") { + "criteria".to_owned() + } else { + "unknown".to_owned() + } +} + +fn pgx_level_cell(out: &mut String, level: &str) { + if level.is_empty() { + out.push_str(""); + return; + } + let slug = pgx_level_slug(level); + let _ = write!( + out, + "{}", + html_escape(&slug), + html_escape(level) + ); +} + diff --git a/rust/bioscript-cli/src/report_html_sections.rs b/rust/bioscript-cli/src/report_html_sections.rs new file mode 100644 index 0000000..61ce304 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_sections.rs @@ -0,0 +1,268 @@ +fn collect_report_analyses(reports: &[serde_json::Value]) -> Vec { + reports + .iter() + .filter_map(|report| report.get("analyses").and_then(serde_json::Value::as_array)) + .flat_map(|analyses| analyses.iter()) + .cloned() + .collect() +} + +fn collect_report_findings(reports: &[serde_json::Value], schema: &str) -> Vec { + reports + .iter() + .filter_map(|report| report.get("findings").and_then(serde_json::Value::as_array)) + .flat_map(|findings| findings.iter()) + .filter(|finding| finding.get("schema").and_then(serde_json::Value::as_str) == Some(schema)) + .cloned() + .collect() +} + +fn render_analysis_tables(out: &mut String, analyses: &[serde_json::Value]) { + if analyses.is_empty() { + out.push_str("

No analysis outputs.

"); + return; + } + for (index, analysis) in analyses.iter().enumerate() { + let table_id = format!("analysis-table-{index}"); + let title = format!( + "{} / {}", + value_str(analysis, "participant_id"), + value_str(analysis, "analysis_id") + ); + let _ = write!(out, "

{}

", html_escape(&title)); + render_analysis_logic(out, analysis); + let rows = analysis + .get("rows") + .and_then(serde_json::Value::as_array) + .cloned() + .unwrap_or_default(); + if rows.is_empty() { + out.push_str("

No rows emitted.

"); + continue; + } + let headers = analysis_row_headers(&rows); + let header_refs = headers.iter().map(String::as_str).collect::>(); + render_table_start(out, &table_id, &header_refs); + for row in rows { + out.push_str(""); + for header in &headers { + table_cell(out, &json_field_as_tsv(row.get(header))); + } + out.push_str(""); + } + render_table_end(out); + } +} + +fn analysis_row_headers(rows: &[serde_json::Value]) -> Vec { + let mut headers = Vec::new(); + for row in rows { + let Some(object) = row.as_object() else { + continue; + }; + for key in object.keys() { + if !headers.contains(key) { + headers.push(key.clone()); + } + } + } + headers +} + +fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { + let Some(logic) = analysis.get("logic") else { + return; + }; + if logic.is_null() { + return; + } + let description = logic + .get("description") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let source = logic.get("source").unwrap_or(&serde_json::Value::Null); + let source_name = source + .get("name") + .and_then(serde_json::Value::as_str) + .unwrap_or("source"); + let source_url = source + .get("url") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + out.push_str("
"); + if !description.is_empty() { + let _ = write!(out, "

{}

", html_escape(description)); + } + if !source_url.is_empty() { + let _ = write!( + out, + "

Logic source: {}

", + html_escape(source_url), + html_escape(source_name) + ); + } + out.push_str("
"); +} + +fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { + let mut links = BTreeMap::::new(); + for report in reports { + collect_provenance_links_from_value(report, &mut links); + } + if links.is_empty() { + out.push_str("

No provenance links.

"); + return; + } + out.push_str("
    "); + for (url, label) in links { + let display = if label.is_empty() { &url } else { &label }; + let _ = write!( + out, + "
  • {}
    {}
  • ", + html_escape(&url), + html_escape(display), + html_escape(&url) + ); + } + out.push_str("
"); +} + +fn collect_provenance_links_from_value( + value: &serde_json::Value, + links: &mut BTreeMap, +) { + match value { + serde_json::Value::Object(object) => { + if let Some(url) = object.get("url").and_then(serde_json::Value::as_str) + && url.starts_with("http") + { + let label = object + .get("name") + .or_else(|| object.get("label")) + .or_else(|| object.get("source")) + .and_then(value_as_string) + .unwrap_or_default(); + links.entry(url.to_owned()).or_insert(label); + } + for child in object.values() { + collect_provenance_links_from_value(child, links); + } + } + serde_json::Value::Array(items) => { + for item in items { + collect_provenance_links_from_value(item, links); + } + } + _ => {} + } +} + +fn render_observation_table(out: &mut String, observations: &[serde_json::Value]) { + let headers = [ + "participant_id", + "rsid", + "ref", + "alt", + "genotype_display", + "genotype", + "zygosity", + "outcome", + "match_status", + "coverage_status", + "call_status", + "assembly", + "chrom", + "pos_start", + "pos_end", + "kind", + "ref_count", + "alt_count", + "depth", + "genotype_quality", + "allele_balance", + "evidence_type", + "evidence_raw", + "facets", + "assay_id", + "assay_version", + "variant_key", + ]; + render_table_start(out, "observations-table", &headers); + for observation in observations { + let _ = write!(out, "", observation_row_class(observation)); + for header in headers { + render_observation_cell(out, observation, header); + } + out.push_str(""); + } + out.push_str(""); +} + +fn observation_row_class(observation: &serde_json::Value) -> &'static str { + match observation + .get("outcome") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "variant" => "row-variant", + "reference" => "row-reference", + _ => "", + } +} + +fn render_observation_cell(out: &mut String, observation: &serde_json::Value, header: &str) { + if header == "genotype_display" { + let outcome = observation + .get("outcome") + 
.and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let value = json_field_as_tsv(observation.get(header)); + if outcome == "variant" { + let alt = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let _ = write!( + out, + "{}", + highlight_allele(&value, alt) + ); + return; + } + } + let _ = write!( + out, + "{}", + html_escape(&json_field_as_tsv(observation.get(header))) + ); +} + +fn highlight_allele(value: &str, allele: &str) -> String { + if value.is_empty() || allele.is_empty() { + return html_escape(value); + } + if allele.chars().count() == 1 { + let target = allele + .chars() + .next() + .unwrap_or_default() + .to_ascii_uppercase(); + let mut out = String::new(); + for ch in value.chars() { + let escaped = html_escape(&ch.to_string()); + if ch.to_ascii_uppercase() == target { + let _ = write!(out, "{escaped}"); + } else { + out.push_str(&escaped); + } + } + return out; + } + let escaped_value = html_escape(value); + let escaped_allele = html_escape(allele); + escaped_value.replace( + &escaped_allele, + &format!("{escaped_allele}"), + ) +} + diff --git a/rust/bioscript-cli/src/report_matching.rs b/rust/bioscript-cli/src/report_matching.rs new file mode 100644 index 0000000..76f4f93 --- /dev/null +++ b/rust/bioscript-cli/src/report_matching.rs @@ -0,0 +1,292 @@ +fn app_finding_match_observation<'a>( + finding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let binding = finding.get("binding")?; + match binding.get("source").and_then(serde_json::Value::as_str) { + Some("variant") => app_variant_binding_match_observation(binding, observations), + _ => None, + } +} + +fn app_finding_match_analysis( + finding: &serde_json::Value, + analyses: &[serde_json::Value], +) -> Option { + let binding = finding.get("binding")?; + if binding.get("source").and_then(serde_json::Value::as_str) != Some("analysis") { + return None; + } + let analysis_id = binding + 
.get("analysis_id") + .or_else(|| binding.get("analysis")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let key = binding.get("key").and_then(serde_json::Value::as_str)?; + for analysis in analyses { + if !analysis_id.is_empty() + && analysis + .get("analysis_id") + .and_then(serde_json::Value::as_str) + != Some(analysis_id) + { + continue; + } + let Some(rows) = analysis.get("rows").and_then(serde_json::Value::as_array) else { + continue; + }; + for row in rows { + if app_binding_matches_value(row.get(key), binding) { + return Some(serde_json::json!({ + "participant_id": analysis.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "assay_id": analysis.get("assay_id").cloned().unwrap_or(serde_json::Value::Null), + "analysis_id": analysis.get("analysis_id").cloned().unwrap_or(serde_json::Value::Null), + "key": key, + "value": row.get(key).cloned().unwrap_or(serde_json::Value::Null), + "row": row, + })); + } + } + } + None +} + +fn app_variant_binding_match_observation<'a>( + binding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let operator = binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or("equals"); + if matches!(operator, "dosage_equals" | "dosage_in") { + let allele = binding + .get("allele") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + return observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .find(|observation| { + let dosage = app_observation_allele_dosage(observation, allele); + app_binding_matches_dosage(dosage, binding) + }); + } + + let key = binding + .get("key") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if key.is_empty() { + return None; + } + observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .find(|observation| app_binding_matches_value(observation.get(key), binding)) +} + +fn 
app_finding_observation_context(observation: &serde_json::Value) -> serde_json::Value { + serde_json::json!({ + "participant_id": observation.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "rsid": observation.get("rsid").cloned().unwrap_or(serde_json::Value::Null), + "ref": observation.get("ref").cloned().unwrap_or(serde_json::Value::Null), + "alt": observation.get("alt").cloned().unwrap_or(serde_json::Value::Null), + "genotype_display": observation.get("genotype_display").cloned().unwrap_or(serde_json::Value::Null), + "outcome": observation.get("outcome").cloned().unwrap_or(serde_json::Value::Null), + }) +} + +fn app_variant_ref_mismatch(binding: &serde_json::Value, observation: &serde_json::Value) -> bool { + let variant_ref = binding + .get("variant") + .or_else(|| binding.get("path")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if variant_ref.is_empty() { + return false; + } + let basename = Path::new(variant_ref) + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or(variant_ref); + let candidates = [ + observation + .get("variant_key") + .and_then(serde_json::Value::as_str), + observation + .get("variant_path") + .and_then(serde_json::Value::as_str), + observation.get("rsid").and_then(serde_json::Value::as_str), + ]; + !candidates.into_iter().flatten().any(|candidate| { + candidate == variant_ref + || Path::new(candidate) + .file_name() + .and_then(|value| value.to_str()) + .is_some_and(|value| value == basename) + }) +} + +fn app_observation_allele_dosage(observation: &serde_json::Value, allele: &str) -> Option { + if allele.is_empty() { + return None; + } + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let zygosity = observation + .get("zygosity") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele == ref_allele { + 
return match zygosity { + "hom_ref" => Some(2), + "het" => Some(1), + "hom_alt" => Some(0), + _ => None, + }; + } + if allele == alt_allele { + return match zygosity { + "hom_ref" => Some(0), + "het" => Some(1), + "hom_alt" => Some(2), + _ => None, + }; + } + let display = observation + .get("genotype_display") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele.len() == 1 { + let allele_ch = allele.chars().next()?.to_ascii_uppercase(); + return display + .chars() + .filter(|ch| ch.to_ascii_uppercase() == allele_ch) + .count() + .try_into() + .ok(); + } + None +} + +fn app_binding_matches_value( + actual: Option<&serde_json::Value>, + binding: &serde_json::Value, +) -> bool { + let actual = actual.and_then(value_as_string).unwrap_or_default(); + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or("equals") + { + "equals" => binding + .get("value") + .and_then(value_as_string) + .is_some_and(|value| value == actual), + "in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(value_as_string) + .any(|value| value == actual) + }), + _ => false, + } +} + +fn app_binding_matches_dosage(dosage: Option, binding: &serde_json::Value) -> bool { + let Some(dosage) = dosage else { + return false; + }; + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "dosage_equals" => binding + .get("value") + .and_then(serde_json::Value::as_i64) + .is_some_and(|value| value == dosage), + "dosage_in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(serde_json::Value::as_i64) + .any(|value| value == dosage) + }), + _ => false, + } +} + +fn value_as_string(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::String(value) => Some(value.clone()), + serde_json::Value::Number(value) => Some(value.to_string()), + 
serde_json::Value::Bool(value) => Some(value.to_string()), + _ => None, + } +} + +fn app_finding_dedupe_key(finding: &serde_json::Value) -> String { + let effect_key = finding + .get("matched_effect") + .and_then(|effect| { + effect + .get("id") + .or_else(|| effect.get("label")) + .or_else(|| effect.get("text")) + }) + .and_then(value_as_string) + .unwrap_or_default(); + if let Some(evidence) = finding.get("evidence") { + let source = evidence + .get("source") + .and_then(value_as_string) + .unwrap_or_default(); + let kind = evidence + .get("kind") + .and_then(value_as_string) + .unwrap_or_default(); + let id = evidence + .get("id") + .and_then(value_as_string) + .unwrap_or_default(); + if !source.is_empty() || !kind.is_empty() || !id.is_empty() { + return format!("evidence|{source}|{kind}|{id}|{effect_key}"); + } + if let Some(url) = evidence.get("url").and_then(value_as_string) { + return format!("evidence_url|{url}|{effect_key}"); + } + } + if let Some(id) = finding.get("id").and_then(value_as_string) { + return format!("id|{id}|{effect_key}"); + } + format!( + "content|{}|{}|{}|{}", + finding + .get("schema") + .and_then(value_as_string) + .unwrap_or_default(), + finding + .get("label") + .and_then(value_as_string) + .unwrap_or_default(), + finding + .get("notes") + .and_then(value_as_string) + .unwrap_or_default(), + effect_key + ) +} + diff --git a/rust/bioscript-cli/src/report_observations.rs b/rust/bioscript-cli/src/report_observations.rs new file mode 100644 index 0000000..73ddd46 --- /dev/null +++ b/rust/bioscript-cli/src/report_observations.rs @@ -0,0 +1,391 @@ +fn app_observation_from_manifest_row( + runtime_root: &Path, + row: &BTreeMap, + assay_id: &str, +) -> Result { + let row_path = row.get("path").cloned().unwrap_or_default(); + let manifest_path = if Path::new(&row_path).is_absolute() { + PathBuf::from(&row_path) + } else { + runtime_root.join(&row_path) + }; + let manifest = load_variant_manifest(&manifest_path)?; + let ref_allele = 
manifest.spec.reference.clone().unwrap_or_default(); + let genotype_display = row.get("genotype").cloned().unwrap_or_default(); + let alt_alleles = variant_alt_alleles(&manifest_path)?; + let alt_allele = observed_alt_allele(&genotype_display, &ref_allele, &alt_alleles) + .or_else(|| manifest.spec.alternate.clone()) + .unwrap_or_default(); + let (genotype, zygosity) = normalize_app_genotype(&genotype_display, &ref_allele, &alt_allele); + let depth = parse_optional_u32(row.get("depth")); + let ref_count = parse_optional_u32(row.get("ref_count")); + let alt_count = parse_optional_u32(row.get("alt_count")); + let allele_balance = match (alt_count, depth) { + (Some(alt_count), Some(depth)) if depth > 0 => { + Some(f64::from(alt_count) / f64::from(depth)) + } + _ => None, + }; + let assembly = row.get("assembly").cloned().unwrap_or_default(); + let locus = if assembly.eq_ignore_ascii_case("grch37") { + manifest.spec.grch37.as_ref() + } else { + manifest + .spec + .grch38 + .as_ref() + .or(manifest.spec.grch37.as_ref()) + }; + let outcome = if genotype == "./." 
{ + "no_call" + } else if zygosity == "hom_ref" { + "reference" + } else if zygosity == "het" || zygosity == "hom_alt" { + "variant" + } else { + "unknown" + }; + let evidence_raw = row.get("evidence").cloned().unwrap_or_default(); + Ok(serde_json::json!({ + "participant_id": row.get("participant_id").cloned().unwrap_or_default(), + "assay_id": assay_id, + "assay_version": "1.0", + "variant_key": manifest.name, + "variant_path": row_path, + "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), + "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, + "chrom": locus.map_or(String::new(), |locus| locus.chrom.clone()), + "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), + "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), + "ref": ref_allele, + "alt": alt_allele, + "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), + "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, + "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), + "call_status": if genotype == "./." 
{ "no_call" } else { "called" }, + "genotype": genotype, + "genotype_display": genotype_display, + "zygosity": zygosity, + "ref_count": ref_count, + "alt_count": alt_count, + "depth": depth, + "genotype_quality": serde_json::Value::Null, + "allele_balance": allele_balance, + "outcome": outcome, + "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, + "evidence_raw": evidence_raw, + "facets": serde_json::Value::Null, + })) +} + +fn variant_alt_alleles(path: &Path) -> Result, String> { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read variant YAML {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse variant YAML {}: {err}", path.display()))?; + let Some(items) = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("alleles".to_owned()))) + .and_then(serde_yaml::Value::as_mapping) + .and_then(|mapping| { + mapping + .get(serde_yaml::Value::String("observed_alts".to_owned())) + .or_else(|| mapping.get(serde_yaml::Value::String("alts".to_owned()))) + }) + .and_then(serde_yaml::Value::as_sequence) + else { + return Ok(Vec::new()); + }; + Ok(items + .iter() + .filter_map(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .collect()) +} + +fn observed_alt_allele( + genotype_display: &str, + ref_allele: &str, + alts: &[String], +) -> Option { + if ref_allele.len() != 1 { + return None; + } + let ref_ch = ref_allele.chars().next()?; + genotype_display + .chars() + .filter(|ch| ch.is_ascii_alphabetic() && *ch != ref_ch) + .find_map(|ch| { + alts.iter() + .find(|alt| alt.len() == 1 && alt.starts_with(ch)) + .cloned() + }) +} + +fn normalize_app_genotype(display: &str, ref_allele: &str, alt_allele: &str) -> (String, String) { + if display.is_empty() { + return ("./.".to_owned(), "unknown".to_owned()); + } + let alleles: Vec = 
display.chars().filter(char::is_ascii_alphabetic).collect(); + if alleles.len() != 2 || ref_allele.len() != 1 || alt_allele.len() != 1 { + return (display.to_owned(), "unknown".to_owned()); + } + let ref_ch = ref_allele.chars().next().unwrap_or_default(); + let alt_ch = alt_allele.chars().next().unwrap_or_default(); + let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); + let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); + match (ref_count, alt_count) { + (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), + (1, 1) => ("0/1".to_owned(), "het".to_owned()), + (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), + _ => (display.to_owned(), "unknown".to_owned()), + } +} + +fn parse_optional_u32(value: Option<&String>) -> Option { + value.and_then(|value| value.parse::().ok()) +} + +fn load_manifest_findings( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut findings = Vec::new(); + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) && let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let include = json_item + .get("include") + .and_then(serde_json::Value::as_str) + .map(str::to_owned); + if let Some(include) = include { + let include_path = resolve_manifest_path(root, manifest_path, &include)?; + let mut included = load_manifest_findings(root, &include_path)?; + let inherited_binding = json_item.get("binding").cloned(); + for included_item in &mut included { + if inherited_binding.is_some() + && included_item.get("binding").is_none() + && included_item.get("effects").is_none() + && let Some(object) = included_item.as_object_mut() + { 
+ object.insert( + "binding".to_owned(), + inherited_binding.clone().unwrap_or(serde_json::Value::Null), + ); + } + } + findings.extend(included); + continue; + } + if json_item.get("include").is_none() { + findings.push(json_item); + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") + && let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + findings.extend(load_manifest_findings(root, &member_path)?); + } + } + + Ok(findings) +} + +fn load_yaml_value(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + +fn yaml_to_json(value: serde_yaml::Value) -> Result { + serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) +} + +fn load_manifest_provenance_links( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut links = BTreeMap::::new(); + collect_manifest_provenance_entries(&value, &mut links)?; + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) && let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let Some(include) = 
json_item.get("include").and_then(serde_json::Value::as_str) else { + continue; + }; + let include_path = resolve_manifest_path(root, manifest_path, include)?; + for item in load_manifest_provenance_links(root, &include_path)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") + && let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + for item in load_manifest_provenance_links(root, &member_path)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + + Ok(links.into_values().collect()) +} + +fn collect_manifest_provenance_entries( + value: &serde_yaml::Value, + links: &mut BTreeMap, +) -> Result<(), String> { + if let Some(sources) = value + .get("provenance") + .and_then(|provenance| provenance.get("sources")) + .and_then(serde_yaml::Value::as_sequence) + { + for source in sources { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + } + if let Some(source) = value.get("source") { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + Ok(()) +} + +fn match_app_findings( + findings: &[serde_json::Value], + observations: &[serde_json::Value], + analyses: &[serde_json::Value], +) -> Vec { + let mut 
matched = Vec::new(); + let mut seen = std::collections::BTreeSet::new(); + for finding in findings { + if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { + for effect in effects { + if let Some(observation) = app_finding_match_observation(effect, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + } else if let Some(observation) = app_finding_match_observation(finding, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = 
app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + matched +} + diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs new file mode 100644 index 0000000..b988a08 --- /dev/null +++ b/rust/bioscript-cli/src/report_options.rs @@ -0,0 +1,225 @@ +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AppOutputFormat { + Tsv, + Json, + Jsonl, + Both, +} + +struct AppReportOptions { + manifest_path: PathBuf, + input_files: Vec, + output_dir: PathBuf, + root: PathBuf, + html: bool, + observations_format: AppOutputFormat, + reports_format: AppOutputFormat, + loader: GenotypeLoadOptions, + filters: Vec, +} + +fn run_app_report(args: Vec) -> Result<(), String> { + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let mut manifest_path: Option = None; + let mut input_files: Vec = Vec::new(); + let mut output_dir: Option = None; + let mut root: Option = None; + let mut html = false; + let mut observations_format = AppOutputFormat::Tsv; + let mut reports_format = AppOutputFormat::Jsonl; + let mut filters = Vec::new(); + let mut loader = GenotypeLoadOptions::default(); + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--input-file" => input_files.push(PathBuf::from( + iter.next().ok_or("--input-file requires a path")?, + )), + "--output-dir" => { + output_dir = Some(PathBuf::from( + iter.next().ok_or("--output-dir requires a path")?, + )); + } + "--root" => { + root = Some(PathBuf::from( + iter.next().ok_or("--root requires a directory")?, + )); + } + "--html" => html = true, + "--filter" => filters.push(iter.next().ok_or("--filter requires key=value")?), + "--observations-format" => { + observations_format = parse_app_output_format( + &iter + .next() + .ok_or("--observations-format requires a value")?, + )?; + } + "--reports-format" => { + reports_format = parse_app_output_format( + 
&iter.next().ok_or("--reports-format requires a value")?, + )?; + } + "--input-format" => { + let value = iter.next().ok_or("--input-format requires a value")?; + if value.eq_ignore_ascii_case("auto") { + loader.format = None; + } else { + loader.format = + Some(value.parse::().map_err(|err| { + format!("invalid --input-format value {value}: {err}") + })?); + } + } + "--input-index" => { + loader.input_index = Some(PathBuf::from( + iter.next().ok_or("--input-index requires a path")?, + )); + } + "--reference-file" => { + loader.reference_file = Some(PathBuf::from( + iter.next().ok_or("--reference-file requires a path")?, + )); + } + "--reference-index" => { + loader.reference_index = Some(PathBuf::from( + iter.next().ok_or("--reference-index requires a path")?, + )); + } + value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), + value => { + if manifest_path.is_none() { + manifest_path = Some(PathBuf::from(value)); + } else { + input_files.push(PathBuf::from(value)); + } + } + } + } + + let Some(manifest_path) = manifest_path else { + return Err("usage: bioscript report --input-file [--input-file ...] 
--output-dir [--html]".to_owned()); + }; + if input_files.is_empty() { + return Err("bioscript report requires at least one --input-file".to_owned()); + } + let output_dir = output_dir.ok_or("bioscript report requires --output-dir")?; + let root = root.unwrap_or(cwd); + normalize_loader_paths(&root, &mut loader); + + let options = AppReportOptions { + manifest_path: absolutize(&root, &manifest_path), + input_files: input_files + .iter() + .map(|path| absolutize(&root, path)) + .collect(), + output_dir: absolutize(&root, &output_dir), + root, + html, + observations_format, + reports_format, + loader, + filters, + }; + generate_app_report(&options) +} + +fn parse_app_output_format(value: &str) -> Result { + match value { + "tsv" => Ok(AppOutputFormat::Tsv), + "json" => Ok(AppOutputFormat::Json), + "jsonl" => Ok(AppOutputFormat::Jsonl), + "both" => Ok(AppOutputFormat::Both), + other => Err(format!( + "unsupported output format '{other}'; expected tsv, json, jsonl, or both" + )), + } +} + +fn absolutize(root: &Path, path: &Path) -> PathBuf { + if path.is_absolute() { + path.to_path_buf() + } else { + root.join(path) + } +} + +fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { + fs::create_dir_all(&options.output_dir).map_err(|err| { + format!( + "failed to create output dir {}: {err}", + options.output_dir.display() + ) + })?; + + let assay_id = app_assay_id(&options.manifest_path)?; + let findings = load_manifest_findings(&options.root, &options.manifest_path)?; + let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; + let mut observations = Vec::new(); + let mut analyses = Vec::new(); + let mut reports = Vec::new(); + + for input_file in &options.input_files { + let participant_id = participant_id_from_path(input_file); + let rows = run_manifest_rows_for_report( + &options.root, + &options.manifest_path, + input_file, + &participant_id, + &options.loader, + &options.filters, + )?; + let input_observations = 
rows + .iter() + .map(|row| app_observation_from_manifest_row(&options.root, row, &assay_id)) + .collect::, _>>()?; + observations.extend(input_observations.clone()); + let input_analyses = run_manifest_analyses_for_report( + &options.root, + &options.manifest_path, + input_file, + &participant_id, + &options.loader, + &options.output_dir, + )?; + analyses.extend(input_analyses.clone()); + let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); + reports.push(app_report_json( + &assay_id, + &participant_id, + input_file, + &input_observations, + &input_analyses, + &matched_findings, + &provenance, + )); + } + + write_app_observations( + &options.output_dir, + &observations, + options.observations_format, + )?; + write_app_analyses(&options.output_dir, &analyses)?; + write_app_reports(&options.output_dir, &reports, options.reports_format)?; + if options.html { + write_app_html(&options.output_dir, &observations, &reports)?; + } + + println!( + "observations: {}", + options.output_dir.join("observations.tsv").display() + ); + println!( + "analysis: {}", + options.output_dir.join("analysis.jsonl").display() + ); + println!( + "reports: {}", + options.output_dir.join("reports.jsonl").display() + ); + if options.html { + println!("html: {}", options.output_dir.join("index.html").display()); + } + Ok(()) +} diff --git a/rust/bioscript-cli/src/report_output.rs b/rust/bioscript-cli/src/report_output.rs new file mode 100644 index 0000000..63fea7c --- /dev/null +++ b/rust/bioscript-cli/src/report_output.rs @@ -0,0 +1,165 @@ +fn app_report_json( + assay_id: &str, + participant_id: &str, + input_file: &Path, + observations: &[serde_json::Value], + analyses: &[serde_json::Value], + findings: &[serde_json::Value], + provenance: &[serde_json::Value], +) -> serde_json::Value { + let called = observations + .iter() + .filter(|item| { + item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") + }) + .count(); + 
serde_json::json!({ + "schema": "bioscript:report:1.0", + "version": "1.0", + "participant_id": participant_id, + "assay_id": assay_id, + "assay_version": "1.0", + "input": { + "file_name": input_file.file_name().and_then(|value| value.to_str()).unwrap_or_default(), + "file_path": input_file.display().to_string(), + }, + "report_status": if called == observations.len() { "complete" } else { "partial" }, + "derived_from": observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), + "analyses": analyses, + "findings": findings, + "provenance": provenance, + "metrics": { + "n_sites_tested": observations.len(), + "n_sites_called": called, + "n_sites_missing": observations.len().saturating_sub(called), + "n_analyses": analyses.len(), + "n_findings_matched": findings.len(), + } + }) +} + +fn write_app_observations( + output_dir: &Path, + observations: &[serde_json::Value], + format: AppOutputFormat, +) -> Result<(), String> { + if matches!(format, AppOutputFormat::Tsv | AppOutputFormat::Both) { + let mut out = bioscript_core::OBSERVATION_TSV_HEADERS.join("\t"); + out.push('\n'); + for observation in observations { + let line = bioscript_core::OBSERVATION_TSV_HEADERS + .iter() + .map(|header| json_field_as_tsv(observation.get(*header))) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + fs::write(output_dir.join("observations.tsv"), out) + .map_err(|err| format!("failed to write observations.tsv: {err}"))?; + } + if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { + write_jsonl(&output_dir.join("observations.jsonl"), observations)?; + } + if matches!(format, AppOutputFormat::Json) { + write_json_pretty( + &output_dir.join("observations.json"), + &serde_json::json!({"observations": observations}), + )?; + } + Ok(()) +} + +fn write_app_analyses(output_dir: &Path, analyses: &[serde_json::Value]) -> Result<(), String> { + write_jsonl(&output_dir.join("analysis.jsonl"), analyses) +} + +fn 
write_app_reports( + output_dir: &Path, + reports: &[serde_json::Value], + format: AppOutputFormat, +) -> Result<(), String> { + if matches!(format, AppOutputFormat::Jsonl | AppOutputFormat::Both) { + write_jsonl(&output_dir.join("reports.jsonl"), reports)?; + } + if matches!(format, AppOutputFormat::Json | AppOutputFormat::Both) { + write_json_pretty( + &output_dir.join("reports.json"), + &serde_json::json!({ + "schema": "bioscript:report-set:1.0", + "version": "1.0", + "reports": reports, + }), + )?; + } + Ok(()) +} + +fn write_jsonl(path: &Path, rows: &[serde_json::Value]) -> Result<(), String> { + let mut out = String::new(); + for row in rows { + let line = serde_json::to_string(row).map_err(|err| err.to_string())?; + out.push_str(&line); + out.push('\n'); + } + fs::write(path, out).map_err(|err| format!("failed to write {}: {err}", path.display())) +} + +fn write_json_pretty(path: &Path, value: &serde_json::Value) -> Result<(), String> { + let text = serde_json::to_string_pretty(value).map_err(|err| err.to_string())?; + fs::write(path, text).map_err(|err| format!("failed to write {}: {err}", path.display())) +} + +fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { + match value { + Some(serde_json::Value::Null) | None => String::new(), + Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), + Some(value) => value.to_string().replace(['\t', '\n'], " "), + } +} + +fn write_app_html( + output_dir: &Path, + observations: &[serde_json::Value], + reports: &[serde_json::Value], +) -> Result<(), String> { + let mut out = String::from( + r##"BioScript report

BioScript Report

"##, + ); + let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); + let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); + let analysis_outputs = collect_report_analyses(reports); + let _ = write!( + out, + "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", + observations.len(), + analysis_outputs.len(), + label_findings.len(), + summary_findings.len() + ); + out.push_str(""); + out.push_str("

Observations

"); + render_observation_table(&mut out, observations); + out.push_str("
"); + out.push_str("

Analysis

"); + render_analysis_tables(&mut out, &analysis_outputs); + out.push_str("
"); + out.push_str("

PGx Label Annotations

"); + render_pgx_label_table(&mut out, &label_findings); + out.push_str("
"); + out.push_str("

PGx Summary Annotations

"); + render_pgx_summary_table(&mut out, &summary_findings); + out.push_str("
"); + out.push_str("

Provenance

"); + render_provenance_links(&mut out, reports); + out.push_str("
"); + out.push_str("

Raw Reports JSON

"); + for report in reports { + let text = serde_json::to_string_pretty(report).map_err(|err| err.to_string())?; + let _ = write!(out, "
{}
", html_escape(&text)); + } + out.push_str("
"); + fs::write(output_dir.join("index.html"), out) + .map_err(|err| format!("failed to write index.html: {err}")) +} + diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 13a3bb4..e7e4bb8 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -1,8 +1,7 @@ use std::{ - ffi::OsStr, fs, path::PathBuf, - process::{Command, Output}, + process::Command, time::{SystemTime, UNIX_EPOCH}, }; @@ -28,27 +27,355 @@ fn temp_dir(label: &str) -> PathBuf { dir } -fn run_bioscript(root: &PathBuf, args: I) -> Output -where - I: IntoIterator, - S: AsRef, -{ - Command::new(env!("CARGO_BIN_EXE_bioscript")) - .current_dir(root) - .args(args) +#[test] +fn hello_world_script_runs_via_cli_and_writes_within_root() { + let root = repo_root(); + let output_path = root.join("bioscripts/output/hello-world.txt"); + if output_path.exists() { + fs::remove_file(&output_path).unwrap(); + } + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("bioscripts/hello-world.py") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("hello from bioscript")); + assert!(stdout.contains("2 + 3 = 5")); + + let written = fs::read_to_string(output_path).unwrap(); + assert!(written.contains("hello from bioscript")); + assert!(written.contains("loaded: sample input for bioscript")); +} + +#[test] +fn path_escape_is_rejected() { + let root = repo_root(); + let script = root.join("rust/bioscript-cli/tests/fixtures/path_escape.py"); + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg(script) + .output() + .unwrap(); + + assert!(!output.status.success()); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("path escapes bioscript root")); +} + +#[test] +fn trace_report_is_written_for_hello_world() { 
+ let root = repo_root(); + let trace_path = root.join("bioscripts/output/hello-world.trace.tsv"); + if trace_path.exists() { + fs::remove_file(&trace_path).unwrap(); + } + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--trace-report") + .arg("bioscripts/output/hello-world.trace.tsv") + .arg("bioscripts/hello-world.py") + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let trace = fs::read_to_string(trace_path).unwrap(); + assert!(trace.contains("step\tline\tcode")); + assert!(trace.contains("hello from bioscript")); +} + +#[test] +fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { + let root = repo_root(); + let script = root.join("rust/bioscript-cli/tests/fixtures/batch_lookup.py"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(script) .output() - .unwrap() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("AG")); + assert!(stdout.contains("TC")); + assert!(stdout.contains("II")); } -fn stderr_text(output: &Output) -> String { - String::from_utf8_lossy(&output.stderr).into_owned() +#[test] +fn lookup_variant_details_returns_counts_and_decision_fields() { + let root = repo_root(); + let script = root.join("rust/bioscript-cli/tests/fixtures/lookup_details.py"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(script) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("VariantObservation")); + 
assert!(stdout.contains("genotype='AG'")); + assert!(stdout.contains("raw_counts={")); + assert!(stdout.contains("decision=")); + assert!(stdout.contains("evidence=[")); } -#[path = "cli/args.rs"] -mod args; -#[path = "cli/manifests.rs"] -mod manifests; -#[path = "cli/runtime.rs"] -mod runtime; -#[path = "cli/subcommands.rs"] -mod subcommands; +#[test] +fn inspect_subcommand_reports_detected_vendor_and_platform() { + let root = repo_root(); + let path = root.join("rust/bioscript-formats/tests/fixtures/ancestrydna_v2_sample.txt"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("inspect") + .arg(path) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("kind\tgenotype_text")); + assert!(stdout.contains("vendor\tAncestryDNA")); + assert!(stdout.contains("platform_version\tV2.0")); + assert!(stdout.contains("assembly\tgrch37")); + assert!(stdout.contains("duration_ms\t")); +} + +#[test] +fn variant_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("variant-manifest"); + let manifest = dir.join("rs1.yaml"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&manifest) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("kind\tname\tpath")); + 
assert!(stdout.contains("example-rs73885319")); + assert!(stdout.contains("AG")); +} + +#[test] +fn panel_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("panel-manifest"); + let variants_dir = dir.join("variants"); + fs::create_dir_all(&variants_dir).unwrap(); + fs::write( + variants_dir.join("rs73885319.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + fs::write( + variants_dir.join("rs60910145.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs60910145" +tags: + - "type:trait" +identifiers: + rsids: + - "rs60910145" +coordinates: + grch38: + chrom: "22" + pos: 36265988 +alleles: + kind: "snv" + ref: "T" + alts: + - "G" +"#, + ) + .unwrap(); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "example-panel" +tags: + - "type:trait" +members: + - kind: "variant" + path: "variants/rs73885319.yaml" + version: "1.0" + - kind: "variant" + path: "variants/rs60910145.yaml" + version: "1.0" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--filter") + .arg("name=rs73885319") + .arg(&panel) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("example-rs73885319")); + assert!(!stdout.contains("example-rs60910145")); +} + +#[test] +fn assay_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("assay-manifest"); + fs::write( + dir.join("rs73885319.yaml"), + r#" +schema: "bioscript:variant:1.0" 
+version: "1.0" +name: "example-rs73885319" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + let assay = dir.join("assay.yaml"); + fs::write( + &assay, + r#" +schema: "bioscript:assay:1.0" +version: "1.0" +name: "example-assay" +members: + - kind: "variant" + path: "rs73885319.yaml" + version: "1.0" +interpretations: + - id: "example_status" + kind: "bioscript" + path: "example.py" + derived_from: + - "rs73885319.yaml" + emits: + - key: "example_status" + label: "Example status" + value_type: "string" + format: "badge" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&assay) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("example-rs73885319")); + assert!(stdout.contains("AG")); +} diff --git a/rust/bioscript-formats/src/genotype/delimited.rs b/rust/bioscript-formats/src/genotype/delimited.rs index c4b8e81..a4240a8 100644 --- a/rust/bioscript-formats/src/genotype/delimited.rs +++ b/rust/bioscript-formats/src/genotype/delimited.rs @@ -40,6 +40,7 @@ pub(crate) struct ParsedDelimitedRow { pub(crate) chrom: Option, pub(crate) position: Option, pub(crate) genotype: String, + pub(crate) raw_line: String, } #[derive(Debug, Clone, Copy)] @@ -183,6 +184,7 @@ impl RowParser { chrom, position, genotype: normalize_genotype(&genotype), + raw_line: sanitize_evidence_line(line), })) } @@ -249,6 +251,17 @@ pub(crate) fn strip_inline_comment(value: &str) -> String { value.trim().to_owned() } +pub(crate) fn sanitize_evidence_line(line: &str) -> String { + line.trim_end_matches(['\n', '\r']) + .chars() + .map(|ch| match ch { + '\t' => " ".to_owned(), + ch if ch.is_control() => " 
".to_owned(), + ch => ch.to_string(), + }) + .collect::() +} + pub(crate) fn split_csv_line(line: &str) -> Vec { let mut fields = Vec::new(); let mut current = String::new(); @@ -366,6 +379,7 @@ pub(crate) fn parse_streaming_row( chrom, position, genotype: normalize_genotype(&genotype), + raw_line: sanitize_evidence_line(line), })) } diff --git a/rust/bioscript-formats/src/genotype/delimited/scan.rs b/rust/bioscript-formats/src/genotype/delimited/scan.rs index 5f51fd4..2bbfbd2 100644 --- a/rust/bioscript-formats/src/genotype/delimited/scan.rs +++ b/rust/bioscript-formats/src/genotype/delimited/scan.rs @@ -77,7 +77,10 @@ pub(crate) fn scan_delimited_variants( backend: backend.backend_name().to_owned(), matched_rsid: Some(rsid.clone()), genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], + evidence: vec![ + format!("resolved by rsid {rsid}"), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; unresolved = unresolved.saturating_sub(1); @@ -101,7 +104,10 @@ pub(crate) fn scan_delimited_variants( backend: backend.backend_name().to_owned(), matched_rsid: row.rsid.clone(), genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by locus {}:{}", chrom, position)], + evidence: vec![ + format!("resolved by locus {}:{}", chrom, position), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; unresolved = unresolved.saturating_sub(1); diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index 288c8cf..8ef2fcb 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -30,6 +30,7 @@ pub(crate) struct ParsedVcfRow { pub(crate) reference: String, pub(crate) alternates: Vec, pub(crate) genotype: String, + pub(crate) raw_line: String, } pub(crate) fn scan_vcf_variants( @@ -272,7 +273,10 @@ fn resolve_vcf_row( matched_rsid: Some(rsid.clone()), assembly: 
targets.detected_assembly, genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], + evidence: vec![ + format!("resolved by rsid {rsid}"), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; *unresolved = (*unresolved).saturating_sub(1); @@ -299,7 +303,10 @@ fn resolve_vcf_row( matched_rsid: row.rsid.clone(), assembly: targets.detected_assembly, genotype: Some(row.genotype.clone()), - evidence: vec![format!("resolved by locus {}:{}", row.chrom, row.position)], + evidence: vec![ + format!("resolved by locus {}:{}", row.chrom, row.position), + format!("source line: {}", row.raw_line), + ], ..VariantObservation::default() }; *unresolved = (*unresolved).saturating_sub(1); @@ -355,9 +362,21 @@ pub(crate) fn parse_vcf_record(line: &str) -> Result, Runti reference: reference.to_owned(), alternates, genotype, + raw_line: sanitize_evidence_line(line), })) } +fn sanitize_evidence_line(line: &str) -> String { + line.trim_end_matches(['\n', '\r']) + .chars() + .map(|ch| match ch { + '\t' => " ".to_owned(), + ch if ch.is_control() => " ".to_owned(), + ch => ch.to_string(), + }) + .collect::() +} + pub(crate) fn extract_vcf_sample_genotype( format_field: &str, sample_field: &str, diff --git a/rust/bioscript-formats/tests/file_formats/delimited.rs b/rust/bioscript-formats/tests/file_formats/delimited.rs index 350672e..77448d8 100644 --- a/rust/bioscript-formats/tests/file_formats/delimited.rs +++ b/rust/bioscript-formats/tests/file_formats/delimited.rs @@ -30,9 +30,14 @@ fn delimited_parser_handles_comments_blank_lines_csv_and_split_alleles() { }) .unwrap(); assert_eq!(observation.genotype.as_deref(), Some("AG")); - assert_eq!( - observation.evidence, - vec!["resolved by locus chr22:36265860".to_owned()] + assert_eq!(observation.evidence[0], "resolved by locus chr22:36265860"); + assert!( + observation + .evidence + .get(1) + .is_some_and(|line| line.contains("source line: rs73885319,chr22,36265860")), + "{:?}", 
+ observation.evidence ); } @@ -96,8 +101,13 @@ fn delimited_parser_handles_space_delimited_rows_without_headers_and_inline_comm }) .unwrap(); assert_eq!(observation.genotype.as_deref(), Some("AA")); - assert_eq!( - observation.evidence, - vec!["resolved by locus chr2:201".to_owned()] + assert_eq!(observation.evidence[0], "resolved by locus chr2:201"); + assert!( + observation + .evidence + .get(1) + .is_some_and(|line| line.contains("source line: chrOnly chr2 201")), + "{:?}", + observation.evidence ); } diff --git a/rust/bioscript-formats/tests/file_formats/vcf.rs b/rust/bioscript-formats/tests/file_formats/vcf.rs index 0496b82..fe16072 100644 --- a/rust/bioscript-formats/tests/file_formats/vcf.rs +++ b/rust/bioscript-formats/tests/file_formats/vcf.rs @@ -31,9 +31,11 @@ fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { assert_eq!(observation.genotype.as_deref(), Some("GC")); assert_eq!(observation.assembly, Some(bioscript_core::Assembly::Grch38)); - assert_eq!( - observation.evidence, - vec!["resolved by locus chr1:1000".to_owned()] + assert_eq!(observation.evidence[0], "resolved by locus chr1:1000"); + assert!( + observation.evidence[1].contains("source line: chr1 1000"), + "{:?}", + observation.evidence ); } @@ -69,7 +71,12 @@ fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { .unwrap(); assert_eq!(deletion.genotype.as_deref(), Some("ID")); assert_eq!(deletion.assembly, Some(bioscript_core::Assembly::Grch37)); - assert_eq!(deletion.evidence, vec!["resolved by locus 1:99".to_owned()]); + assert_eq!(deletion.evidence[0], "resolved by locus 1:99"); + assert!( + deletion.evidence[1].contains("source line: 1 99"), + "{:?}", + deletion.evidence + ); let insertion = store .lookup_variant(&VariantSpec { @@ -85,9 +92,11 @@ fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { }) .unwrap(); assert_eq!(insertion.genotype.as_deref(), Some("DI")); - assert_eq!( - insertion.evidence, - vec!["resolved 
by locus chr1:199".to_owned()] + assert_eq!(insertion.evidence[0], "resolved by locus chr1:199"); + assert!( + insertion.evidence[1].contains("source line: chr1 199"), + "{:?}", + insertion.evidence ); let unresolved = store diff --git a/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs index 0c40663..825c72d 100644 --- a/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs +++ b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs @@ -35,14 +35,24 @@ fn batch_lookup_preserves_input_order_after_coordinate_sorting() { .unwrap(); assert_eq!(results[0].genotype.as_deref(), Some("CT")); - assert_eq!( - results[0].evidence, - vec!["resolved by locus 1:20".to_owned()] + assert_eq!(results[0].evidence[0], "resolved by locus 1:20"); + assert!( + results[0] + .evidence + .get(1) + .is_some_and(|line| line.contains("source line: rs2 1 20 CT")), + "{:?}", + results[0].evidence ); assert_eq!(results[1].genotype.as_deref(), Some("AG")); - assert_eq!( - results[1].evidence, - vec!["resolved by locus 1:10".to_owned()] + assert_eq!(results[1].evidence[0], "resolved by locus 1:10"); + assert!( + results[1] + .evidence + .get(1) + .is_some_and(|line| line.contains("source line: rs1 1 10 AG")), + "{:?}", + results[1].evidence ); } diff --git a/rust/bioscript-schema/src/lib.rs b/rust/bioscript-schema/src/lib.rs index 174a358..3da1f4c 100644 --- a/rust/bioscript-schema/src/lib.rs +++ b/rust/bioscript-schema/src/lib.rs @@ -5,8 +5,9 @@ pub use remote_resource::{ RemoteDependency, RemoteResourceKind, RemoteResourceResolution, resolve_remote_resource_text, }; pub use validator::{ - Download, FileReport, Issue, PanelManifest, PanelMember, Permissions, Severity, - ValidationReport, VariantManifest, load_panel_manifest, load_variant_manifest, - load_variant_manifest_text, load_variant_manifest_text_for_lookup, validate_panels_path, - validate_variants_path, + AssayManifest, Download, FileReport, 
Issue, PanelInterpretation, PanelInterpretationLogic, + PanelInterpretationLogicSource, PanelManifest, PanelMember, Permissions, Severity, + ValidationReport, VariantManifest, load_assay_manifest, load_panel_manifest, + load_variant_manifest, load_variant_manifest_text, load_variant_manifest_text_for_lookup, + validate_assays_path, validate_panels_path, validate_variants_path, }; diff --git a/rust/bioscript-schema/src/validator.rs b/rust/bioscript-schema/src/validator.rs index 3e6e7f8..0bae7f6 100644 --- a/rust/bioscript-schema/src/validator.rs +++ b/rust/bioscript-schema/src/validator.rs @@ -1,359 +1,10 @@ -use std::{ - fmt::{self, Write as _}, - path::{Path, PathBuf}, -}; - -use bioscript_core::VariantSpec; -use serde_yaml::Value; - -mod common; -mod panel; -mod spec; -mod variant; - -use common::{ - collect_yaml_files, load_yaml, render_single_manifest_errors, required_non_empty_string, - scalar_at, seq_of_strings, validate_schema_and_identity, -}; -use panel::{parse_downloads, parse_panel_members, validate_panel_root}; -use spec::variant_spec_from_root; -use variant::{ - validate_alleles, validate_coordinates, validate_identifiers, validate_variant_root, -}; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Severity { - Error, - Warning, -} - -impl fmt::Display for Severity { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Error => f.write_str("error"), - Self::Warning => f.write_str("warning"), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Issue { - pub severity: Severity, - pub path: String, - pub message: String, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct FileReport { - pub file: PathBuf, - pub issues: Vec, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct ValidationReport { - pub files_scanned: usize, - pub reports: Vec, -} - -impl ValidationReport { - #[must_use] - pub fn total_issues(&self) -> usize { - self.reports.iter().map(|report| report.issues.len()).sum() - } - - 
#[must_use] - pub fn total_errors(&self) -> usize { - self.reports - .iter() - .flat_map(|report| &report.issues) - .filter(|issue| issue.severity == Severity::Error) - .count() - } - - #[must_use] - pub fn total_warnings(&self) -> usize { - self.reports - .iter() - .flat_map(|report| &report.issues) - .filter(|issue| issue.severity == Severity::Warning) - .count() - } - - #[must_use] - pub fn has_errors(&self) -> bool { - self.total_errors() > 0 - } - - #[must_use] - pub fn render_text(&self) -> String { - let mut out = String::new(); - let _ = write!( - out, - "files_scanned: {}\nerrors: {}\nwarnings: {}\n", - self.files_scanned, - self.total_errors(), - self.total_warnings() - ); - for report in &self.reports { - out.push('\n'); - let _ = writeln!(out, "file: {}", report.file.display()); - for issue in &report.issues { - let _ = writeln!( - out, - " - [{}] {}: {}", - issue.severity, issue.path, issue.message - ); - } - } - out - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct VariantManifest { - pub path: PathBuf, - pub name: String, - pub tags: Vec, - pub spec: VariantSpec, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelManifest { - pub path: PathBuf, - pub name: String, - pub tags: Vec, - pub permissions: Permissions, - pub downloads: Vec, - pub members: Vec, -} - -#[derive(Debug, Clone, PartialEq, Eq, Default)] -pub struct Permissions { - pub domains: Vec, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Download { - pub id: String, - pub url: String, - pub origin: String, - pub sha256: String, - pub version: String, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct PanelMember { - pub kind: String, - pub path: Option, - pub download: Option, - pub sha256: Option, - pub version: Option, -} - -/// Validate a variant file or directory of variant files. -/// -/// # Errors -/// -/// Returns an error when the input path cannot be read, traversed, or parsed -/// as YAML. 
-pub fn validate_variants_path(path: &Path) -> Result { - validate_manifest_path(path, ManifestSelector::Variant) -} - -/// Validate a panel file or directory of panel files. -/// -/// # Errors -/// -/// Returns an error when the input path cannot be read, traversed, or parsed -/// as YAML. -pub fn validate_panels_path(path: &Path) -> Result { - validate_manifest_path(path, ManifestSelector::Panel) -} - -/// Load a single variant manifest from YAML. -/// -/// # Errors -/// -/// Returns an error when the file does not parse or is not a valid variant -/// manifest. -pub fn load_variant_manifest(path: &Path) -> Result { - let value = load_yaml(path)?; - variant_manifest_from_root(path, &value) -} - -/// Load a single variant manifest from YAML text. -/// -/// # Errors -/// -/// Returns an error when the text does not parse or is not a valid variant -/// manifest. -pub fn load_variant_manifest_text(name: &str, text: &str) -> Result { - let value: Value = - serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; - variant_manifest_from_root(Path::new(name), &value) -} - -/// Compile a variant manifest from YAML text for lookup execution. -/// -/// This validates the execution-critical fields only: identity, identifiers, -/// coordinates, and alleles. Full manifest validation still reports metadata -/// issues such as missing finding schemas, but those do not block local lookup. -/// -/// # Errors -/// -/// Returns an error when the text does not parse or the execution-critical -/// fields are invalid. 
-pub fn load_variant_manifest_text_for_lookup( - name: &str, - text: &str, -) -> Result { - let value: Value = - serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; - let path = Path::new(name); - let mut issues = Vec::new(); - validate_schema_and_identity( - &value, - "bioscript:variant:1.0", - Some("bioscript:variant"), - &mut issues, - ); - validate_identifiers(&value, &mut issues); - validate_coordinates(&value, &mut issues); - validate_alleles(&value, &mut issues); - if issues.iter().any(|issue| issue.severity == Severity::Error) { - return Err(render_single_manifest_errors(path, &issues)); - } - - Ok(VariantManifest { - path: path.to_path_buf(), - name: required_non_empty_string(&value, &["name"])?, - tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), - spec: variant_spec_from_root(&value)?, - }) -} - -fn variant_manifest_from_root(path: &Path, value: &Value) -> Result { - let mut issues = Vec::new(); - validate_variant_root(value, &mut issues); - if issues.iter().any(|issue| issue.severity == Severity::Error) { - return Err(render_single_manifest_errors(path, &issues)); - } - - Ok(VariantManifest { - path: path.to_path_buf(), - name: required_non_empty_string(value, &["name"])?, - tags: seq_of_strings(value, &["tags"]).unwrap_or_default(), - spec: variant_spec_from_root(value)?, - }) -} - -/// Load a single panel manifest from YAML. -/// -/// # Errors -/// -/// Returns an error when the file does not parse or is not a valid panel -/// manifest. 
-pub fn load_panel_manifest(path: &Path) -> Result { - let value = load_yaml(path)?; - let mut issues = Vec::new(); - validate_panel_root(&value, &mut issues); - if issues.iter().any(|issue| issue.severity == Severity::Error) { - return Err(render_single_manifest_errors(path, &issues)); - } - - let permissions = Permissions { - domains: seq_of_strings(&value, &["permissions", "domains"]).unwrap_or_default(), - }; - let downloads = parse_downloads(&value)?; - let members = parse_panel_members(&value)?; - - Ok(PanelManifest { - path: path.to_path_buf(), - name: required_non_empty_string(&value, &["name"])?, - tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), - permissions, - downloads, - members, - }) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum ManifestSelector { - Variant, - Panel, -} - -fn validate_manifest_path( - path: &Path, - selector: ManifestSelector, -) -> Result { - let files = collect_yaml_files(path)?; - let mut reports = Vec::new(); - for file in &files { - let report = match selector { - ManifestSelector::Variant => validate_variant_file(file)?, - ManifestSelector::Panel => validate_panel_file(file)?, - }; - if !report.issues.is_empty() { - reports.push(report); - } - } - Ok(ValidationReport { - files_scanned: files.len(), - reports, - }) -} - -fn validate_variant_file(path: &Path) -> Result { - let value = load_yaml(path)?; - let Some(schema) = scalar_at(&value, &["schema"]) else { - return Ok(FileReport { - file: path.to_path_buf(), - issues: vec![Issue { - severity: Severity::Error, - path: "schema".to_owned(), - message: "missing schema".to_owned(), - }], - }); - }; - if !schema.contains("variant") { - return Ok(FileReport { - file: path.to_path_buf(), - issues: Vec::new(), - }); - } - - let mut issues = Vec::new(); - validate_variant_root(&value, &mut issues); - Ok(FileReport { - file: path.to_path_buf(), - issues, - }) -} - -fn validate_panel_file(path: &Path) -> Result { - let value = load_yaml(path)?; - let Some(schema) = 
scalar_at(&value, &["schema"]) else { - return Ok(FileReport { - file: path.to_path_buf(), - issues: vec![Issue { - severity: Severity::Error, - path: "schema".to_owned(), - message: "missing schema".to_owned(), - }], - }); - }; - if !schema.contains("panel") { - return Ok(FileReport { - file: path.to_path_buf(), - issues: Vec::new(), - }); - } - - let mut issues = Vec::new(); - validate_panel_root(&value, &mut issues); - Ok(FileReport { - file: path.to_path_buf(), - issues, - }) -} +// Keep validator source files small and grouped by schema responsibility. +// If a file approaches 500 lines, split it by validation domain rather than +// creating arbitrary numbered chunks. +include!("validator_types.rs"); +include!("validator_load.rs"); +include!("validator_roots.rs"); +include!("validator_alleles_findings.rs"); +include!("validator_panel.rs"); +include!("validator_parse.rs"); +include!("validator_helpers.rs"); diff --git a/rust/bioscript-schema/src/validator_alleles.rs b/rust/bioscript-schema/src/validator_alleles.rs new file mode 100644 index 0000000..12b2759 --- /dev/null +++ b/rust/bioscript-schema/src/validator_alleles.rs @@ -0,0 +1,146 @@ +fn validate_alleles(root: &Value, issues: &mut Vec) { + require_path(root, &["alleles"], issues); + require_path(root, &["alleles", "kind"], issues); + require_path(root, &["alleles", "ref"], issues); + require_path(root, &["alleles", "alts"], issues); + + let Some(kind) = scalar_at(root, &["alleles", "kind"]) else { + return; + }; + if !matches!(kind.as_str(), "snv" | "deletion" | "insertion" | "indel") { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.kind".to_owned(), + message: "expected one of snv, deletion, insertion, indel".to_owned(), + }); + } + + if value_at(root, &["alleles", "canonical_alt"]).is_some() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.canonical_alt".to_owned(), + message: "canonical_alt is not part of the current schema".to_owned(), + }); + } + + let 
Some(reference) = scalar_at(root, &["alleles", "ref"]) else { + return; + }; + if reference.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "empty string".to_owned(), + }); + } + + let Some(alts_value) = value_at(root, &["alleles", "alts"]) else { + return; + }; + let Some(alts_seq) = alts_value.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: "expected a non-empty sequence of strings".to_owned(), + }); + return; + }; + if alts_seq.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: "expected at least one alternate allele".to_owned(), + }); + return; + } + + let mut alts = Vec::new(); + for (idx, item) in alts_seq.iter().enumerate() { + let Some(alt) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if alt.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "empty string".to_owned(), + }); + continue; + } + alts.push(alt.to_owned()); + } + let observed_alts = match seq_of_strings(root, &["alleles", "observed_alts"]) { + Some(items) => { + if items.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.observed_alts".to_owned(), + message: "expected a non-empty sequence of strings when present".to_owned(), + }); + } + for alt in &alts { + if !items.iter().any(|item| item == alt) { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.observed_alts".to_owned(), + message: format!("significant alt '{alt}' is not present in observed_alts"), + }); + } + } + items + } + None => alts.clone(), + }; + validate_symbolic_alleles(&reference, &observed_alts, issues); + validate_snv_alleles(&kind, &reference, &observed_alts, issues); +} + +fn 
validate_symbolic_alleles(reference: &str, alts: &[String], issues: &mut Vec) { + if reference == "I" || reference == "D" { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" + .to_owned(), + }); + } + for (idx, alt) in alts.iter().enumerate() { + if alt == "I" || alt == "D" { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: + "symbolic I/D alleles are not allowed in stored YAML; use biological alleles" + .to_owned(), + }); + } + } +} + +fn validate_snv_alleles(kind: &str, reference: &str, alts: &[String], issues: &mut Vec) { + if kind != "snv" { + return; + } + if !is_base_allele(reference) { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "snv ref must be one of A/C/G/T".to_owned(), + }); + } + for (idx, alt) in alts.iter().enumerate() { + if !is_base_allele(alt) { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "snv alt must be one of A/C/G/T".to_owned(), + }); + } + } +} + diff --git a/rust/bioscript-schema/src/validator_alleles_findings.rs b/rust/bioscript-schema/src/validator_alleles_findings.rs new file mode 100644 index 0000000..94368b1 --- /dev/null +++ b/rust/bioscript-schema/src/validator_alleles_findings.rs @@ -0,0 +1,3 @@ +include!("validator_alleles.rs"); +include!("validator_findings.rs"); +include!("validator_resources.rs"); diff --git a/rust/bioscript-schema/src/validator_findings.rs b/rust/bioscript-schema/src/validator_findings.rs new file mode 100644 index 0000000..065c55c --- /dev/null +++ b/rust/bioscript-schema/src/validator_findings.rs @@ -0,0 +1,244 @@ +fn validate_findings(root: &Value, issues: &mut Vec) { + let alts = seq_of_strings(root, &["alleles", "alts"]).unwrap_or_default(); + let Some(findings) = value_at(root, &["findings"]).and_then(Value::as_sequence) else 
{ + return; + }; + + for (idx, finding) in findings.iter().enumerate() { + let Some(mapping) = finding.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + + let Some(schema) = mapping + .get(Value::String("schema".to_owned())) + .and_then(Value::as_str) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "missing schema".to_owned(), + }); + continue; + }; + if schema.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "empty string".to_owned(), + }); + } + if schema == "bioscript:pgx:1.0" { + issues.push(Issue { + severity: Severity::Warning, + path: format!("findings[{idx}].schema"), + message: "legacy PGx finding schema; prefer bioscript:pgx-summary:1.0 or bioscript:pgx-label:1.0".to_owned(), + }); + } + if let Some(alt) = mapping + .get(Value::String("alt".to_owned())) + .and_then(Value::as_str) + && !alts.is_empty() + && alt != "*" + && !alts.iter().any(|item| item == alt) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].alt"), + message: format!("finding alt '{alt}' is not present in alleles.alts {alts:?}"), + }); + } + let has_summary = mapping + .get(Value::String("summary".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + let has_notes = mapping + .get(Value::String("notes".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + if !has_summary && !has_notes { + issues.push(Issue { + severity: Severity::Warning, + path: format!("findings[{idx}]"), + message: "finding has neither summary nor notes".to_owned(), + }); + } + validate_finding_binding(&format!("findings[{idx}]"), mapping, issues); + validate_finding_effects(idx, mapping, issues); + } +} + +fn validate_finding_effects(idx: usize, 
mapping: &Mapping, issues: &mut Vec) { + let Some(effects) = mapping.get(Value::String("effects".to_owned())) else { + return; + }; + let Some(effects) = effects.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].effects"), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (effect_idx, effect) in effects.iter().enumerate() { + let Some(effect) = effect.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].effects[{effect_idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_finding_binding( + &format!("findings[{idx}].effects[{effect_idx}]"), + effect, + issues, + ); + } +} + +fn validate_finding_binding(parent: &str, mapping: &Mapping, issues: &mut Vec) { + let Some(binding) = mapping.get(Value::String("binding".to_owned())) else { + return; + }; + let Some(binding) = binding.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_required_mapping_string(binding, "source", &format!("{parent}.binding"), issues); + validate_finding_binding_source(parent, binding, issues); + validate_finding_binding_operator(parent, binding, issues); +} + +fn validate_finding_binding_source(parent: &str, binding: &Mapping, issues: &mut Vec) { + let source = binding + .get(Value::String("source".to_owned())) + .and_then(Value::as_str); + match source { + Some("variant") + if !binding.contains_key(Value::String("variant".to_owned())) + && !binding.contains_key(Value::String("path".to_owned())) => + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.variant"), + message: "variant findings require variant or path".to_owned(), + }); + } + Some("variant") | None => {} + Some("analysis") => { + validate_required_mapping_string(binding, "key", 
&format!("{parent}.binding"), issues); + validate_required_mapping_string( + binding, + "analysis_id", + &format!("{parent}.binding"), + issues, + ); + } + Some(other) => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.source"), + message: format!("unsupported source '{other}'"), + }), + } +} + +fn validate_finding_binding_operator(parent: &str, binding: &Mapping, issues: &mut Vec) { + let operator = binding + .get(Value::String("operator".to_owned())) + .and_then(Value::as_str) + .unwrap_or("equals"); + match operator { + "equals" => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + if !binding.contains_key(Value::String("value".to_owned())) { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.value"), + message: "equals requires value".to_owned(), + }); + } + } + "in" => { + validate_required_mapping_string(binding, "key", &format!("{parent}.binding"), issues); + let values = binding + .get(Value::String("values".to_owned())) + .and_then(Value::as_sequence); + if values.is_none_or(Vec::is_empty) { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.values"), + message: "in requires non-empty values".to_owned(), + }); + } + } + "dosage_equals" => { + if binding + .get(Value::String("allele".to_owned())) + .and_then(Value::as_str) + .is_none_or(|value| value.trim().is_empty()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.allele"), + message: "dosage_equals requires allele".to_owned(), + }); + } + if binding + .get(Value::String("value".to_owned())) + .and_then(Value::as_i64) + .is_none_or(|value| !(0..=2).contains(&value)) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.value"), + message: "dosage_equals requires integer value 0, 1, or 2".to_owned(), + }); + } + } + "dosage_in" => { + if binding + .get(Value::String("allele".to_owned())) 
+ .and_then(Value::as_str) + .is_none_or(|value| value.trim().is_empty()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.allele"), + message: "dosage_in requires allele".to_owned(), + }); + } + let values = binding + .get(Value::String("values".to_owned())) + .and_then(Value::as_sequence); + let invalid_values = match values { + Some(items) if !items.is_empty() => items + .iter() + .any(|value| value.as_i64().is_none_or(|n| !(0..=2).contains(&n))), + _ => true, + }; + if invalid_values { + issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.values"), + message: "dosage_in requires integer values from 0 to 2".to_owned(), + }); + } + } + other => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.binding.operator"), + message: format!( + "unsupported operator '{other}'; expected 'equals', 'in', 'dosage_equals', or 'dosage_in'" + ), + }), + } +} diff --git a/rust/bioscript-schema/src/validator_helpers.rs b/rust/bioscript-schema/src/validator_helpers.rs new file mode 100644 index 0000000..c3d8492 --- /dev/null +++ b/rust/bioscript-schema/src/validator_helpers.rs @@ -0,0 +1,161 @@ +fn validate_url_string( + value: &str, + path: &str, + require_origin_only: bool, + issues: &mut Vec, +) { + let normalized = if require_origin_only { + normalize_origin(value) + } else { + normalize_download_url(value) + }; + if let Err(message) = normalized { + issues.push(Issue { + severity: Severity::Error, + path: path.to_owned(), + message, + }); + } +} + +fn normalize_origin(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https origin".to_owned()); + } + if url.host_str().is_none() { + return Err("origin is missing host".to_owned()); + } + if url.path() != "/" || url.query().is_some() || url.fragment().is_some() { + return Err("expected origin only, without 
path, query, or fragment".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() { + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +fn normalize_download_url(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https URL".to_owned()); + } + if url.host_str().is_none() { + return Err("URL is missing host".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() { + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +fn is_allowed_chromosome(value: &str) -> bool { + matches!(value, "X" | "Y" | "MT") + || value + .parse::() + .is_ok_and(|chrom| (1..=22).contains(&chrom)) +} + +fn is_base_allele(value: &str) -> bool { + matches!(value, "A" | "C" | "G" | "T") +} + +fn is_rsid(value: &str) -> bool { + value.starts_with("rs") && value[2..].chars().all(|ch| ch.is_ascii_digit()) +} + +fn is_sha256(value: &str) -> bool { + value.len() == 64 + && value + .chars() + .all(|ch| ch.is_ascii_hexdigit() && !ch.is_ascii_uppercase()) +} + +fn i64_at_mapping(mapping: &Mapping, key: &str) -> Option { + mapping + .get(Value::String(key.to_owned())) + .and_then(Value::as_i64) +} + +fn required_non_empty_string(root: &Value, path: &[&str]) -> Result { + scalar_at(root, path) + .filter(|value| !value.trim().is_empty()) + .ok_or_else(|| format!("{} missing or empty", path.join("."))) +} + +fn render_single_manifest_errors(path: &Path, issues: &[Issue]) -> String { + let mut out = format!("invalid manifest {}:\n", path.display()); + for issue in issues { + let _ = writeln!( + out, + " - [{}] {}: {}", + issue.severity, issue.path, issue.message + ); + } + out +} + +fn load_yaml(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: 
{err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + +fn require_const(root: &Value, path: &[&str], expected: &str, issues: &mut Vec) { + match scalar_at(root, path) { + Some(actual) if actual == expected => {} + Some(actual) => issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: format!("expected '{expected}', found '{actual}'"), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: "missing required field".to_owned(), + }), + } +} + +fn require_path(root: &Value, path: &[&str], issues: &mut Vec) { + if value_at(root, path).is_none() { + issues.push(Issue { + severity: Severity::Error, + path: path.join("."), + message: "missing required field".to_owned(), + }); + } +} + +fn value_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Value> { + let mut current = root; + for key in path { + let mapping = current.as_mapping()?; + current = mapping.get(Value::String((*key).to_owned()))?; + } + Some(current) +} + +fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Mapping> { + value_at(root, path)?.as_mapping() +} + +fn scalar_at(root: &Value, path: &[&str]) -> Option { + value_at(root, path).and_then(|value| match value { + Value::String(text) => Some(text.clone()), + Value::Number(number) => Some(number.to_string()), + _ => None, + }) +} + +fn seq_of_strings(root: &Value, path: &[&str]) -> Option> { + value_at(root, path)?.as_sequence().map(|items| { + items + .iter() + .filter_map(|item| item.as_str().map(ToOwned::to_owned)) + .collect() + }) +} diff --git a/rust/bioscript-schema/src/validator_load.rs b/rust/bioscript-schema/src/validator_load.rs new file mode 100644 index 0000000..f3532ca --- /dev/null +++ b/rust/bioscript-schema/src/validator_load.rs @@ -0,0 +1,347 @@ +/// Validate a variant file or directory of variant files. 
+/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +pub fn validate_variants_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Variant) +} + +/// Validate a panel file or directory of panel files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +pub fn validate_panels_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Panel) +} + +/// Validate an assay file or directory of assay files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +pub fn validate_assays_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Assay) +} + +/// Load a variant manifest from a YAML file. +/// +/// # Errors +/// +/// Returns an error when the file cannot be read, parsed, or converted into a +/// valid variant manifest shape. +pub fn load_variant_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + variant_manifest_from_root(path, &value) +} + +/// Load a variant manifest from YAML text. +/// +/// # Errors +/// +/// Returns an error when the text cannot be parsed or converted into a valid +/// variant manifest shape.
+pub fn load_variant_manifest_text(name: &str, text: &str) -> Result { + let value: Value = + serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; + variant_manifest_from_root(Path::new(name), &value) +} + +/// Compile a variant manifest from YAML text for lookup execution. +/// +/// This validates the execution-critical fields only: identity, identifiers, +/// coordinates, and alleles. Full manifest validation still reports metadata +/// issues such as missing finding schemas, but those do not block local lookup. +/// +/// # Errors +/// +/// Returns an error when the text does not parse or the execution-critical +/// fields are invalid. +pub fn load_variant_manifest_text_for_lookup( + name: &str, + text: &str, +) -> Result { + let value: Value = + serde_yaml::from_str(text).map_err(|err| format!("failed to parse YAML {name}: {err}"))?; + let path = Path::new(name); + let mut issues = Vec::new(); + validate_schema_and_identity( + &value, + "bioscript:variant:1.0", + Some("bioscript:variant"), + &mut issues, + ); + validate_identifiers(&value, &mut issues); + validate_coordinates(&value, &mut issues); + validate_alleles(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(VariantManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + spec: variant_spec_from_root(&value)?, + }) +} + +fn variant_manifest_from_root(path: &Path, value: &Value) -> Result { + let mut issues = Vec::new(); + validate_variant_root(value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(VariantManifest { + path: path.to_path_buf(), + name: required_non_empty_string(value, &["name"])?, + tags: seq_of_strings(value, &["tags"]).unwrap_or_default(), + 
 spec: variant_spec_from_root(value)?, + }) +} + +/// Load a panel manifest from a YAML file. +/// +/// # Errors +/// +/// Returns an error when the file cannot be read, parsed, or converted into a +/// valid panel manifest shape. +pub fn load_panel_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + let mut issues = Vec::new(); + validate_panel_root(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + let permissions = Permissions { + domains: seq_of_strings(&value, &["permissions", "domains"]).unwrap_or_default(), + }; + let downloads = parse_downloads(&value)?; + let members = parse_panel_members(&value)?; + let interpretations = parse_panel_interpretations(&value)?; + + Ok(PanelManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + permissions, + downloads, + members, + interpretations, + }) +} + +/// Load an assay manifest from a YAML file. +/// +/// # Errors +/// +/// Returns an error when the file cannot be read, parsed, or converted into a +/// valid assay manifest shape.
+pub fn load_assay_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + let mut issues = Vec::new(); + validate_assay_root(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(AssayManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + members: parse_panel_members(&value)?, + interpretations: parse_panel_interpretations(&value)?, + }) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ManifestSelector { + Assay, + Variant, + Panel, +} + +fn validate_manifest_path( + path: &Path, + selector: ManifestSelector, +) -> Result { + let files = collect_yaml_files(path)?; + let mut reports = Vec::new(); + for file in &files { + let report = match selector { + ManifestSelector::Assay => validate_assay_file(file)?, + ManifestSelector::Variant => validate_variant_file(file)?, + ManifestSelector::Panel => validate_panel_file(file)?, + }; + if !report.issues.is_empty() { + reports.push(report); + } + } + Ok(ValidationReport { + files_scanned: files.len(), + reports, + }) +} + +fn collect_yaml_files(path: &Path) -> Result, String> { + if path.is_file() { + return Ok(vec![path.to_path_buf()]); + } + + let mut files = Vec::new(); + collect_yaml_files_recursive(path, &mut files)?; + files.sort(); + Ok(files) +} + +fn collect_yaml_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { + let entries = fs::read_dir(path) + .map_err(|err| format!("failed to read directory {}: {err}", path.display()))?; + for entry in entries { + let entry = entry.map_err(|err| format!("failed to read directory entry: {err}"))?; + let entry_path = entry.path(); + if entry_path.is_dir() { + collect_yaml_files_recursive(&entry_path, files)?; + continue; + } + if entry_path.extension().is_some_and(|extension| { + ["yaml", "yml"] + .iter() + .any(|item| 
extension.eq_ignore_ascii_case(item)) + }) { + files.push(entry_path); + } + } + Ok(()) +} + +fn validate_assay_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("assay") { + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } + + let mut issues = Vec::new(); + validate_assay_root(&value, &mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} + +fn validate_variant_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("variant") { + if schema == "bioscript:pgx-findings:1.0" { + let mut issues = Vec::new(); + validate_pgx_findings_root(&value, &mut issues); + return Ok(FileReport { + file: path.to_path_buf(), + issues, + }); + } + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } + + let mut issues = Vec::new(); + validate_variant_root(&value, &mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} + +fn validate_panel_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("panel") { + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } + + let mut issues = Vec::new(); + validate_panel_root(&value, 
&mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} diff --git a/rust/bioscript-schema/src/validator_panel.rs b/rust/bioscript-schema/src/validator_panel.rs new file mode 100644 index 0000000..4ef3c43 --- /dev/null +++ b/rust/bioscript-schema/src/validator_panel.rs @@ -0,0 +1,450 @@ +fn validate_panel_members(root: &Value, allowed_kinds: &[&str], issues: &mut Vec) { + let Some(members) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "missing required field".to_owned(), + }); + return; + }; + if members.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "expected at least one member".to_owned(), + }); + return; + } + + let download_ids = panel_download_ids(root); + + for (idx, item) in members.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_panel_member(idx, mapping, allowed_kinds, &download_ids, issues); + } +} + +fn panel_download_ids(root: &Value) -> BTreeSet { + value_at(root, &["downloads"]) + .and_then(Value::as_sequence) + .into_iter() + .flatten() + .filter_map(|item| { + item.as_mapping()? 
+ .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .collect() +} + +fn validate_panel_member( + idx: usize, + mapping: &Mapping, + allowed_kinds: &[&str], + download_ids: &BTreeSet, + issues: &mut Vec, +) { + let kind = mapping + .get(Value::String("kind".to_owned())) + .and_then(Value::as_str); + match kind { + Some(kind) if allowed_kinds.contains(&kind) => {} + Some(other) => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: format!("unsupported member kind '{other}'"), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: "missing required field".to_owned(), + }), + } + + let path_value = mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str); + let download_value = mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str); + if path_value.is_some() == download_value.is_some() { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected exactly one of path or download".to_owned(), + }); + } + validate_panel_member_path(idx, path_value, issues); + validate_panel_member_download(idx, download_value, download_ids, issues); + validate_panel_member_metadata(idx, mapping, issues); +} + +fn validate_panel_member_path(idx: usize, path_value: Option<&str>, issues: &mut Vec) { + if let Some(path) = path_value + && path.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].path"), + message: "empty string".to_owned(), + }); + } +} + +fn validate_panel_member_download( + idx: usize, + download_value: Option<&str>, + download_ids: &BTreeSet, + issues: &mut Vec, +) { + let Some(download) = download_value else { + return; + }; + if download.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: "empty 
string".to_owned(), + }); + } else if !download_ids.contains(download) { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: format!("unknown download id '{download}'"), + }); + } +} + +fn validate_panel_member_metadata(idx: usize, mapping: &Mapping, issues: &mut Vec) { + if let Some(version) = mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + && version.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].version"), + message: "empty string".to_owned(), + }); + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } +} + +fn validate_panel_interpretations(root: &Value, issues: &mut Vec) { + if value_at(root, &["analyses"]).is_some() && value_at(root, &["interpretations"]).is_some() { + issues.push(Issue { + severity: Severity::Warning, + path: "interpretations".to_owned(), + message: "use analyses instead of interpretations; do not define both".to_owned(), + }); + } + let key = if value_at(root, &["analyses"]).is_some() { + "analyses" + } else { + "interpretations" + }; + let Some(items) = value_at(root, &[key]) else { + return; + }; + let Some(items) = items.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: key.to_owned(), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_panel_interpretation(key, idx, mapping, issues); + } +} + +fn validate_panel_interpretation( + key: &str, + idx: usize, + 
mapping: &Mapping, + issues: &mut Vec, +) { + for field in ["id", "kind", "path"] { + validate_required_mapping_string(mapping, field, &format!("{key}[{idx}]"), issues); + } + if let Some(kind) = mapping + .get(Value::String("kind".to_owned())) + .and_then(Value::as_str) + && kind != "bioscript" + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].kind"), + message: "expected 'bioscript'".to_owned(), + }); + } + if let Some(output_format) = mapping + .get(Value::String("output_format".to_owned())) + .and_then(Value::as_str) + && !matches!(output_format, "tsv" | "json" | "jsonl") + { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].output_format"), + message: "expected 'tsv', 'json', or 'jsonl'".to_owned(), + }); + } + let Some(derived_from) = mapping + .get(Value::String("derived_from".to_owned())) + .and_then(Value::as_sequence) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from"), + message: "expected a non-empty sequence of strings".to_owned(), + }); + return; + }; + if derived_from.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from"), + message: "expected at least one source variant".to_owned(), + }); + } + for (source_idx, source) in derived_from.iter().enumerate() { + match source.as_str() { + Some(value) if !value.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from[{source_idx}]"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].derived_from[{source_idx}]"), + message: "expected string".to_owned(), + }), + } + } + validate_panel_interpretation_logic(key, idx, mapping, issues); + validate_panel_interpretation_emits(key, idx, mapping, issues); +} + +fn validate_panel_interpretation_logic( + key: &str, + idx: usize, + mapping: 
&Mapping, + issues: &mut Vec, +) { + let Some(logic) = mapping.get(Value::String("logic".to_owned())) else { + return; + }; + let Some(logic) = logic.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].logic"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_optional_mapping_string(logic, "description", &format!("{key}[{idx}].logic"), issues); + let Some(source) = logic.get(Value::String("source".to_owned())) else { + return; + }; + let Some(source) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].logic.source"), + message: "expected mapping".to_owned(), + }); + return; + }; + validate_optional_mapping_string( + source, + "name", + &format!("{key}[{idx}].logic.source"), + issues, + ); + validate_optional_mapping_string(source, "url", &format!("{key}[{idx}].logic.source"), issues); + if let Some(url) = source + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + validate_url_string( + url, + &format!("{key}[{idx}].logic.source.url"), + false, + issues, + ); + } +} + +fn validate_panel_interpretation_emits( + key: &str, + idx: usize, + mapping: &Mapping, + issues: &mut Vec, +) { + let Some(emits) = mapping.get(Value::String("emits".to_owned())) else { + return; + }; + let Some(emits) = emits.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].emits"), + message: "expected a sequence of mappings".to_owned(), + }); + return; + }; + for (emit_idx, item) in emits.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("{key}[{idx}].emits[{emit_idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + validate_required_mapping_string( + mapping, + "key", + &format!("{key}[{idx}].emits[{emit_idx}]"), + issues, + ); + for field in ["label", "value_type", "format"] { + 
validate_optional_mapping_string( + mapping, + field, + &format!("{key}[{idx}].emits[{emit_idx}]"), + issues, + ); + } + } +} + +fn validate_required_mapping_string( + mapping: &Mapping, + field: &str, + parent: &str, + issues: &mut Vec, +) { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(value) if !value.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "missing required field".to_owned(), + }), + } +} + +fn validate_optional_mapping_string( + mapping: &Mapping, + field: &str, + parent: &str, + issues: &mut Vec, +) { + if let Some(value) = mapping.get(Value::String(field.to_owned())) { + match value.as_str() { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Warning, + path: format!("{parent}.{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("{parent}.{field}"), + message: "expected string".to_owned(), + }), + } + } +} + +fn variant_spec_from_root(root: &Value) -> Result { + let rsids = seq_of_strings(root, &["identifiers", "rsids"]).unwrap_or_default(); + let grch37 = locus_from_root(root, "grch37")?; + let grch38 = locus_from_root(root, "grch38")?; + let reference = scalar_at(root, &["alleles", "ref"]); + let alternate = seq_of_strings(root, &["alleles", "observed_alts"]) + .or_else(|| seq_of_strings(root, &["alleles", "alts"])) + .and_then(|alts| alts.first().cloned()); + let deletion_length = value_at(root, &["alleles", "deletion_length"]) + .and_then(Value::as_u64) + .and_then(|value| usize::try_from(value).ok()); + let motifs = seq_of_strings(root, &["alleles", "motifs"]).unwrap_or_default(); + let kind = scalar_at(root, &["alleles", "kind"]).map(|kind| match 
kind.as_str() { + "snv" => VariantKind::Snp, + "deletion" => VariantKind::Deletion, + "insertion" => VariantKind::Insertion, + "indel" => VariantKind::Indel, + _ => VariantKind::Other, + }); + + Ok(VariantSpec { + rsids, + grch37, + grch38, + reference, + alternate, + kind, + deletion_length, + motifs, + }) +} + +fn locus_from_root(root: &Value, assembly: &str) -> Result, String> { + let Some(mapping) = mapping_at(root, &["coordinates", assembly]) else { + return Ok(None); + }; + let chrom = mapping + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + .ok_or_else(|| format!("coordinates.{assembly}.chrom missing"))?; + let (start, end) = if let Some(pos) = i64_at_mapping(mapping, "pos") { + (pos, pos) + } else { + let start = i64_at_mapping(mapping, "start") + .ok_or_else(|| format!("coordinates.{assembly}.start missing"))?; + let end = i64_at_mapping(mapping, "end") + .ok_or_else(|| format!("coordinates.{assembly}.end missing"))?; + (start, end) + }; + Ok(Some(GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + })) +} + diff --git a/rust/bioscript-schema/src/validator_parse.rs b/rust/bioscript-schema/src/validator_parse.rs new file mode 100644 index 0000000..f31f28f --- /dev/null +++ b/rust/bioscript-schema/src/validator_parse.rs @@ -0,0 +1,201 @@ +fn parse_downloads(root: &Value) -> Result, String> { + let mut downloads = Vec::new(); + let Some(items) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return Ok(downloads); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("downloads[{idx}] must be a mapping")); + }; + let id = mapping_required_string(mapping, "id", idx, "downloads")?; + let url = mapping_required_string(mapping, "url", idx, "downloads")?; + let sha256 = mapping_required_string(mapping, "sha256", idx, "downloads")?; + let version = mapping_required_string(mapping, "version", idx, "downloads")?; + let origin = 
normalize_download_url(&url)?; + downloads.push(Download { + id, + url, + origin, + sha256, + version, + }); + } + Ok(downloads) +} + +fn parse_panel_members(root: &Value) -> Result, String> { + let mut members = Vec::new(); + let Some(items) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + return Ok(members); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("members[{idx}] must be a mapping")); + }; + members.push(PanelMember { + kind: mapping_required_string(mapping, "kind", idx, "members")?, + path: mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + download: mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + sha256: mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + version: mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }); + } + Ok(members) +} + +fn parse_panel_interpretations(root: &Value) -> Result, String> { + let mut interpretations = Vec::new(); + let key = if value_at(root, &["analyses"]).is_some() { + "analyses" + } else { + "interpretations" + }; + let Some(items) = value_at(root, &[key]).and_then(Value::as_sequence) else { + return Ok(interpretations); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("{key}[{idx}] must be a mapping")); + }; + interpretations.push(PanelInterpretation { + id: mapping_required_string(mapping, "id", idx, key)?, + kind: mapping_required_string(mapping, "kind", idx, key)?, + path: mapping_required_string(mapping, "path", idx, key)?, + output_format: mapping + .get(Value::String("output_format".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + derived_from: mapping_sequence_of_strings(mapping, "derived_from", idx, 
key)?, + emits: parse_panel_interpretation_emits(mapping, idx)?, + logic: parse_panel_interpretation_logic(mapping)?, + }); + } + Ok(interpretations) +} + +fn parse_panel_interpretation_logic( + mapping: &Mapping, +) -> Result, String> { + let Some(logic) = mapping.get(Value::String("logic".to_owned())) else { + return Ok(None); + }; + let Some(logic_mapping) = logic.as_mapping() else { + return Err("analysis logic must be a mapping".to_owned()); + }; + let source = match logic_mapping.get(Value::String("source".to_owned())) { + Some(source) => { + let Some(source_mapping) = source.as_mapping() else { + return Err("analysis logic.source must be a mapping".to_owned()); + }; + Some(PanelInterpretationLogicSource { + name: source_mapping + .get(Value::String("name".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + url: source_mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }) + } + None => None, + }; + Ok(Some(PanelInterpretationLogic { + source, + description: logic_mapping + .get(Value::String("description".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + })) +} + +fn parse_panel_interpretation_emits( + mapping: &Mapping, + interpretation_idx: usize, +) -> Result, String> { + let Some(items) = mapping + .get(Value::String("emits".to_owned())) + .and_then(Value::as_sequence) + else { + return Ok(Vec::new()); + }; + let mut emits = Vec::new(); + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!( + "interpretations[{interpretation_idx}].emits[{idx}] must be a mapping" + )); + }; + emits.push(PanelInterpretationEmit { + key: mapping_required_string( + mapping, + "key", + idx, + &format!("interpretations[{interpretation_idx}].emits"), + )?, + label: mapping + .get(Value::String("label".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + value_type: mapping + 
.get(Value::String("value_type".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + format: mapping + .get(Value::String("format".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }); + } + Ok(emits) +} + +fn mapping_sequence_of_strings( + mapping: &Mapping, + field: &str, + idx: usize, + parent: &str, +) -> Result, String> { + let value = mapping + .get(Value::String(field.to_owned())) + .ok_or_else(|| format!("{parent}[{idx}].{field} is required"))?; + let items = value + .as_sequence() + .ok_or_else(|| format!("{parent}[{idx}].{field} must be a sequence"))?; + items + .iter() + .enumerate() + .map(|(item_idx, item)| { + item.as_str() + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{parent}[{idx}].{field}[{item_idx}] must be a string")) + }) + .collect() +} + +fn mapping_required_string( + mapping: &Mapping, + field: &str, + idx: usize, + parent: &str, +) -> Result { + mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + .filter(|value| !value.trim().is_empty()) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{parent}[{idx}].{field} missing or empty")) +} + diff --git a/rust/bioscript-schema/src/validator_resources.rs b/rust/bioscript-schema/src/validator_resources.rs new file mode 100644 index 0000000..9f5ff22 --- /dev/null +++ b/rust/bioscript-schema/src/validator_resources.rs @@ -0,0 +1,173 @@ +fn validate_provenance(root: &Value, issues: &mut Vec) { + let Some(sources) = value_at(root, &["provenance", "sources"]).and_then(Value::as_sequence) + else { + return; + }; + for (idx, source) in sources.iter().enumerate() { + let Some(mapping) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["kind", "label", "url"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => 
{} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + validate_url_string( + url, + &format!("provenance.sources[{idx}].url"), + false, + issues, + ); + } + } +} + +fn validate_permissions(root: &Value, issues: &mut Vec) { + let Some(domains) = value_at(root, &["permissions", "domains"]) else { + return; + }; + let Some(items) = domains.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "permissions.domains".to_owned(), + message: "expected a sequence of origins".to_owned(), + }); + return; + }; + let mut seen = BTreeSet::new(); + for (idx, item) in items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + match normalize_origin(value) { + Ok(origin) => { + if !seen.insert(origin.clone()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("permissions.domains[{idx}]"), + message: format!("duplicate origin '{origin}'"), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message, + }), + } + } +} + +fn validate_downloads(root: &Value, issues: &mut Vec) { + let allowed_origins: BTreeSet = seq_of_strings(root, &["permissions", "domains"]) + .unwrap_or_default() + .into_iter() + .filter_map(|domain| normalize_origin(&domain).ok()) + .collect(); + let Some(downloads) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return; + }; + let mut ids = BTreeSet::new(); + for (idx, 
item) in downloads.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["id", "url", "sha256", "version"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + + if let Some(id) = mapping + .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + && !ids.insert(id.to_owned()) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].id"), + message: format!("duplicate download id '{id}'"), + }); + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + match normalize_download_url(url) { + Ok(origin) => { + if !allowed_origins.is_empty() && !allowed_origins.contains(&origin) { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message: format!( + "download origin '{origin}' is not listed in permissions.domains" + ), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message, + }), + } + } + } +} + diff --git a/rust/bioscript-schema/src/validator_roots.rs b/rust/bioscript-schema/src/validator_roots.rs new 
file mode 100644 index 0000000..9600eb7 --- /dev/null +++ b/rust/bioscript-schema/src/validator_roots.rs @@ -0,0 +1,334 @@ +fn validate_variant_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity( + root, + "bioscript:variant:1.0", + Some("bioscript:variant"), + issues, + ); + validate_optional_strings(root, &["name", "label", "gene", "summary"], issues); + validate_tags(root, issues); + validate_identifiers(root, issues); + validate_coordinates(root, issues); + validate_alleles(root, issues); + validate_findings(root, issues); + validate_provenance(root, issues); + + let has_identifiers = value_at(root, &["identifiers"]) + .and_then(Value::as_mapping) + .is_some_and(|mapping| !mapping.is_empty()); + let has_coordinates = ["grch37", "grch38"] + .iter() + .any(|assembly| value_at(root, &["coordinates", assembly]).is_some()); + if !has_identifiers && !has_coordinates { + issues.push(Issue { + severity: Severity::Error, + path: "identifiers/coordinates".to_owned(), + message: "expected at least one identifier block or one coordinate block".to_owned(), + }); + } +} + +fn validate_panel_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity(root, "bioscript:panel:1.0", None, issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + validate_tags(root, issues); + validate_permissions(root, issues); + validate_downloads(root, issues); + validate_panel_members(root, &["variant", "assay"], issues); + validate_panel_interpretations(root, issues); + validate_findings(root, issues); +} + +fn validate_assay_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity(root, "bioscript:assay:1.0", None, issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + validate_tags(root, issues); + validate_panel_members(root, &["variant"], issues); + validate_panel_interpretations(root, issues); + validate_findings(root, issues); +} + +fn validate_pgx_findings_root(root: &Value, issues: &mut 
Vec) { + require_const(root, &["schema"], "bioscript:pgx-findings:1.0", issues); + require_const(root, &["version"], "1.0", issues); + validate_optional_strings(root, &["variant", "gene", "rsid", "variant_pa_id"], issues); + if value_at(root, &["variant"]).is_none() && value_at(root, &["rsid"]).is_none() { + issues.push(Issue { + severity: Severity::Error, + path: "variant/rsid".to_owned(), + message: "expected at least one variant identifier".to_owned(), + }); + } + match value_at(root, &["findings"]) { + Some(Value::Sequence(_)) => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: "findings".to_owned(), + message: "expected a sequence of findings".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: "findings".to_owned(), + message: "missing required field".to_owned(), + }), + } + validate_findings(root, issues); +} + +fn validate_schema_and_identity( + root: &Value, + canonical_schema: &str, + legacy_schema: Option<&str>, + issues: &mut Vec, +) { + let schema = scalar_at(root, &["schema"]); + let valid_schema = schema + .as_deref() + .is_some_and(|value| value == canonical_schema || legacy_schema == Some(value)); + if !valid_schema { + issues.push(Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: format!("expected schema to be '{canonical_schema}'"), + }); + } + if let Some(legacy_schema) = legacy_schema + && matches!(schema.as_deref(), Some(value) if value == legacy_schema) + { + issues.push(Issue { + severity: Severity::Warning, + path: "schema".to_owned(), + message: format!("legacy schema value '{legacy_schema}'; prefer '{canonical_schema}'"), + }); + } + require_const(root, &["version"], "1.0", issues); + match scalar_at(root, &["name"]) { + Some(name) if !name.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: "name".to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + 
path: "name".to_owned(), + message: "missing required field".to_owned(), + }), + } + if value_at(root, &["variant_id"]).is_some() { + issues.push(Issue { + severity: Severity::Warning, + path: "variant_id".to_owned(), + message: "variant_id is legacy; prefer name".to_owned(), + }); + } +} + +fn validate_optional_strings(root: &Value, fields: &[&str], issues: &mut Vec) { + for field in fields { + if let Some(value) = value_at(root, &[*field]) { + match value.as_str() { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Warning, + path: (*field).to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: (*field).to_owned(), + message: "expected string".to_owned(), + }), + } + } + } +} + +fn validate_tags(root: &Value, issues: &mut Vec) { + let Some(value) = value_at(root, &["tags"]) else { + return; + }; + let Some(items) = value.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "tags".to_owned(), + message: "expected a sequence of strings".to_owned(), + }); + return; + }; + + for (idx, item) in items.iter().enumerate() { + let Some(tag) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("tags[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if tag.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("tags[{idx}]"), + message: "empty tag string".to_owned(), + }); + } + } +} + +fn validate_identifiers(root: &Value, issues: &mut Vec) { + for field in ["rsids", "aliases"] { + let Some(values) = value_at(root, &["identifiers", field]) else { + continue; + }; + let Some(items) = values.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}"), + message: "expected a sequence of strings".to_owned(), + }); + continue; + }; + let mut seen = BTreeSet::new(); + for (idx, item) in 
items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if !is_rsid(value) { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: format!("expected rsid like rs123, found '{value}'"), + }); + } + if !seen.insert(value.to_owned()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("identifiers.{field}[{idx}]"), + message: format!("duplicate identifier '{value}'"), + }); + } + } + } +} + +fn validate_coordinates(root: &Value, issues: &mut Vec) { + for assembly in ["grch37", "grch38"] { + let Some(coord) = mapping_at(root, &["coordinates", assembly]) else { + continue; + }; + + let Some(chrom) = coord + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: "missing chrom".to_owned(), + }); + continue; + }; + if !is_allowed_chromosome(chrom) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: format!("invalid chromosome '{chrom}'; expected 1-22, X, Y, or MT"), + }); + } + + let has_pos = coord.contains_key(Value::String("pos".to_owned())); + let has_start = coord.contains_key(Value::String("start".to_owned())); + let has_end = coord.contains_key(Value::String("end".to_owned())); + if has_pos && (has_start || has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "use either pos or start/end, not both".to_owned(), + }); + continue; + } + if !(has_pos || has_start && has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected either pos or start/end".to_owned(), + }); + continue; + } + + if has_pos { + 
validate_coordinate_pos(coord, assembly, issues); + } else { + validate_coordinate_range(coord, assembly, issues); + } + } +} + +fn validate_coordinate_pos(coord: &Mapping, assembly: &str, issues: &mut Vec) { + if let Some(pos) = i64_at_mapping(coord, "pos") { + if pos < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer >= 1".to_owned(), + }); + } + } else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer".to_owned(), + }); + } +} + +fn validate_coordinate_range(coord: &Mapping, assembly: &str, issues: &mut Vec) { + let start = i64_at_mapping(coord, "start"); + let end = i64_at_mapping(coord, "end"); + match (start, end) { + (Some(start), Some(end)) => validate_coordinate_range_values(start, end, assembly, issues), + _ => issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected integer start/end".to_owned(), + }), + } +} + +fn validate_coordinate_range_values(start: i64, end: i64, assembly: &str, issues: &mut Vec) { + if start < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.start"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < start { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected end >= start".to_owned(), + }); + } + if start == end { + issues.push(Issue { + severity: Severity::Warning, + path: format!("coordinates.{assembly}"), + message: "single-position coordinate uses start/end; prefer pos".to_owned(), + }); + } +} + diff --git a/rust/bioscript-schema/src/validator_types.rs b/rust/bioscript-schema/src/validator_types.rs new file mode 100644 
index 0000000..13f7a96 --- /dev/null +++ b/rust/bioscript-schema/src/validator_types.rs @@ -0,0 +1,180 @@ +use std::{ + collections::BTreeSet, + fmt::{self, Write as _}, + fs, + path::{Path, PathBuf}, +}; + +use bioscript_core::{GenomicLocus, VariantKind, VariantSpec}; +use serde_yaml::{Mapping, Value}; +use url::Url; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Severity { + Error, + Warning, +} + +impl fmt::Display for Severity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Error => f.write_str("error"), + Self::Warning => f.write_str("warning"), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Issue { + pub severity: Severity, + pub path: String, + pub message: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FileReport { + pub file: PathBuf, + pub issues: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ValidationReport { + pub files_scanned: usize, + pub reports: Vec, +} + +impl ValidationReport { + #[must_use] + pub fn total_issues(&self) -> usize { + self.reports.iter().map(|report| report.issues.len()).sum() + } + + #[must_use] + pub fn total_errors(&self) -> usize { + self.reports + .iter() + .flat_map(|report| &report.issues) + .filter(|issue| issue.severity == Severity::Error) + .count() + } + + #[must_use] + pub fn total_warnings(&self) -> usize { + self.reports + .iter() + .flat_map(|report| &report.issues) + .filter(|issue| issue.severity == Severity::Warning) + .count() + } + + #[must_use] + pub fn has_errors(&self) -> bool { + self.total_errors() > 0 + } + + #[must_use] + pub fn render_text(&self) -> String { + let mut out = String::new(); + let _ = write!( + out, + "files_scanned: {}\nerrors: {}\nwarnings: {}\n", + self.files_scanned, + self.total_errors(), + self.total_warnings() + ); + for report in &self.reports { + out.push('\n'); + let _ = writeln!(out, "file: {}", report.file.display()); + for issue in &report.issues { + let _ = writeln!( + out, + 
" - [{}] {}: {}", + issue.severity, issue.path, issue.message + ); + } + } + out + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct VariantManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub spec: VariantSpec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub permissions: Permissions, + pub downloads: Vec, + pub members: Vec, + pub interpretations: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct AssayManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub members: Vec, + pub interpretations: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct Permissions { + pub domains: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Download { + pub id: String, + pub url: String, + pub origin: String, + pub sha256: String, + pub version: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelMember { + pub kind: String, + pub path: Option, + pub download: Option, + pub sha256: Option, + pub version: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretation { + pub id: String, + pub kind: String, + pub path: String, + pub output_format: Option, + pub derived_from: Vec, + pub emits: Vec, + pub logic: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationLogic { + pub source: Option, + pub description: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationLogicSource { + pub name: Option, + pub url: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelInterpretationEmit { + pub key: String, + pub label: Option, + pub value_type: Option, + pub format: Option, +} diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index 104bf94..82a0fa0 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ 
b/rust/bioscript-schema/tests/validate_variants.rs @@ -5,8 +5,8 @@ use std::{ }; use bioscript_schema::{ - RemoteResourceKind, load_panel_manifest, load_variant_manifest_text, - load_variant_manifest_text_for_lookup, resolve_remote_resource_text, validate_panels_path, + RemoteResourceKind, load_variant_manifest_text, load_variant_manifest_text_for_lookup, + resolve_remote_resource_text, validate_assays_path, validate_panels_path, validate_variants_path, }; @@ -89,7 +89,7 @@ findings: - schema: "bioscript:trait:1.0" alt: "A" summary: "Example finding" - - schema: "bioscript:pgx:1.0" + - schema: "bioscript:pgx-summary:1.0" alt: "*" summary: "Example multiallelic finding" provenance: @@ -106,47 +106,6 @@ provenance: assert_eq!(report.total_warnings(), 0); } -#[test] -fn validate_variants_scans_nested_yaml_files_and_ignores_other_files() { - let dir = temp_dir("validate-dir"); - let nested = dir.join("nested"); - fs::create_dir_all(&nested).unwrap(); - fs::write(dir.join("notes.txt"), "not yaml").unwrap(); - fs::write( - dir.join("valid.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "rs1" -identifiers: - rsids: ["rs1"] -alleles: - kind: "snv" - ref: "A" - alts: ["G"] -"#, - ) - .unwrap(); - fs::write( - nested.join("missing-schema.yml"), - r#" -version: "1.0" -name: "rs2" -"#, - ) - .unwrap(); - - let report = validate_variants_path(&dir).unwrap(); - let text = report.render_text(); - - assert_eq!(report.files_scanned, 2); - assert!(report.has_errors()); - assert_eq!(report.total_errors(), 1); - assert!(text.contains("missing-schema.yml")); - assert!(text.contains("missing schema")); - assert!(!text.contains("notes.txt")); -} - #[test] fn load_variant_manifest_text_accepts_start_end_coordinates() { let manifest = load_variant_manifest_text( @@ -178,45 +137,6 @@ alleles: assert_eq!(grch38.end, 45_679_786); } -#[test] -fn load_panel_manifest_parses_downloads_permissions_and_member_metadata() { - let dir = temp_dir("load-panel"); - let fixture = 
dir.join("panel.yaml"); - fs::write( - &fixture, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "traits-common" -tags: ["type:trait"] -permissions: - domains: - - "https://example.org" -downloads: - - id: "remote-rs1" - url: "https://example.org/variants/rs1.yaml" - sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - version: "2026-01-01" -members: - - kind: "variant" - download: "remote-rs1" - sha256: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" - version: "2026-01-01" -"#, - ) - .unwrap(); - - let panel = load_panel_manifest(&fixture).unwrap(); - - assert_eq!(panel.name, "traits-common"); - assert_eq!(panel.tags, vec!["type:trait"]); - assert_eq!(panel.permissions.domains, vec!["https://example.org"]); - assert_eq!(panel.downloads.len(), 1); - assert_eq!(panel.downloads[0].origin, "https://example.org"); - assert_eq!(panel.members.len(), 1); - assert_eq!(panel.members[0].download.as_deref(), Some("remote-rs1")); -} - #[test] fn lookup_compile_allows_non_execution_metadata_issues() { let text = r#" @@ -234,7 +154,6 @@ alleles: kind: "snv" ref: "C" alts: - - "G" - "T" findings: - alt: "T" @@ -245,7 +164,6 @@ findings: let manifest = load_variant_manifest_text_for_lookup("rs10305420.yaml", text).unwrap(); assert_eq!(manifest.name, "GLP1-nature-23andme-rs10305420-C-T"); assert_eq!(manifest.spec.grch38.unwrap().start, 39_048_860); - assert_eq!(manifest.spec.alternate.as_deref(), Some("T")); } #[test] @@ -296,6 +214,21 @@ members: - kind: "variant" path: "variants/rs671.yaml" version: "1.0" + - kind: "assay" + path: "../risk/APOL1/assay.yaml" + version: "1.0" +interpretations: + - id: "taste_status" + kind: "bioscript" + path: "interpretations/taste.py" + label: "Taste status" + derived_from: + - "variants/rs671.yaml" + emits: + - key: "taste_status" + label: "Taste status" + value_type: "string" + format: "badge" "#, ) .unwrap(); @@ -306,253 +239,39 @@ members: } #[test] -fn 
validate_panels_reports_member_and_download_shape_issues() { - let dir = temp_dir("validate-panel-shape"); - let fixture = dir.join("panel.yaml"); +fn validate_assays_accepts_variant_members_and_interpretations() { + let dir = temp_dir("validate-assay-ok"); + let fixture = dir.join("assay.yaml"); fs::write( &fixture, r#" -schema: "bioscript:panel:1.0" +schema: "bioscript:assay:1.0" version: "1.0" -name: "traits-common" -permissions: - domains: - - "https://example.org/path" - - "ftp://example.org" - - "https://example.org" - - "https://example.org" -downloads: - - id: "remote-rs1" - url: "https://example.org/variants/rs1.yaml" - sha256: "not-a-sha" - version: "1.0" - - id: "remote-rs1" - url: "file:///tmp/rs1.yaml" - sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - version: "" +name: "APOL1" +tags: + - "type:risk" members: - - kind: "script" - path: "variants/rs1.yaml" - download: "remote-rs1" - sha256: "not-a-sha" - version: "" - - kind: "variant" - download: "missing-download" - kind: "variant" - path: "" -"#, - ) - .unwrap(); - - let report = validate_panels_path(&fixture).unwrap(); - let text = report.render_text(); - - assert!(report.total_errors() >= 11, "{text}"); - assert!(report.total_warnings() >= 1, "{text}"); - assert!(text.contains("expected origin only")); - assert!(text.contains("expected http or https origin")); - assert!(text.contains("duplicate origin")); - assert!(text.contains("duplicate download id")); - assert!(text.contains("unknown download id")); - assert!(text.contains("unsupported member kind")); - assert!(text.contains("expected exactly one of path or download")); -} - -#[test] -fn validate_variants_reports_type_and_metadata_issues() { - let dir = temp_dir("validate-variant-edges"); - fs::write( - dir.join("typed-shape.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "2.0" -name: "" -label: 42 -gene: "" -summary: "" -tags: "type:trait" -identifiers: - rsids: "rs1" -coordinates: - grch37: - chrom: "0" - 
pos: "one" -alleles: - kind: "other" - canonical_alt: "G" - ref: "" - alts: - - 1 - - "" -findings: - - "not-a-map" - - schema: "bioscript:trait:1.0" - alt: "G" -provenance: - sources: - - "not-a-map" - - kind: "" - label: "" - url: "mailto:example" -"#, - ) - .unwrap(); - - let report = validate_variants_path(&dir).unwrap(); - let text = report.render_text(); - - assert_eq!(report.files_scanned, 1); - assert!(report.total_errors() >= 17, "{text}"); - assert!(report.total_warnings() >= 4, "{text}"); - for expected in [ - "expected '1.0'", - "expected string", - "expected a sequence of strings", - "expected integer", - "canonical_alt is not part of the current schema", - "expected one of snv, deletion, insertion, indel", - "finding alt 'G' is not present", - "expected http or https URL", - ] { - assert!(text.contains(expected), "{expected}\n{text}"); - } -} - -#[test] -fn validate_variants_reports_coordinate_edge_cases() { - let dir = temp_dir("validate-variant-coordinate-edges"); - fs::write( - dir.join("coordinate-range.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "range" -identifiers: - rsids: - - "rs1" - - "rs1" -coordinates: - grch38: - chrom: "MT" - start: 20 - end: 10 -alleles: - kind: "snv" - ref: "A" - alts: - - "N" -"#, - ) - .unwrap(); - fs::write( - dir.join("single-position-range.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "single-position-range" -coordinates: - grch38: - chrom: "X" - start: 5 - end: 5 -alleles: - kind: "snv" - ref: "A" - alts: - - "G" -"#, - ) - .unwrap(); - fs::write( - dir.join("pos-and-range.yaml"), - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "both-coordinate-styles" -coordinates: - grch38: - chrom: "Y" - pos: 5 - start: 5 - end: 6 -alleles: - kind: "snv" - ref: "A" - alts: - - "G" -"#, - ) - .unwrap(); - - let report = validate_variants_path(&dir).unwrap(); - let text = report.render_text(); - - assert_eq!(report.files_scanned, 3); - assert!(report.total_errors() 
>= 3, "{text}"); - assert!(report.total_warnings() >= 2, "{text}"); - for expected in [ - "duplicate identifier 'rs1'", - "expected end >= start", - "single-position coordinate uses start/end", - "use either pos or start/end", - ] { - assert!(text.contains(expected), "{expected}\n{text}"); - } -} - -#[test] -fn validate_panels_reports_missing_empty_and_type_issues() { - let dir = temp_dir("validate-panel-edges"); - fs::write( - dir.join("missing-members.yaml"), - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "missing-members" -label: 7 -summary: "" -tags: "type:trait" -permissions: - domains: "https://example.org" -downloads: - - "not-a-map" -"#, - ) - .unwrap(); - fs::write( - dir.join("empty-members.yaml"), - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "empty-members" -permissions: - domains: - - 3 - - "https://" - - "https://example.org:8443" -members: [] + path: "g1-site-1.yaml" + version: "1.0" +interpretations: + - id: "apol1_status" + kind: "bioscript" + path: "apol1.py" + derived_from: + - "g1-site-1.yaml" + emits: + - key: "apol1_status" + label: "APOL1 status" + value_type: "string" + format: "badge" "#, ) .unwrap(); - let report = validate_panels_path(&dir).unwrap(); - let text = report.render_text(); - - assert_eq!(report.files_scanned, 2); - assert!(report.total_errors() >= 7, "{text}"); - assert!(report.total_warnings() >= 1, "{text}"); - for expected in [ - "expected a sequence of strings", - "downloads[0]: expected mapping", - "members: missing required field", - "members: expected at least one member", - "permissions.domains[0]: expected string", - "invalid URL", - "expected string", - "empty string", - ] { - assert!(text.contains(expected), "{expected}\n{text}"); - } + let report = validate_assays_path(&fixture).unwrap(); + assert_eq!(report.total_errors(), 0); + assert_eq!(report.total_warnings(), 0); } #[test] @@ -588,424 +307,3 @@ members: 
"https://github.com/madhavajay/exvitae/blob/main/assays/pgx/GLP1/variants/rs2048683.yaml" ); } - -#[test] -fn remote_resource_resolution_classifies_python_without_parsing() { - let text = "print('hello from remote bioscript')\n"; - - let resolved = resolve_remote_resource_text( - "https://github.com/OpenMined/bioscript/blob/main/example.py", - "example.py", - text, - ) - .unwrap(); - - assert_eq!(resolved.kind, RemoteResourceKind::Python); - assert_eq!(resolved.schema, None); - assert_eq!(resolved.title, "example.py"); - assert_eq!( - resolved.sha256, - "b6d9c1ee20c7fb054ebd7defd271d7956b25d8d0c3ef451eaf6adcfda8a61b0f" - ); -} - -#[test] -fn remote_resource_resolution_classifies_schema_kinds() { - let cases = [ - ( - "variant.yaml", - "bioscript:variant:1.0", - RemoteResourceKind::Variant, - ), - ( - "panel.yaml", - "bioscript:panel:1.0", - RemoteResourceKind::Panel, - ), - ( - "catalogue.yaml", - "bioscript:catalogue:1.0", - RemoteResourceKind::Catalogue, - ), - ( - "assay.yaml", - "bioscript:assay:1.0", - RemoteResourceKind::Assay, - ), - ]; - - for (name, schema, expected) in cases { - let text = format!( - r#" -schema: "{schema}" -version: "1.0" -name: "{name}" -"# - ); - - let resolved = - resolve_remote_resource_text("https://example.com/resources/index.yaml", name, &text) - .unwrap(); - - assert_eq!(resolved.kind, expected, "{name}"); - assert_eq!(resolved.schema.as_deref(), Some(schema), "{name}"); - } -} - -#[test] -fn remote_resource_resolution_infers_kind_from_fields_without_schema() { - let cases = [ - ("members.yaml", "members: []\n", RemoteResourceKind::Panel), - ("variants.yaml", "variants: []\n", RemoteResourceKind::Panel), - ( - "catalogue.yaml", - "assays: []\n", - RemoteResourceKind::Catalogue, - ), - ( - "assay.yaml", - "assay:\n package_version: \"2026.1\"\n", - RemoteResourceKind::Assay, - ), - ( - "variant.yaml", - "variant_id: TEST_rs1\ncoordinates: {}\n", - RemoteResourceKind::Variant, - ), - ( - "unknown.yaml", - "name: just-a-file\n", - 
RemoteResourceKind::Unknown, - ), - ]; - - for (name, text, expected) in cases { - let resolved = - resolve_remote_resource_text("https://example.com/resources/index.yaml", name, text) - .unwrap(); - - assert_eq!(resolved.kind, expected, "{name}"); - } -} - -#[test] -fn remote_resource_resolution_resolves_github_dependencies_and_dedupes_urls() { - let text = r#" -schema: "bioscript:panel:1.0" -version: "1.0" -members: - - path: "variants/rs1.yaml" - version: "1.1" - - path: "variants/rs1.yaml" - version: "1.1" - - path: "/shared/rs2.yaml" -downloads: - - url: "https://example.com/reference.json" -"#; - - let resolved = resolve_remote_resource_text( - "https://github.com/OpenMined/bioscript/blob/main/panels/panel.yaml", - "panel.yaml", - text, - ) - .unwrap(); - - let urls = resolved - .dependencies - .iter() - .map(|dependency| dependency.url.as_str()) - .collect::>(); - - assert_eq!( - urls, - vec![ - "https://example.com/reference.json", - "https://github.com/OpenMined/bioscript/blob/main/panels/variants/rs1.yaml", - "https://github.com/OpenMined/bioscript/blob/main/shared/rs2.yaml", - ] - ); - assert_eq!(resolved.dependencies[0].kind, "download"); - assert_eq!(resolved.dependencies[1].kind, "member"); - assert_eq!(resolved.dependencies[1].version.as_deref(), Some("1.1")); -} - -#[test] -fn remote_resource_resolution_reports_invalid_structured_text() { - let err = resolve_remote_resource_text("https://example.com/bad.yaml", "bad.yaml", ":\n") - .unwrap_err(); - - assert!( - err.contains("failed to parse YAML resource bad.yaml"), - "{err}" - ); -} - -#[test] -fn remote_resource_resolution_handles_json_versions_and_plain_relative_urls() { - let resolved = resolve_remote_resource_text( - "https://example.com/catalogues/index.json", - "assay.json", - r#" -{ - "name": "json-assay", - "assay": { - "version": "2026.4", - "panel": "panels/common.yaml" - }, - "artifact_url": "../artifacts/compiled.json" -} -"#, - ) - .unwrap(); - - assert_eq!(resolved.kind, 
RemoteResourceKind::Assay);
-    assert_eq!(resolved.version.as_deref(), Some("2026.4"));
-    let urls = resolved
-        .dependencies
-        .iter()
-        .map(|dependency| dependency.url.as_str())
-        .collect::<Vec<_>>();
-    assert_eq!(
-        urls,
-        vec![
-            "https://example.com/artifacts/compiled.json",
-            "https://example.com/catalogues/panels/common.yaml",
-        ]
-    );
-
-    let err =
-        resolve_remote_resource_text("https://example.com/bad.json", "bad.json", "{").unwrap_err();
-    assert!(
-        err.contains("failed to parse JSON resource bad.json"),
-        "{err}"
-    );
-}
-
-#[test]
-#[allow(clippy::too_many_lines)]
-fn validate_variants_covers_remaining_identity_coordinate_and_allele_edges() {
-    let dir = temp_dir("validate-variant-more-edges");
-    fs::write(
-        dir.join("not-a-variant.yaml"),
-        r#"
-schema: "bioscript:panel:1.0"
-version: "1.0"
-name: "panel-shape"
-members: []
-"#,
-    )
-    .unwrap();
-    fs::write(
-        dir.join("many-errors.yaml"),
-        r#"
-schema: "bioscript:variant:1.0"
-tags:
-  - 7
-  - ""
-identifiers:
-  aliases:
-    - 7
-    - "bad-alias"
-    - "rs22"
-    - "rs22"
-coordinates:
-  grch37:
-    pos: 12
-  grch38:
-    chrom: "2"
-alleles:
-  kind: "snv"
-provenance:
-  sources:
-    - kind: "database"
-"#,
-    )
-    .unwrap();
-    fs::write(
-        dir.join("range-and-alleles.yaml"),
-        r#"
-schema: "bioscript:variant:1.0"
-version: "1.0"
-name: "range-and-alleles"
-coordinates:
-  grch37:
-    chrom: "3"
-    start: 0
-    end: 0
-  grch38:
-    chrom: "4"
-    start: "bad"
-    end: 9
-alleles:
-  kind: "deletion"
-  ref: "A"
-  alts: "T"
-"#,
-    )
-    .unwrap();
-    fs::write(
-        dir.join("empty-alts.yaml"),
-        r#"
-schema: "bioscript:variant:1.0"
-version: "1.0"
-name: "empty-alts"
-coordinates:
-  grch38:
-    chrom: "5"
-    pos: 11
-alleles:
-  kind: "insertion"
-  ref: "A"
-  alts: []
-"#,
-    )
-    .unwrap();
-
-    let report = validate_variants_path(&dir).unwrap();
-    let text = report.render_text();
-
-    assert_eq!(report.files_scanned, 4);
-    assert!(report.total_issues() >= 19, "{text}");
-    for expected in [
-        "missing required field",
-        "tags[0]: expected 
string", - "tags[1]: empty tag string", - "identifiers.aliases[0]: expected string", - "expected rsid like rs123, found 'bad-alias'", - "duplicate identifier 'rs22'", - "coordinates.grch37.chrom: missing chrom", - "coordinates.grch38: expected either pos or start/end", - "coordinates.grch37.start: expected integer >= 1", - "coordinates.grch37.end: expected integer >= 1", - "coordinates.grch38: expected integer start/end", - "alleles.ref: missing required field", - "alleles.alts: expected a non-empty sequence of strings", - "alleles.alts: expected at least one alternate allele", - "provenance.sources[0].label: missing required field", - "provenance.sources[0].url: missing required field", - ] { - assert!(text.contains(expected), "{expected}\n{text}"); - } - assert!(!text.contains("panel-shape")); -} - -#[test] -#[allow(clippy::too_many_lines)] -fn validate_panels_and_loaders_cover_parse_error_edges() { - let dir = temp_dir("validate-panel-more-edges"); - let non_panel = dir.join("variant.yaml"); - fs::write( - &non_panel, - r#" -schema: "bioscript:variant:1.0" -version: "1.0" -name: "rs1" -"#, - ) - .unwrap(); - let missing_schema = dir.join("missing-schema.yaml"); - fs::write( - &missing_schema, - r#" -version: "1.0" -name: "missing-schema" -"#, - ) - .unwrap(); - - let report = validate_panels_path(&dir).unwrap(); - let text = report.render_text(); - assert_eq!(report.files_scanned, 2); - assert_eq!(report.total_errors(), 1, "{text}"); - assert!(text.contains("missing schema")); - assert!(!text.contains("rs1")); - - let invalid_panel = dir.join("invalid-panel.yaml"); - fs::write( - &invalid_panel, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -downloads: - - id: "" - url: "http://" - sha256: "" -members: - - download: "" - - "not-a-map" -"#, - ) - .unwrap(); - let err = load_panel_manifest(&invalid_panel).unwrap_err(); - assert!(err.contains("name: missing required field"), "{err}"); - assert!(err.contains("downloads[0].id: empty string"), "{err}"); - 
assert!( - err.contains("downloads[0].version: missing required field"), - "{err}" - ); - assert!( - err.contains("members[0].kind: missing required field"), - "{err}" - ); - assert!(err.contains("members[0].download: empty string"), "{err}"); - assert!(err.contains("members[1]: expected mapping"), "{err}"); - - let downloads_not_mapping = dir.join("downloads-not-mapping.yaml"); - fs::write( - &downloads_not_mapping, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "bad-download" -downloads: - - "not-a-map" -members: - - kind: "variant" - path: "rs1.yaml" -"#, - ) - .unwrap(); - let err = load_panel_manifest(&downloads_not_mapping).unwrap_err(); - assert!(err.contains("downloads[0]: expected mapping"), "{err}"); - - let members_not_mapping = dir.join("members-not-mapping.yaml"); - fs::write( - &members_not_mapping, - r#" -schema: "bioscript:panel:1.0" -version: "1.0" -name: "bad-member" -members: - - "not-a-map" -"#, - ) - .unwrap(); - let err = load_panel_manifest(&members_not_mapping).unwrap_err(); - assert!(err.contains("members[0]: expected mapping"), "{err}"); - - let invalid_lookup = load_variant_manifest_text_for_lookup( - "bad-lookup.yaml", - r#" -schema: "wrong" -version: "1.0" -name: "bad-lookup" -coordinates: - grch38: - chrom: "1" - pos: 0 -alleles: - kind: "snv" - ref: "A" - alts: ["G"] -"#, - ) - .unwrap_err(); - assert!( - invalid_lookup.contains("schema: expected schema"), - "{invalid_lookup}" - ); - assert!( - invalid_lookup.contains("coordinates.grch38.pos: expected integer >= 1"), - "{invalid_lookup}" - ); -}