diff --git a/docs/assay-schema.md b/docs/assay-schema.md index d087f4e..a6d34bc 100644 --- a/docs/assay-schema.md +++ b/docs/assay-schema.md @@ -86,6 +86,7 @@ Rules: - `derived_from` lists the variant YAML files used by the interpretation - `emits` is optional but recommended so report generators know which output columns to display and how to label them - `logic` is optional; use `logic.description` and `logic.source.url` to document where the script's derivation rules came from +- Analysis rows may emit `notes` or `report_notes` as a reporting convention. HTML reports render those notes below the analysis table and omit them from the table columns; this avoids a manifest-level template language while still letting the script build human-readable text from computed values. ## Findings diff --git a/rust/Cargo.lock b/rust/Cargo.lock index fbf0dee..37a9374 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -137,6 +137,7 @@ name = "bioscript-formats" version = "0.1.0" dependencies = [ "bioscript-core", + "flate2", "noodles", "zip", ] @@ -1694,6 +1695,7 @@ version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ + "indexmap", "itoa", "memchr", "serde", diff --git a/rust/bioscript-cli/Cargo.toml b/rust/bioscript-cli/Cargo.toml index cb78301..25cb202 100644 --- a/rust/bioscript-cli/Cargo.toml +++ b/rust/bioscript-cli/Cargo.toml @@ -13,10 +13,8 @@ bioscript-formats = { path = "../bioscript-formats" } bioscript-runtime = { path = "../bioscript-runtime" } bioscript-schema = { path = "../bioscript-schema" } monty = { path = "../../monty/crates/monty" } -serde_json = "1.0.133" +serde_json = { version = "1.0.133", features = ["preserve_order"] } serde_yaml = "0.9.34" - -[dev-dependencies] zip = { version = "2.2.0", default-features = false, features = ["deflate"] } [lints.clippy] diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs 
b/rust/bioscript-cli/src/cli_bootstrap.rs index 5337229..ffd5398 100644 --- a/rust/bioscript-cli/src/cli_bootstrap.rs +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -9,8 +9,9 @@ use std::{ }; use bioscript_formats::{ - GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InspectOptions, PrepareRequest, - inspect_file, prepare_indexes, shell_flags, + GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InferredSex, InspectOptions, + PrepareRequest, SexDetectionConfidence, SexInference, inspect_file, prepare_indexes, + shell_flags, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; use bioscript_schema::{ @@ -46,6 +47,8 @@ fn run_cli() -> Result<(), String> { normalize_loader_paths(&runtime_root, &mut options.loader); let mut cli_timings = prepare_cli_indexes(&runtime_root, &mut options)?; + let script_path = prepare_package_entrypoint_from_arg(&runtime_root, &script_path)?; + if is_yaml_manifest(&script_path) { run_cli_manifest(&runtime_root, &script_path, &options, &mut cli_timings)?; } else { @@ -54,7 +57,7 @@ fn run_cli() -> Result<(), String> { Ok(()) } -const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] 
--output-dir [--html] [--root ] [--input-format auto|text|zip|vcf|cram]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]"; +const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; struct CliOptions { script_path: Option, @@ -78,6 +81,8 @@ fn dispatch_subcommand(args: &[String]) -> Result { let rest = rest.to_vec(); match first.as_str() { "report" => run_app_report(rest).map(|()| true), + "review" => run_review_report(rest).map(|()| true), + "import-package" => run_import_package(rest).map(|()| true), "validate-variants" => run_validate_variants(rest).map(|()| true), "validate-panels" => run_validate_panels(rest).map(|()| true), "validate-assays" => run_validate_assays(rest).map(|()| true), @@ -411,4 +416,3 @@ 
fn write_timing_report(path: &PathBuf, timings: &[StageTiming]) -> Result<(), St fs::write(path, output) .map_err(|err| format!("failed to write timing report {}: {err}", path.display())) } - diff --git a/rust/bioscript-cli/src/cli_commands.rs b/rust/bioscript-cli/src/cli_commands.rs index 2b5e577..26f7ac4 100644 --- a/rust/bioscript-cli/src/cli_commands.rs +++ b/rust/bioscript-cli/src/cli_commands.rs @@ -88,6 +88,9 @@ fn run_inspect(args: Vec) -> Result<(), String> { iter.next().ok_or("--reference-index requires a path")?, )); } + "--detect-sex" => { + options.detect_sex = true; + } other if path.is_none() => { path = Some(PathBuf::from(other)); } @@ -99,7 +102,7 @@ fn run_inspect(args: Vec) -> Result<(), String> { let Some(path) = path else { return Err( - "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]" .to_owned(), ); }; diff --git a/rust/bioscript-cli/src/commands.rs b/rust/bioscript-cli/src/commands.rs index ddf30b1..6be3b9b 100644 --- a/rust/bioscript-cli/src/commands.rs +++ b/rust/bioscript-cli/src/commands.rs @@ -94,6 +94,9 @@ pub(crate) fn run_inspect(args: Vec) -> Result<(), String> { iter.next().ok_or("--reference-index requires a path")?, )); } + "--detect-sex" => { + options.detect_sex = true; + } other if path.is_none() => { path = Some(PathBuf::from(other)); } @@ -105,7 +108,7 @@ pub(crate) fn run_inspect(args: Vec) -> Result<(), String> { let Some(path) = path else { return Err( - "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]" .to_owned(), ); }; diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index 7e0e20a..2f585b0 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -4,8 +4,11 @@ include!("cli_bootstrap.rs"); 
include!("cli_commands.rs"); include!("report_options.rs"); +include!("package.rs"); +include!("report_review.rs"); include!("report_execution.rs"); include!("report_observations.rs"); +include!("report_findings.rs"); include!("report_matching.rs"); include!("report_output.rs"); include!("report_html.rs"); diff --git a/rust/bioscript-cli/src/manifest.rs b/rust/bioscript-cli/src/manifest.rs index 03cf9ae..fe6e300 100644 --- a/rust/bioscript-cli/src/manifest.rs +++ b/rust/bioscript-cli/src/manifest.rs @@ -198,6 +198,10 @@ pub(crate) fn variant_row( .depth .map_or_else(String::new, |value| value.to_string()), ); + row.insert( + "raw_counts".to_owned(), + serde_json::to_string(&observation.raw_counts).unwrap_or_default(), + ); row.insert("evidence".to_owned(), observation.evidence.join(" | ")); row } diff --git a/rust/bioscript-cli/src/manifest_runner.rs b/rust/bioscript-cli/src/manifest_runner.rs index bc4e917..d750ff3 100644 --- a/rust/bioscript-cli/src/manifest_runner.rs +++ b/rust/bioscript-cli/src/manifest_runner.rs @@ -79,6 +79,15 @@ fn run_variant_manifest( let input_file = input_file.ok_or("manifest execution requires --input-file")?; let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) .map_err(|err| err.to_string())?; + run_variant_manifest_with_store(runtime_root, manifest, &store, participant_id) +} + +fn run_variant_manifest_with_store( + runtime_root: &Path, + manifest: &VariantManifest, + store: &GenotypeStore, + participant_id: Option<&str>, +) -> Result, String> { let observation = store .lookup_variant(&manifest.spec) .map_err(|err| err.to_string())?; @@ -103,6 +112,16 @@ fn run_panel_manifest( let input_file = input_file.ok_or("manifest execution requires --input-file")?; let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) .map_err(|err| err.to_string())?; + run_panel_manifest_with_store(runtime_root, panel, &store, participant_id, filters) +} + +fn run_panel_manifest_with_store( + 
runtime_root: &Path, + panel: &PanelManifest, + store: &GenotypeStore, + participant_id: Option<&str>, + filters: &[String], +) -> Result>, String> { let mut rows = Vec::new(); for member in &panel.members { @@ -115,23 +134,18 @@ fn run_panel_manifest( if !matches_filters(&manifest, &resolved, filters) { continue; } - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| err.to_string())?; - rows.push(variant_row( + rows.push(run_variant_manifest_with_store( runtime_root, - &resolved, - &manifest.name, - &manifest.tags, - &observation, + &manifest, + store, participant_id, - )); + )?); } else if member.kind == "assay" { let assay = load_assay_manifest(&resolved)?; rows.extend(run_assay_manifest_with_store( runtime_root, &assay, - &store, + store, participant_id, filters, )?); @@ -260,6 +274,10 @@ fn variant_row( .depth .map_or_else(String::new, |value| value.to_string()), ); + row.insert( + "raw_counts".to_owned(), + serde_json::to_string(&observation.raw_counts).unwrap_or_default(), + ); row.insert("evidence".to_owned(), observation.evidence.join(" | ")); row } diff --git a/rust/bioscript-cli/src/package.rs b/rust/bioscript-cli/src/package.rs new file mode 100644 index 0000000..88c9fde --- /dev/null +++ b/rust/bioscript-cli/src/package.rs @@ -0,0 +1,410 @@ +const PACKAGE_DESCRIPTOR: &str = "manifest.yaml"; +const LEGACY_PACKAGE_DESCRIPTOR: &str = "bioscript-package.yaml"; +const PACKAGE_CACHE_DIR: &str = ".bioscript-cache/packages"; +const PACKAGE_DOWNLOAD_DIR: &str = ".bioscript-cache/downloads"; +const MAX_PACKAGE_FILES: usize = 1000; +const MAX_PACKAGE_FILE_BYTES: u64 = 16 * 1024 * 1024; +const MAX_PACKAGE_TOTAL_BYTES: u64 = 64 * 1024 * 1024; + +fn prepare_package_entrypoint_from_arg( + runtime_root: &Path, + source: &Path, +) -> Result { + let source_text = source.to_string_lossy(); + let package_path = if is_package_url(&source_text) { + download_package_url(runtime_root, &source_text)? 
+ } else { + source.to_path_buf() + }; + if is_package_zip_path(&package_path) { + let imported = import_package_zip(runtime_root, &package_path, None)?; + Ok(imported.entrypoint) + } else { + Ok(package_path) + } +} + +fn run_import_package(args: Vec) -> Result<(), String> { + let mut source: Option = None; + let mut root: Option = None; + let mut output_dir: Option = None; + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--root" => root = Some(PathBuf::from(next_arg(&mut iter, "--root")?)), + "--output-dir" => { + output_dir = Some(PathBuf::from(next_arg(&mut iter, "--output-dir")?)); + } + value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), + value if source.is_none() => source = Some(PathBuf::from(value)), + value => return Err(format!("unexpected argument: {value}")), + } + } + let source = source.ok_or( + "usage: bioscript import-package [--root ] [--output-dir ]", + )?; + let runtime_root = root + .map_or_else(env::current_dir, Ok) + .map_err(|err| format!("failed to get current directory: {err}"))?; + let source_text = source.to_string_lossy(); + let package_path = if is_package_url(&source_text) { + download_package_url(&runtime_root, &source_text)? 
+ } else { + absolutize(&runtime_root, &source) + }; + let imported = import_package_zip(&runtime_root, &package_path, output_dir.as_deref())?; + println!("root\t{}", imported.root.display()); + println!("entrypoint\t{}", imported.entrypoint.display()); + if let Some(name) = imported.name { + println!("name\t{name}"); + } + Ok(()) +} + +struct ImportedPackage { + root: PathBuf, + entrypoint: PathBuf, + name: Option, +} + +fn import_package_zip( + runtime_root: &Path, + zip_path: &Path, + output_dir: Option<&Path>, +) -> Result { + let replace_existing = output_dir.is_none(); + let target_root = output_dir.map_or_else( + || package_cache_target(runtime_root, zip_path), + |path| absolutize(runtime_root, path), + ); + if target_root.exists() { + if replace_existing { + fs::remove_dir_all(&target_root).map_err(|err| { + format!( + "failed to remove previous package import {}: {err}", + target_root.display() + ) + })?; + } else if target_root + .read_dir() + .map_err(|err| format!("failed to read output dir {}: {err}", target_root.display()))? 
+ .next() + .is_some() + { + return Err(format!( + "package output dir already exists and is not empty: {}", + target_root.display() + )); + } + } + fs::create_dir_all(&target_root).map_err(|err| { + format!( + "failed to create package import dir {}: {err}", + target_root.display() + ) + })?; + extract_package_zip(zip_path, &target_root)?; + let descriptor = load_package_descriptor(&target_root)?; + let entrypoint = target_root.join(&descriptor.entrypoint); + let canonical_root = target_root.canonicalize().map_err(|err| { + format!( + "failed to resolve package root {}: {err}", + target_root.display() + ) + })?; + let canonical_entrypoint = entrypoint.canonicalize().map_err(|err| { + format!( + "failed to resolve package entrypoint {}: {err}", + entrypoint.display() + ) + })?; + if !canonical_entrypoint.starts_with(&canonical_root) { + return Err(format!( + "package entrypoint escapes package root: {}", + descriptor.entrypoint.display() + )); + } + match manifest_schema(&canonical_entrypoint)?.as_str() { + "bioscript:panel:1.0" + | "bioscript:assay:1.0" + | "bioscript:variant:1.0" + | "bioscript:variant" => {} + other => { + return Err(format!( + "package entrypoint has unsupported schema '{other}'" + )) + } + } + Ok(ImportedPackage { + root: canonical_root, + entrypoint: canonical_entrypoint, + name: descriptor.name, + }) +} + +struct PackageDescriptor { + entrypoint: PathBuf, + name: Option, +} + +fn load_package_descriptor(root: &Path) -> Result { + for name in [PACKAGE_DESCRIPTOR, LEGACY_PACKAGE_DESCRIPTOR] { + let path = root.join(name); + if path.exists() { + let text = fs::read_to_string(&path).map_err(|err| { + format!( + "failed to read package descriptor {}: {err}", + path.display() + ) + })?; + let value: serde_yaml::Value = serde_yaml::from_str(&text).map_err(|err| { + format!( + "failed to parse package descriptor {}: {err}", + path.display() + ) + })?; + let schema = value + .as_mapping() + .and_then(|mapping| 
mapping.get(serde_yaml::Value::String("schema".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .ok_or_else(|| { + format!("package descriptor {} is missing schema", path.display()) + })?; + if schema != "bioscript:package:1.0" { + return Err(format!( + "package descriptor {} has unsupported schema '{schema}'", + path.display() + )); + } + let entrypoint = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("entrypoint".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .ok_or_else(|| { + format!( + "package descriptor {} is missing entrypoint", + path.display() + ) + })?; + let entrypoint = checked_relative_package_path(entrypoint)?; + let name = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("name".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned); + return Ok(PackageDescriptor { entrypoint, name }); + } + } + for candidate in ["panel.yaml", "assay.yaml", "variant.yaml"] { + if root.join(candidate).exists() { + return Ok(PackageDescriptor { + entrypoint: PathBuf::from(candidate), + name: None, + }); + } + } + Err(format!( + "package does not contain {PACKAGE_DESCRIPTOR}, {LEGACY_PACKAGE_DESCRIPTOR}, panel.yaml, assay.yaml, or variant.yaml" + )) +} + +fn extract_package_zip(zip_path: &Path, target_root: &Path) -> Result<(), String> { + let file = fs::File::open(zip_path) + .map_err(|err| format!("failed to open package zip {}: {err}", zip_path.display()))?; + let mut archive = zip::ZipArchive::new(file) + .map_err(|err| format!("failed to read package zip {}: {err}", zip_path.display()))?; + if archive.len() > MAX_PACKAGE_FILES { + return Err(format!( + "package has too many entries: {} > {MAX_PACKAGE_FILES}", + archive.len() + )); + } + let mut seen = std::collections::BTreeSet::new(); + let mut total_size = 0_u64; + for idx in 0..archive.len() { + let mut entry = archive + .by_index(idx) + .map_err(|err| format!("failed to read package zip entry {idx}: 
{err}"))?; + let Some(enclosed) = entry.enclosed_name() else { + return Err(format!( + "package zip entry has unsafe path: {}", + entry.name() + )); + }; + let relative = checked_relative_package_path(&enclosed.to_string_lossy())?; + if entry.is_dir() { + fs::create_dir_all(target_root.join(relative)).map_err(|err| { + format!( + "failed to create package directory from {}: {err}", + entry.name() + ) + })?; + continue; + } + if entry + .unix_mode() + .is_some_and(|mode| mode & 0o170_000 == 0o120_000) + { + return Err(format!("package zip entry is a symlink: {}", entry.name())); + } + if !is_allowed_package_file(&relative) { + return Err(format!( + "package zip entry has unsupported extension: {}", + relative.display() + )); + } + if !seen.insert(relative.clone()) { + return Err(format!( + "package zip contains duplicate path: {}", + relative.display() + )); + } + let size = entry.size(); + if size > MAX_PACKAGE_FILE_BYTES { + return Err(format!( + "package file {} exceeds {} bytes", + relative.display(), + MAX_PACKAGE_FILE_BYTES + )); + } + total_size = total_size.saturating_add(size); + if total_size > MAX_PACKAGE_TOTAL_BYTES { + return Err(format!( + "package contents exceed {MAX_PACKAGE_TOTAL_BYTES} bytes" + )); + } + let target = target_root.join(&relative); + if let Some(parent) = target.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!("failed to create package dir {}: {err}", parent.display()) + })?; + } + let mut out = fs::File::create(&target) + .map_err(|err| format!("failed to create package file {}: {err}", target.display()))?; + std::io::copy(&mut entry, &mut out) + .map_err(|err| format!("failed to extract package file {}: {err}", target.display()))?; + } + Ok(()) +} + +fn checked_relative_package_path(raw: &str) -> Result { + let path = Path::new(raw); + if path.is_absolute() { + return Err(format!("package path must be relative: {raw}")); + } + let mut out = PathBuf::new(); + for component in path.components() { + match component { + 
std::path::Component::Normal(part) => out.push(part), + std::path::Component::CurDir => {} + std::path::Component::ParentDir + | std::path::Component::RootDir + | std::path::Component::Prefix(_) => { + return Err(format!("package path escapes package root: {raw}")); + } + } + } + if out.as_os_str().is_empty() { + return Err("package path is empty".to_owned()); + } + Ok(out) +} + +fn is_allowed_package_file(path: &Path) -> bool { + path.file_name() + .and_then(|name| name.to_str()) + .is_some_and(|name| name == PACKAGE_DESCRIPTOR || name == LEGACY_PACKAGE_DESCRIPTOR) + || path + .extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| { + matches!( + ext.to_ascii_lowercase().as_str(), + "yaml" | "yml" | "py" | "md" | "txt" | "tsv" | "json" | "jsonl" + ) + }) +} + +fn package_cache_target(runtime_root: &Path, zip_path: &Path) -> PathBuf { + let stem = zip_path + .file_stem() + .and_then(|value| value.to_str()) + .unwrap_or("package") + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { + ch + } else { + '-' + } + }) + .collect::(); + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_or(0, |duration| duration.as_nanos()); + runtime_root + .join(PACKAGE_CACHE_DIR) + .join(format!("{stem}-{nanos}")) +} + +fn download_package_url(runtime_root: &Path, url: &str) -> Result { + if !url.starts_with("https://") { + return Err("package URLs must use https://".to_owned()); + } + let url_path = url.split('?').next().unwrap_or(url); + if !Path::new(url_path) + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("zip")) + { + return Err("package URL must point to a .zip file".to_owned()); + } + let downloads = runtime_root.join(PACKAGE_DOWNLOAD_DIR); + fs::create_dir_all(&downloads).map_err(|err| { + format!( + "failed to create package download dir {}: {err}", + downloads.display() + ) + })?; + let file_name = url + .split('?') + .next() + .unwrap_or(url) + .rsplit('/') + .next() + 
.filter(|value| !value.is_empty()) + .unwrap_or("package.zip"); + let safe_name = file_name + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-' | '_') { + ch + } else { + '-' + } + }) + .collect::(); + let target = downloads.join(safe_name); + let status = std::process::Command::new("curl") + .arg("-fL") + .arg("--max-time") + .arg("60") + .arg("-o") + .arg(&target) + .arg(url) + .status() + .map_err(|err| format!("failed to run curl for package download: {err}"))?; + if !status.success() { + return Err(format!("package download failed for {url}")); + } + Ok(target) +} + +fn is_package_url(value: &str) -> bool { + value.starts_with("https://") || value.starts_with("http://") +} + +fn is_package_zip_path(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("zip")) +} diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs index d657f6c..8bda914 100644 --- a/rust/bioscript-cli/src/report_execution.rs +++ b/rust/bioscript-cli/src/report_execution.rs @@ -44,28 +44,32 @@ fn run_manifest_rows_for_report( } } +struct ReportAnalysisOptions<'a> { + runtime_root: &'a Path, + input_file: &'a Path, + participant_id: &'a str, + loader: &'a GenotypeLoadOptions, + output_dir: &'a Path, + filters: &'a [String], + max_duration_ms: u64, +} + fn run_manifest_analyses_for_report( - runtime_root: &Path, manifest_path: &Path, - input_file: &Path, - participant_id: &str, - loader: &GenotypeLoadOptions, - output_dir: &Path, + options: &ReportAnalysisOptions<'_>, ) -> Result, String> { match manifest_schema(manifest_path)?.as_str() { "bioscript:panel:1.0" => { let manifest = load_panel_manifest(manifest_path)?; let mut analyses = Vec::new(); - analyses.extend(run_interpretations_for_report( - runtime_root, - &manifest.path, - &manifest.name, - &manifest.interpretations, - input_file, - participant_id, - loader, - output_dir, - )?); + if 
options.filters.is_empty() { + analyses.extend(run_interpretations_for_report( + &manifest.path, + &manifest.name, + &manifest.interpretations, + options, + )?); + } for member in &manifest.members { if member.kind != "assay" { continue; @@ -73,29 +77,22 @@ fn run_manifest_analyses_for_report( let Some(path) = &member.path else { continue; }; - let resolved = resolve_manifest_path(runtime_root, &manifest.path, path)?; - analyses.extend(run_manifest_analyses_for_report( - runtime_root, - &resolved, - input_file, - participant_id, - loader, - output_dir, - )?); + let resolved = + resolve_manifest_path(options.runtime_root, &manifest.path, path)?; + if !analysis_path_matches_filters(&resolved, options.filters) { + continue; + } + analyses.extend(run_manifest_analyses_for_report(&resolved, options)?); } Ok(analyses) } "bioscript:assay:1.0" => { let manifest = load_assay_manifest(manifest_path)?; run_interpretations_for_report( - runtime_root, &manifest.path, &manifest.name, &manifest.interpretations, - input_file, - participant_id, - loader, - output_dir, + options, ) } "bioscript:variant:1.0" | "bioscript:variant" => Ok(Vec::new()), @@ -103,16 +100,18 @@ fn run_manifest_analyses_for_report( } } -#[allow(clippy::too_many_arguments)] +fn analysis_path_matches_filters(path: &Path, filters: &[String]) -> bool { + filters.iter().all(|filter| match filter.split_once('=') { + Some(("path", value)) => path.display().to_string().contains(value), + _ => false, + }) +} + fn run_interpretations_for_report( - runtime_root: &Path, manifest_path: &Path, manifest_name: &str, interpretations: &[PanelInterpretation], - input_file: &Path, - participant_id: &str, - loader: &GenotypeLoadOptions, - output_dir: &Path, + options: &ReportAnalysisOptions<'_>, ) -> Result, String> { let mut outputs = Vec::new(); for interpretation in interpretations { @@ -122,13 +121,14 @@ fn run_interpretations_for_report( interpretation.id, interpretation.kind )); } - let script_path = 
resolve_manifest_path(runtime_root, manifest_path, &interpretation.path)?; + let script_path = + resolve_manifest_path(options.runtime_root, manifest_path, &interpretation.path)?; let format = interpretation .output_format .as_deref() .unwrap_or("json") .to_ascii_lowercase(); - let analysis_dir = output_dir.join("analysis").join(participant_id); + let analysis_dir = options.output_dir.join("analysis").join(options.participant_id); fs::create_dir_all(&analysis_dir).map_err(|err| { format!( "failed to create analysis output dir {}: {err}", @@ -143,25 +143,27 @@ fn run_interpretations_for_report( }; let output_file = analysis_dir.join(format!("{}.{}", interpretation.id, extension)); run_bioscript_analysis_script( - runtime_root, + options.runtime_root, &script_path, - input_file, + options.input_file, &output_file, - participant_id, - loader, + options.participant_id, + options.loader, + options.max_duration_ms, )?; - let rows = parse_analysis_output(&output_file, &format)?; + let (rows, row_headers) = parse_analysis_output(&output_file, &format)?; outputs.push(serde_json::json!({ "schema": "bioscript:analysis-output:1.0", "version": "1.0", - "participant_id": participant_id, + "participant_id": options.participant_id, "assay_id": manifest_name, "analysis_id": interpretation.id, + "analysis_label": interpretation.label.clone(), "kind": interpretation.kind, "output_format": format, - "manifest_path": manifest_path.strip_prefix(runtime_root).unwrap_or(manifest_path).display().to_string(), - "script_path": script_path.strip_prefix(runtime_root).unwrap_or(&script_path).display().to_string(), - "output_file": output_file.strip_prefix(runtime_root).unwrap_or(&output_file).display().to_string(), + "manifest_path": manifest_path.strip_prefix(options.runtime_root).unwrap_or(manifest_path).display().to_string(), + "script_path": script_path.strip_prefix(options.runtime_root).unwrap_or(&script_path).display().to_string(), + "output_file": 
output_file.strip_prefix(options.runtime_root).unwrap_or(&output_file).display().to_string(), "derived_from": interpretation.derived_from.clone(), "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ "key": emit.key.clone(), @@ -176,6 +178,7 @@ fn run_interpretations_for_report( "url": source.url.clone(), })), })), + "row_headers": row_headers, "rows": rows, })); } @@ -189,9 +192,10 @@ fn run_bioscript_analysis_script( output_file: &Path, participant_id: &str, loader: &GenotypeLoadOptions, + analysis_max_duration_ms: u64, ) -> Result<(), String> { let limits = ResourceLimits::new() - .max_duration(Duration::from_secs(1)) + .max_duration(Duration::from_millis(analysis_max_duration_ms)) .max_memory(16 * 1024 * 1024) .max_allocations(400_000) .gc_interval(1000) @@ -234,7 +238,10 @@ fn runtime_path_string(runtime_root: &Path, path: &Path) -> String { .to_string() } -fn parse_analysis_output(path: &Path, format: &str) -> Result, String> { +fn parse_analysis_output( + path: &Path, + format: &str, +) -> Result<(Vec, Vec), String> { let text = fs::read_to_string(path) .map_err(|err| format!("failed to read analysis output {}: {err}", path.display()))?; match format { @@ -243,28 +250,34 @@ fn parse_analysis_output(path: &Path, format: &str) -> Result rows, serde_json::Value::Object(mut object) => object .remove("rows") .and_then(|rows| rows.as_array().cloned()) .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), other => vec![other], - }) + }; + let headers = analysis_headers_from_rows(&rows); + Ok((rows, headers)) } - "jsonl" => text + "jsonl" => { + let rows = text .lines() .filter(|line| !line.trim().is_empty()) .map(|line| serde_json::from_str(line).map_err(|err| err.to_string())) - .collect(), + .collect::, _>>()?; + let headers = analysis_headers_from_rows(&rows); + Ok((rows, headers)) + } other => Err(format!("unsupported analysis output_format '{other}'")), } } -fn parse_analysis_tsv(text: &str) -> Vec { +fn parse_analysis_tsv(text: &str) -> 
(Vec, Vec) { let mut lines = text.lines().filter(|line| !line.trim().is_empty()); let Some(header_line) = lines.next() else { - return Vec::new(); + return (Vec::new(), Vec::new()); }; let headers: Vec<&str> = header_line.split('\t').collect(); let mut rows = Vec::new(); @@ -279,7 +292,22 @@ fn parse_analysis_tsv(text: &str) -> Vec { } rows.push(serde_json::Value::Object(object)); } - rows + (rows, headers.iter().map(|header| (*header).to_owned()).collect()) +} + +fn analysis_headers_from_rows(rows: &[serde_json::Value]) -> Vec { + let mut headers = Vec::new(); + for row in rows { + let Some(object) = row.as_object() else { + continue; + }; + for key in object.keys() { + if !headers.contains(key) { + headers.push(key.clone()); + } + } + } + headers } fn app_assay_id(path: &Path) -> Result { diff --git a/rust/bioscript-cli/src/report_findings.rs b/rust/bioscript-cli/src/report_findings.rs new file mode 100644 index 0000000..93bd872 --- /dev/null +++ b/rust/bioscript-cli/src/report_findings.rs @@ -0,0 +1,313 @@ +fn load_manifest_findings( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut findings = Vec::new(); + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) && let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let include = json_item + .get("include") + .and_then(serde_json::Value::as_str) + .map(str::to_owned); + if let Some(include) = include { + let include_path = resolve_manifest_path(root, manifest_path, &include)?; + let mut included = load_manifest_findings(root, &include_path)?; + let inherited_binding = json_item.get("binding").cloned(); + for included_item in &mut 
included { + if inherited_binding.is_some() + && included_item.get("binding").is_none() + && included_item.get("effects").is_none() + && let Some(object) = included_item.as_object_mut() + { + object.insert( + "binding".to_owned(), + inherited_binding.clone().unwrap_or(serde_json::Value::Null), + ); + } + } + findings.extend(included); + continue; + } + if json_item.get("include").is_none() { + findings.push(json_item); + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") + && let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + findings.extend(load_manifest_findings(root, &member_path)?); + } + } + + Ok(findings) +} + +fn load_yaml_value(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + +fn yaml_to_json(value: serde_yaml::Value) -> Result { + serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) +} + +fn load_manifest_provenance_links( + root: &Path, + manifest_path: &Path, +) -> Result, String> { + let value = load_yaml_value(manifest_path)?; + let schema = value + .get("schema") + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default(); + let mut links = BTreeMap::::new(); + collect_manifest_provenance_entries(&value, &mut links)?; + + if matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) && let Some(items) 
= value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let Some(include) = json_item.get("include").and_then(serde_json::Value::as_str) else { + continue; + }; + let include_path = resolve_manifest_path(root, manifest_path, include)?; + for item in load_manifest_provenance_links(root, &include_path)? { + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + + if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") + && let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { + for member in items { + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { + continue; + }; + let member_path = resolve_manifest_path(root, manifest_path, path)?; + for item in load_manifest_provenance_links(root, &member_path)? 
{ + if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(item); + } + } + } + } + + Ok(links.into_values().collect()) +} + +fn collect_manifest_provenance_entries( + value: &serde_yaml::Value, + links: &mut BTreeMap, +) -> Result<(), String> { + if let Some(sources) = value + .get("provenance") + .and_then(|provenance| provenance.get("sources")) + .and_then(serde_yaml::Value::as_sequence) + { + for source in sources { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + } + if let Some(source) = value.get("source") { + let json = yaml_to_json(source.clone())?; + if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { + links.entry(url.to_owned()).or_insert(json); + } + } + Ok(()) +} + +fn match_app_findings( + findings: &[serde_json::Value], + observations: &[serde_json::Value], + analyses: &[serde_json::Value], +) -> Vec { + let mut matched = Vec::new(); + let mut seen = std::collections::BTreeSet::new(); + for finding in findings { + if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { + for effect in effects { + if let Some(observation) = app_finding_match_observation(effect, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + 
object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + } else if let Some(observation) = app_finding_match_observation(finding, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_analysis".to_owned(), analysis); + } + let key = app_finding_dedupe_key(&item); + if seen.insert(key) { + matched.push(item); + } + } + } + matched +} + +#[cfg(test)] +mod report_observations_tests { + use super::*; + + #[test] + fn raw_counts_can_fill_display_for_homozygous_and_heterozygous_observations() { + assert_eq!( + genotype_display_from_raw_counts(r#"{"T": 24}"#).as_deref(), + Some("TT") + ); + assert_eq!( + genotype_display_from_raw_counts(r#"{"C": 12, "T": 10}"#).as_deref(), + Some("CT") + ); + } + + #[test] + fn non_reportable_alleles_are_classified_as_observed_or_unknown() { + assert_eq!( + classify_non_reportable_alleles("TT", "C", "G", &["T".to_owned()]), + Some("observed_alt") + ); + assert_eq!( + classify_non_reportable_alleles("AT", "C", "G", &["T".to_owned()]), + Some("unknown_alt") + ); + assert_eq!( + classify_non_reportable_alleles("CG", "C", "G", &["T".to_owned()]), + None + ); + } + + #[test] + fn single_allele_sex_chromosome_calls_are_treated_as_hemizygous() { + 
assert_eq!( + normalize_app_genotype("G", "C", "G", "X", None), + ("1".to_owned(), "hem_alt".to_owned()) + ); + assert_eq!( + normalize_app_genotype("C", "C", "G", "chrX", None), + ("0".to_owned(), "hem_ref".to_owned()) + ); + assert_eq!( + normalize_app_genotype("G", "C", "G", "1", None), + ("G".to_owned(), "unknown".to_owned()) + ); + assert_eq!( + normalize_app_genotype("GG", "C", "G", "X", None), + ("1/1".to_owned(), "hom_alt".to_owned()) + ); + } + + #[test] + fn confident_male_sex_chromosome_duplicate_calls_are_hemizygous() { + let inferred_sex = SexInference { + sex: InferredSex::Male, + confidence: SexDetectionConfidence::High, + method: "vcf_non_par_x_gt".to_owned(), + evidence: vec!["called_y_snps=1200".to_owned()], + }; + assert_eq!( + normalize_app_genotype("GG", "C", "G", "X", Some(&inferred_sex)), + ("1".to_owned(), "hem_alt".to_owned()) + ); + assert_eq!( + normalize_app_genotype("CC", "C", "G", "chrX", Some(&inferred_sex)), + ("0".to_owned(), "hem_ref".to_owned()) + ); + } +} diff --git a/rust/bioscript-cli/src/report_html.rs b/rust/bioscript-cli/src/report_html.rs index caed479..4013c20 100644 --- a/rust/bioscript-cli/src/report_html.rs +++ b/rust/bioscript-cli/src/report_html.rs @@ -1,3 +1,6 @@ include!("report_html_sections.rs"); +include!("report_html_analysis.rs"); +include!("report_html_provenance.rs"); +include!("report_html_observations.rs"); include!("report_html_pgx.rs"); include!("report_html_helpers.rs"); diff --git a/rust/bioscript-cli/src/report_html_analysis.rs b/rust/bioscript-cli/src/report_html_analysis.rs new file mode 100644 index 0000000..b667dc5 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_analysis.rs @@ -0,0 +1,252 @@ +fn render_analysis_tables( + out: &mut String, + analyses: &[serde_json::Value], + show_participant_id: bool, +) { + if analyses.is_empty() { + out.push_str("

No analysis outputs.

"); + return; + } + for (index, analysis) in analyses.iter().enumerate() { + let table_id = format!("analysis-table-{index}"); + let title = analysis_title(analysis); + let row_count = analysis + .get("rows") + .and_then(serde_json::Value::as_array) + .map_or(0, Vec::len); + let _ = write!( + out, + "
{}{} result row(s)
", + html_escape(&title), + row_count + ); + render_analysis_logic(out, analysis); + let rows = analysis + .get("rows") + .and_then(serde_json::Value::as_array) + .cloned() + .unwrap_or_default(); + if rows.is_empty() { + out.push_str("

No rows emitted.

"); + out.push_str("
"); + continue; + } + let headers = analysis_row_headers(analysis, &rows, show_participant_id); + let notes = analysis_notes(&rows); + if rows.len() == 1 { + out.push_str("

Results

"); + render_analysis_key_values(out, analysis, &rows[0], &headers); + render_analysis_notes(out, ¬es); + out.push_str(""); + continue; + } + out.push_str("

Results

"); + let header_refs = headers.iter().map(String::as_str).collect::>(); + render_table_start(out, &table_id, &header_refs); + let participant = value_str(analysis, "participant_id"); + for row in rows { + let _ = write!( + out, + "", + html_escape(participant) + ); + for header in &headers { + table_cell(out, &json_field_as_tsv(row.get(header))); + } + out.push_str(""); + } + render_table_end(out); + render_analysis_notes(out, ¬es); + out.push_str(""); + } +} + +fn analysis_title(analysis: &serde_json::Value) -> String { + let label = value_str(analysis, "analysis_label"); + if label.is_empty() { + value_str(analysis, "analysis_id").to_owned() + } else { + label.to_owned() + } +} + +fn analysis_row_headers( + analysis: &serde_json::Value, + rows: &[serde_json::Value], + show_participant_id: bool, +) -> Vec { + let mut headers = Vec::new(); + if let Some(emits) = analysis.get("emits").and_then(serde_json::Value::as_array) { + for key in emits + .iter() + .filter_map(|emit| emit.get("key")) + .filter_map(serde_json::Value::as_str) + { + if should_show_analysis_header(key, show_participant_id) + && rows.iter().any(|row| row.get(key).is_some()) + && !headers.iter().any(|item| item == key) + { + headers.push(key.to_owned()); + } + } + } + if let Some(row_headers) = analysis + .get("row_headers") + .and_then(serde_json::Value::as_array) + { + for header in row_headers.iter().filter_map(serde_json::Value::as_str) { + if should_show_analysis_header(header, show_participant_id) + && !headers.iter().any(|item| item == header) + { + headers.push(header.to_owned()); + } + } + } + for row in rows { + let Some(object) = row.as_object() else { + continue; + }; + for key in object.keys() { + if !should_show_analysis_header(key, show_participant_id) { + continue; + } + if !headers.contains(key) { + headers.push(key.clone()); + } + } + } + headers +} + +fn should_show_analysis_header(key: &str, show_participant_id: bool) -> bool { + (show_participant_id || key != "participant_id") 
&& key != "notes" && key != "report_notes" +} + +fn analysis_notes(rows: &[serde_json::Value]) -> Vec { + let mut notes = Vec::new(); + for row in rows { + if let Some(note) = row + .get("notes") + .or_else(|| row.get("report_notes")) + .and_then(serde_json::Value::as_str) + && !note.trim().is_empty() + && !notes.iter().any(|item: &String| item == note) + { + notes.push(note.to_owned()); + } + } + notes +} + +fn render_analysis_key_values( + out: &mut String, + analysis: &serde_json::Value, + row: &serde_json::Value, + headers: &[String], +) { + out.push_str("
"); + for header in headers { + let value = json_field_as_tsv(row.get(header)); + let _ = write!( + out, + "
{}
{}
", + html_escape(&analysis_header_label(analysis, header)), + render_analysis_value(header, &value) + ); + } + out.push_str("
"); +} + +fn analysis_header_label(analysis: &serde_json::Value, key: &str) -> String { + analysis + .get("emits") + .and_then(serde_json::Value::as_array) + .and_then(|emits| { + emits.iter().find_map(|emit| { + if emit.get("key").and_then(serde_json::Value::as_str) == Some(key) { + emit.get("label") + .and_then(serde_json::Value::as_str) + .map(ToOwned::to_owned) + } else { + None + } + }) + }) + .unwrap_or_else(|| table_header_label(key)) +} + +fn render_analysis_value(key: &str, value: &str) -> String { + if value.starts_with("http://") || value.starts_with("https://") { + return format!( + "Source", + html_escape(value) + ); + } + if key.ends_with("_status") || key.ends_with("_outcome") { + format!( + "{}", + analysis_badge_class(value), + html_escape(value) + ) + } else { + html_escape(value) + } +} + +fn analysis_badge_class(value: &str) -> &'static str { + match value { + "normal" | "reference" => "analysis-badge-normal", + "variant" => "analysis-badge-variant", + "unknown" | "unresolved_missing_variant" => "analysis-badge-unknown", + _ => "", + } +} + +fn render_analysis_notes(out: &mut String, notes: &[String]) { + if notes.is_empty() { + return; + } + out.push_str("
"); + out.push_str("

Notes

"); + for note in notes { + let _ = write!(out, "

{}

", html_escape(note)); + } + out.push_str("
"); +} + +fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { + let Some(logic) = analysis.get("logic") else { + return; + }; + if logic.is_null() { + return; + } + let description = logic + .get("description") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let source = logic.get("source").unwrap_or(&serde_json::Value::Null); + let source_name = source + .get("name") + .and_then(serde_json::Value::as_str) + .unwrap_or("source"); + let source_url = source + .get("url") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + out.push_str("
"); + out.push_str("

Description

"); + if !description.is_empty() { + let _ = write!(out, "

{}

", html_escape(description)); + } + if !source_url.is_empty() { + let _ = write!( + out, + "

Logic source: {}

", + html_escape(source_url), + html_escape(source_name) + ); + } + out.push_str("
"); +} + diff --git a/rust/bioscript-cli/src/report_html_helpers.rs b/rust/bioscript-cli/src/report_html_helpers.rs index d43ef46..65a6ae6 100644 --- a/rust/bioscript-cli/src/report_html_helpers.rs +++ b/rust/bioscript-cli/src/report_html_helpers.rs @@ -1,26 +1,82 @@ fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { let escaped_id = html_escape(table_id); - let refs_control = if table_id == "observations-table" { - "" - } else { - "" - }; let _ = write!( out, - "
{refs_control}
" + "
" ); for (index, header) in headers.iter().enumerate() { let _ = write!( out, - "", + "", + table_column_class(header), escaped_id, index, - html_escape(header) + html_escape(&table_header_label(header)) ); } out.push_str(""); } +fn table_column_class(header: &str) -> &'static str { + if is_debug_column(header) { + "debug-col" + } else { + "" + } +} + +fn is_debug_column(header: &str) -> bool { + matches!( + header, + "allele_balance" + | "genotype_quality" + | "evidence_type" + | "evidence_raw" + | "assay_id" + | "assay_version" + | "variant_key" + | "match_status" + | "coverage_status" + | "call_status" + ) +} + +fn table_header_label(header: &str) -> String { + match header { + "participant_id" => "Participant ID".to_owned(), + "rsid" => "RSID".to_owned(), + "gene" => "Gene".to_owned(), + "ref" => "Ref".to_owned(), + "alt" => "Alt".to_owned(), + "ref_alt" | "Ref/Alt" => "Ref / Alt".to_owned(), + "genotype_display" => "Genotype".to_owned(), + "genotype" => "GT".to_owned(), + "zygosity" => "Zygosity".to_owned(), + "outcome" => "Outcome".to_owned(), + "match_status" => "Match Status".to_owned(), + "coverage_status" => "Coverage Status".to_owned(), + "call_status" => "Call Status".to_owned(), + "assembly" => "Assembly".to_owned(), + "chrom" => "Chrom".to_owned(), + "pos_start" => "Start".to_owned(), + "pos_end" => "End".to_owned(), + "kind" => "Kind".to_owned(), + "ref_count" => "Ref Count".to_owned(), + "alt_count" => "Alt Count".to_owned(), + "depth" => "Depth".to_owned(), + "genotype_quality" => "Genotype Quality".to_owned(), + "allele_balance" => "Allele Balance".to_owned(), + "evidence_type" => "Evidence Type".to_owned(), + "source" => "Source".to_owned(), + "evidence_raw" => "Evidence Raw".to_owned(), + "facets" => "Facets".to_owned(), + "assay_id" => "Assay ID".to_owned(), + "assay_version" => "Assay Version".to_owned(), + "variant_key" => "Variant Key".to_owned(), + other => other.to_owned(), + } +} + fn render_table_end(out: &mut String) { out.push_str("
{}{}
"); } @@ -95,4 +151,3 @@ fn html_escape(value: &str) -> String { .replace('>', ">") .replace('"', """) } - diff --git a/rust/bioscript-cli/src/report_html_observations.rs b/rust/bioscript-cli/src/report_html_observations.rs new file mode 100644 index 0000000..8f68623 --- /dev/null +++ b/rust/bioscript-cli/src/report_html_observations.rs @@ -0,0 +1,230 @@ +fn render_observation_table( + out: &mut String, + observations: &[serde_json::Value], + show_participant_id: bool, +) { + render_observation_filters(out); + let all_headers = [ + "participant_id", + "outcome", + "rsid", + "gene", + "ref_alt", + "genotype_display", + "genotype", + "zygosity", + "assembly", + "chrom", + "pos_start", + "pos_end", + "kind", + "ref_count", + "alt_count", + "depth", + "genotype_quality", + "allele_balance", + "evidence_type", + "evidence_raw", + "facets", + "assay_id", + "assay_version", + "variant_key", + "match_status", + "coverage_status", + "call_status", + "source", + ]; + let show_counts = observations.iter().any(observation_has_quantitative_depth); + let show_genotype_quality = observations + .iter() + .any(|observation| !json_field_as_tsv(observation.get("genotype_quality")).is_empty()); + let show_facets = observations + .iter() + .any(|observation| !json_field_as_tsv(observation.get("facets")).is_empty()); + let headers = all_headers + .iter() + .copied() + .filter(|header| show_participant_id || *header != "participant_id") + .filter(|header| { + show_counts || !matches!(*header, "ref_count" | "alt_count" | "depth" | "allele_balance") + }) + .filter(|header| show_genotype_quality || *header != "genotype_quality") + .filter(|header| show_facets || *header != "facets") + .collect::>(); + render_table_start(out, "observations-table", &headers); + for observation in observations { + let _ = write!( + out, + "", + observation_row_class(observation), + observation_filter_group(observation), + html_escape(value_str(observation, "participant_id")) + ); + for header in &headers { + 
render_observation_cell(out, observation, header); + } + out.push_str(""); + } + out.push_str(""); +} + +fn observation_filter_group(observation: &serde_json::Value) -> &'static str { + match observation_row_class(observation) { + "row-reference" => "reference", + "row-missing" => "missing", + _ => "variant", + } +} + +fn render_observation_filters(out: &mut String) { + out.push_str("
Observations:"); + for (outcome, label) in [ + ("variant", "Show variants"), + ("reference", "Show reference"), + ("missing", "Show missing"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str(""); + out.push_str(""); + out.push_str("
"); +} + +fn observation_has_quantitative_depth(observation: &serde_json::Value) -> bool { + ["ref_count", "alt_count", "depth", "allele_balance"] + .iter() + .any(|key| !json_field_as_tsv(observation.get(*key)).is_empty()) +} + +fn observation_row_class(observation: &serde_json::Value) -> &'static str { + let outcome = observation + .get("outcome") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if outcome == "variant" { + "row-variant" + } else if outcome == "reference" { + "row-reference" + } else if observation + .get("call_status") + .and_then(serde_json::Value::as_str) + != Some("called") + || observation + .get("match_status") + .and_then(serde_json::Value::as_str) + == Some("not_found") + { + "row-missing" + } else { + "" + } +} + +fn render_observation_cell(out: &mut String, observation: &serde_json::Value, header: &str) { + let cell_class = table_column_class(header); + if header == "ref_alt" { + class_cell(out, &observation_ref_alt(observation), "mono"); + return; + } + if header == "allele_balance" { + let value = observation + .get(header) + .and_then(serde_json::Value::as_f64) + .map(|value| format!("{value:.2}")) + .unwrap_or_default(); + class_cell(out, &value, cell_class); + return; + } + if header == "source" { + let source = observation.get("source").unwrap_or(&serde_json::Value::Null); + let url = source + .get("url") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if url.is_empty() { + let _ = write!(out, ""); + } else { + let _ = write!( + out, + "Source", + cell_class, + html_escape(url) + ); + } + return; + } + if header == "genotype_display" { + let outcome = observation + .get("outcome") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let value = json_field_as_tsv(observation.get(header)); + if matches!(outcome, "variant" | "observed_alt" | "unknown_alt") { + let alt = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let _ = write!( + out, + "{}", 
+ cell_class, + highlight_allele(&value, alt) + ); + return; + } + } + let _ = write!( + out, + "{}", + cell_class, + html_escape(&json_field_as_tsv(observation.get(header))) + ); +} + +fn observation_ref_alt(observation: &serde_json::Value) -> String { + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if ref_allele.is_empty() && alt_allele.is_empty() { + String::new() + } else { + format!("{ref_allele}->{}", alt_allele.replace(',', "/")) + } +} + +fn highlight_allele(value: &str, allele: &str) -> String { + if value.is_empty() || allele.is_empty() { + return html_escape(value); + } + if allele.chars().count() == 1 { + let target = allele + .chars() + .next() + .unwrap_or_default() + .to_ascii_uppercase(); + let mut out = String::new(); + for ch in value.chars() { + let escaped = html_escape(&ch.to_string()); + if ch.to_ascii_uppercase() == target { + let _ = write!(out, "{escaped}"); + } else { + out.push_str(&escaped); + } + } + return out; + } + let escaped_value = html_escape(value); + let escaped_allele = html_escape(allele); + escaped_value.replace( + &escaped_allele, + &format!("{escaped_allele}"), + ) +} diff --git a/rust/bioscript-cli/src/report_html_pgx.rs b/rust/bioscript-cli/src/report_html_pgx.rs index bbf0065..b205973 100644 --- a/rust/bioscript-cli/src/report_html_pgx.rs +++ b/rust/bioscript-cli/src/report_html_pgx.rs @@ -1,87 +1,214 @@ -fn render_pgx_label_table(out: &mut String, findings: &[serde_json::Value]) { +fn render_pgx_table( + out: &mut String, + label_findings: &[serde_json::Value], + summary_findings: &[serde_json::Value], +) { + let mut findings = Vec::new(); + findings.extend(label_findings.iter()); + findings.extend(summary_findings.iter()); + if findings.is_empty() { + out.push_str("

No PGx findings.

"); + return; + } + render_pgx_filters(out); + out.push_str("
"); + out.push_str("
"); let headers = [ - "Variant", - "Ref/Alt", - "Genes", + "Type", + "RSID", + "Gene", + "Ref / Alt", + "Genotype", "Drug(s)", - "Regulator", - "Action", - "Label", - "Evidence", + "Level", + "Category", + "Phenotype", + "Finding", + "Source", ]; - render_pgx_label_filters(out); - render_table_start(out, "labels-table", &headers); - for finding in findings { - let evidence = finding.get("evidence"); - let url = evidence - .and_then(|value| value.get("url")) - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let pgx_level = value_str(finding, "pgx_action_level"); - let _ = write!( - out, - "", - html_escape(&pgx_level_slug(pgx_level)) - ); - table_cell(out, value_str(finding, "variant")); - class_cell(out, &matched_ref_alt(finding), "mono"); - table_cell(out, &join_string_array(finding.get("genes"))); - table_cell(out, &join_drugs(finding)); - table_cell(out, &join_string_array(finding.get("regulatory_sources"))); - pgx_level_cell(out, pgx_level); - table_cell(out, value_str(finding, "label")); - link_cell(out, url); - out.push_str(""); + render_table_start(out, "pgx-variant-table", &headers); + for finding in &findings { + render_pgx_row(out, finding, true); } render_table_end(out); -} - -fn render_pgx_summary_table(out: &mut String, findings: &[serde_json::Value]) { - let headers = [ - "Variant", - "Ref/Alt", + out.push_str("
"); +} + +fn render_pgx_row(out: &mut String, finding: &serde_json::Value, show_drug: bool) { + let source_type = pgx_source_type(finding); + let level = pgx_level_value(finding); + let level_slug = pgx_level_filter_slug(finding); + let outcome = pgx_outcome_filter_slug(finding); + let _ = write!( + out, + "", + html_escape(&evidence_level_group(level)), + html_escape(&level_slug), + html_escape(outcome), + html_escape(&finding_participant(finding)) + ); + table_cell(out, source_type); + table_cell(out, &finding_rsid(finding)); + table_cell(out, &finding_gene(finding)); + class_cell(out, &matched_ref_alt(finding), "mono"); + pgx_genotype_cell(out, finding); + if show_drug { table_cell(out, &join_drugs(finding)); - table_cell(out, &join_string_array(finding.get("phenotype_categories"))); - evidence_level_cell(out, evidence_level); - table_cell(out, &join_string_array(finding.get("phenotypes"))); - class_cell(out, value_str(effect, "text"), "effect"); - link_cell(out, url); - out.push_str(""); } - render_table_end(out); + pgx_any_level_cell(out, finding); + table_cell(out, &pgx_category(finding)); + table_cell(out, &join_string_array(finding.get("phenotypes"))); + class_cell(out, &pgx_finding_text(finding), "effect"); + link_cell(out, pgx_evidence_url(finding)); + out.push_str(""); +} + +fn pgx_source_type(finding: &serde_json::Value) -> &str { + match value_str(finding, "schema") { + "bioscript:pgx-label:1.0" => "Drug Label", + _ => "Summary", + } +} + +fn pgx_level_value(finding: &serde_json::Value) -> &str { + if pgx_source_type(finding) == "Drug Label" { + value_str(finding, "pgx_action_level") + } else { + value_str(finding, "evidence_level") + } +} + +fn pgx_level_filter_slug(finding: &serde_json::Value) -> String { + if pgx_source_type(finding) == "Drug Label" { + format!("drug-{}", pgx_level_slug(pgx_level_value(finding))) + } else { + format!("summary-{}", evidence_level_group(pgx_level_value(finding))) + } +} + +fn pgx_category(finding: &serde_json::Value) 
-> String { + if pgx_source_type(finding) == "Drug Label" { + let actions = join_string_array(finding.get("prescribing_actions")); + let sources = join_string_array(finding.get("regulatory_sources")); + if actions.is_empty() { + sources + } else if sources.is_empty() { + actions + } else { + format!("{sources}; {actions}") + } + } else { + join_string_array(finding.get("phenotype_categories")) + } +} + +fn pgx_finding_text(finding: &serde_json::Value) -> String { + if pgx_source_type(finding) == "Drug Label" { + for key in ["prescribing_information", "summary", "notes", "label"] { + let value = value_str(finding, key); + if !value.is_empty() { + return value.to_owned(); + } + } + return String::new(); + } + let effect = finding + .get("matched_effect") + .unwrap_or(&serde_json::Value::Null); + let text = value_str(effect, "text"); + if !text.is_empty() { + return text.to_owned(); + } + value_str(finding, "notes").to_owned() +} + +fn pgx_evidence_url(finding: &serde_json::Value) -> &str { + finding + .get("evidence") + .and_then(|value| value.get("url")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default() +} + +fn pgx_drug_names(findings: &[&serde_json::Value]) -> Vec { + let mut drugs = Vec::new(); + for finding in findings { + for drug in finding_drug_names(finding) { + if !drugs.contains(&drug) { + drugs.push(drug); + } + } + } + drugs.sort(); + drugs +} + +fn finding_drug_names(finding: &serde_json::Value) -> Vec { + finding + .get("drugs") + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(|drug| drug.get("name").and_then(serde_json::Value::as_str)) + .map(ToOwned::to_owned) + .collect::>() + }) + .unwrap_or_default() } -fn render_evidence_level_filters(out: &mut String) { - out.push_str("
Evidence:"); +fn render_pgx_filters(out: &mut String) { + out.push_str("
Drug Label PGx Level i
"); for (level, label) in [ + ("required", "Testing Required"), + ("recommended", "Testing Recommended"), + ("actionable", "Actionable PGx"), + ("informative", "Informative PGx"), + ("no-clinical", "No Clinical PGx"), + ("criteria", "Criteria Not Met"), + ("unknown", "Unknown"), + ] { + let _ = write!( + out, + "" + ); + } + out.push_str("
Summary Evidence Level i
"); + for (level, label, class_level) in [ ("1", "Level 1"), ("1a", "Level 1A"), ("1b", "Level 1B"), @@ -90,33 +217,115 @@ fn render_evidence_level_filters(out: &mut String) { ("2b", "Level 2B"), ("3", "Level 3"), ("4", "Level 4"), + ("unknown", "Unknown"), + ] + .map(|(level, label)| (level, label, evidence_level_color_group(level))) + { + let _ = write!( + out, + "", + html_escape(&class_level) + ); + } + out.push_str("
Result
"); + for (outcome, label, class_name) in [ + ("variant", "Variant", "analysis-badge-variant"), + ("reference", "Normal", "analysis-badge-normal"), + ("missing", "Missing", "analysis-badge-unknown"), ] { let _ = write!( out, - "" + "" ); } - out.push_str(""); - out.push_str("i
"); + out.push_str("
"); } -fn render_pgx_label_filters(out: &mut String) { - out.push_str("
PGx level:"); - for (level, label) in [ - ("required", "Testing Required"), - ("recommended", "Testing Recommended"), - ("actionable", "Actionable PGx"), - ("informative", "Informative PGx"), - ("no-clinical", "No Clinical PGx"), - ("criteria", "Criteria Not Met"), - ] { +fn pgx_any_level_cell(out: &mut String, finding: &serde_json::Value) { + let level = pgx_level_value(finding); + if pgx_source_type(finding) == "Drug Label" { + pgx_level_cell(out, level); + } else { + evidence_level_cell(out, level); + } +} + +fn pgx_genotype_cell(out: &mut String, finding: &serde_json::Value) { + let value = finding + .get("matched_observation") + .and_then(|observation| observation.get("genotype_display")) + .and_then(serde_json::Value::as_str) + .or_else(|| { + finding + .get("matched_effect") + .and_then(|effect| effect.get("label")) + .and_then(serde_json::Value::as_str) + }) + .unwrap_or_default(); + let alt = finding + .get("matched_observation") + .and_then(|observation| observation.get("alt")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if alt.is_empty() { + class_cell(out, value, "mono"); + } else { let _ = write!( out, - "" + "{}", + highlight_allele(value, alt) ); } - out.push_str(""); - out.push_str("i
"); +} + +fn finding_participant(finding: &serde_json::Value) -> String { + finding + .get("matched_observation") + .or_else(|| finding.get("matched_analysis")) + .and_then(|value| value.get("participant_id")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + .to_owned() +} + +fn pgx_outcome_filter_slug(finding: &serde_json::Value) -> &'static str { + match finding + .get("matched_observation") + .and_then(|observation| observation.get("outcome")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "variant" | "observed_alt" | "unknown_alt" => "variant", + "reference" => "reference", + _ => "missing", + } +} + +fn finding_rsid(finding: &serde_json::Value) -> String { + finding + .get("matched_observation") + .and_then(|observation| observation.get("rsid")) + .and_then(serde_json::Value::as_str) + .or_else(|| finding.get("rsid").and_then(serde_json::Value::as_str)) + .unwrap_or_default() + .to_owned() +} + +fn finding_gene(finding: &serde_json::Value) -> String { + finding + .get("matched_observation") + .and_then(|observation| observation.get("gene")) + .and_then(serde_json::Value::as_str) + .map(ToOwned::to_owned) + .or_else(|| { + let genes = join_string_array(finding.get("genes")); + if genes.is_empty() { + None + } else { + Some(genes) + } + }) + .unwrap_or_default() } fn matched_ref_alt(finding: &serde_json::Value) -> String { @@ -170,16 +379,14 @@ fn evidence_level_color_group(level: &str) -> String { } fn evidence_level_cell(out: &mut String, level: &str) { - if level.is_empty() { - out.push_str(""); - return; - } - let group = evidence_level_color_group(level); + let display = if level.is_empty() { "Unknown" } else { level }; + let group = evidence_level_color_group(display); let _ = write!( out, - "{}", + "{}", + evidence_level_sort_rank(display), html_escape(&group), - html_escape(level) + html_escape(display) ); } @@ -203,16 +410,39 @@ fn pgx_level_slug(level: &str) -> String { } fn pgx_level_cell(out: &mut String, level: 
&str) { - if level.is_empty() { - out.push_str(""); - return; - } - let slug = pgx_level_slug(level); + let display = if level.is_empty() { "Unknown" } else { level }; + let slug = pgx_level_slug(display); let _ = write!( out, - "{}", + "{}", + pgx_level_sort_rank(display), html_escape(&slug), - html_escape(level) + html_escape(display) ); } +fn pgx_level_sort_rank(level: &str) -> u8 { + match pgx_level_slug(level).as_str() { + "required" => 1, + "recommended" => 2, + "actionable" => 3, + "informative" => 4, + "no-clinical" => 5, + "criteria" => 6, + _ => 7, + } +} + +fn evidence_level_sort_rank(level: &str) -> u8 { + match evidence_level_group(level).as_str() { + "1a" => 11, + "1b" => 12, + "1" => 13, + "2a" => 21, + "2b" => 22, + "2" => 23, + "3" => 30, + "4" => 40, + _ => 99, + } +} diff --git a/rust/bioscript-cli/src/report_html_provenance.rs b/rust/bioscript-cli/src/report_html_provenance.rs new file mode 100644 index 0000000..36a664d --- /dev/null +++ b/rust/bioscript-cli/src/report_html_provenance.rs @@ -0,0 +1,84 @@ +fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { + let mut links = BTreeMap::::new(); + for report in reports { + collect_provenance_links_from_value(report, &mut links); + } + if links.is_empty() { + out.push_str("

No provenance links.

"); + return; + } + out.push_str("
    "); + for (domain, domain_links) in group_provenance_links_by_domain(links) { + let _ = write!( + out, + "
  • {} ({} links)
      ", + html_escape(&domain), + domain_links.len() + ); + for (url, label) in domain_links { + let display = if label.is_empty() { &url } else { &label }; + let _ = write!( + out, + "
    • {}
      {}
    • ", + html_escape(&url), + html_escape(display), + html_escape(&url) + ); + } + out.push_str("
  • "); + } + out.push_str("
"); +} + +fn group_provenance_links_by_domain( + links: BTreeMap, +) -> BTreeMap> { + let mut grouped = BTreeMap::>::new(); + for (url, label) in links { + grouped + .entry(domain_from_url(&url).unwrap_or_else(|| "other".to_owned())) + .or_default() + .insert(url, label); + } + grouped +} + +fn domain_from_url(url: &str) -> Option { + let without_scheme = url.split_once("://")?.1; + let host = without_scheme.split(['/', '?', '#']).next()?.trim(); + if host.is_empty() { + None + } else { + Some(host.to_ascii_lowercase()) + } +} +fn collect_provenance_links_from_value( + value: &serde_json::Value, + links: &mut BTreeMap, +) { + match value { + serde_json::Value::Object(object) => { + if let Some(url) = object.get("url").and_then(serde_json::Value::as_str) + && url.starts_with("http") + { + let label = object + .get("name") + .or_else(|| object.get("label")) + .or_else(|| object.get("source")) + .and_then(value_as_string) + .unwrap_or_default(); + links.entry(url.to_owned()).or_insert(label); + } + for child in object.values() { + collect_provenance_links_from_value(child, links); + } + } + serde_json::Value::Array(items) => { + for item in items { + collect_provenance_links_from_value(item, links); + } + } + _ => {} + } +} + diff --git a/rust/bioscript-cli/src/report_html_sections.rs b/rust/bioscript-cli/src/report_html_sections.rs index 61ce304..2f4e5a9 100644 --- a/rust/bioscript-cli/src/report_html_sections.rs +++ b/rust/bioscript-cli/src/report_html_sections.rs @@ -17,252 +17,287 @@ fn collect_report_findings(reports: &[serde_json::Value], schema: &str) -> VecNo analysis outputs.

"); - return; - } - for (index, analysis) in analyses.iter().enumerate() { - let table_id = format!("analysis-table-{index}"); - let title = format!( - "{} / {}", - value_str(analysis, "participant_id"), - value_str(analysis, "analysis_id") - ); - let _ = write!(out, "

{}

", html_escape(&title)); - render_analysis_logic(out, analysis); - let rows = analysis - .get("rows") - .and_then(serde_json::Value::as_array) - .cloned() - .unwrap_or_default(); - if rows.is_empty() { - out.push_str("

No rows emitted.

"); - continue; - } - let headers = analysis_row_headers(&rows); - let header_refs = headers.iter().map(String::as_str).collect::>(); - render_table_start(out, &table_id, &header_refs); - for row in rows { - out.push_str(""); - for header in &headers { - table_cell(out, &json_field_as_tsv(row.get(header))); - } - out.push_str(""); +fn collect_report_participants(reports: &[serde_json::Value]) -> Vec { + let mut participants = Vec::new(); + for report in reports { + let participant = value_str(report, "participant_id"); + if !participant.is_empty() && !participants.iter().any(|item| item == participant) { + participants.push(participant.to_owned()); } - render_table_end(out); } + participants } -fn analysis_row_headers(rows: &[serde_json::Value]) -> Vec { - let mut headers = Vec::new(); - for row in rows { - let Some(object) = row.as_object() else { - continue; - }; - for key in object.keys() { - if !headers.contains(key) { - headers.push(key.clone()); +fn render_report_manifest_header(out: &mut String, reports: &[serde_json::Value]) { + let manifest = reports + .first() + .and_then(|report| report.get("manifest")) + .unwrap_or(&serde_json::Value::Null); + let title = value_str(manifest, "label"); + let fallback = value_str(manifest, "name"); + let title = if title.is_empty() { fallback } else { title }; + let title = if title.is_empty() { + "BioScript Report" + } else { + title + }; + let _ = write!(out, "

{}

", html_escape(title)); + out.push_str("

Disclaimer: This is not medical or clinical advice, only for research purposes. Always consult a licensed professional to interpret medical information.

This report was generated offline on your system.

For more information see https://app.biovault.net

"); +} + +fn render_report_source_section(out: &mut String, reports: &[serde_json::Value]) { + let manifest = reports + .first() + .and_then(|report| report.get("manifest")) + .unwrap_or(&serde_json::Value::Null); + out.push_str("
"); + report_manifest_kv(out, "Schema", value_str(manifest, "schema")); + report_manifest_kv(out, "Version", value_str(manifest, "version")); + report_manifest_kv(out, "Name", value_str(manifest, "name")); + report_manifest_kv(out, "Label", value_str(manifest, "label")); + report_manifest_kv(out, "Tags", &manifest_tags(manifest)); + report_manifest_kv(out, "Members", &manifest_member_summary(manifest)); + out.push_str("
"); + render_manifest_members(out, manifest); +} + +fn report_manifest_kv(out: &mut String, key: &str, value: &str) { + if value.is_empty() { + return; + } + let _ = write!( + out, + "
{}
{}
", + html_escape(key), + html_escape(value) + ); +} + +fn manifest_tags(manifest: &serde_json::Value) -> String { + manifest + .get("tags") + .and_then(serde_json::Value::as_array) + .map(|items| { + items + .iter() + .filter_map(serde_json::Value::as_str) + .collect::>() + .join(", ") + }) + .unwrap_or_default() +} + +fn manifest_member_summary(manifest: &serde_json::Value) -> String { + let Some(members) = manifest.get("members").and_then(serde_json::Value::as_array) else { + return String::new(); + }; + let preview = members + .iter() + .take(5) + .filter_map(|member| { + let kind = value_str(member, "kind"); + let path = value_str(member, "path"); + if kind.is_empty() && path.is_empty() { + None + } else if path.is_empty() { + Some(kind.to_owned()) + } else { + Some(format!("{kind}: {path}")) } - } + }) + .collect::>(); + let remaining = members.len().saturating_sub(preview.len()); + if remaining == 0 { + preview.join("; ") + } else { + format!("{}; +{} more", preview.join("; "), remaining) } - headers } -fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { - let Some(logic) = analysis.get("logic") else { +fn render_manifest_members(out: &mut String, manifest: &serde_json::Value) { + let Some(members) = manifest.get("members").and_then(serde_json::Value::as_array) else { return; }; - if logic.is_null() { + if members.is_empty() { return; } - let description = logic - .get("description") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let source = logic.get("source").unwrap_or(&serde_json::Value::Null); - let source_name = source - .get("name") - .and_then(serde_json::Value::as_str) - .unwrap_or("source"); - let source_url = source - .get("url") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - out.push_str("
"); - if !description.is_empty() { - let _ = write!(out, "

{}

", html_escape(description)); + out.push_str("
Show panel members
"); + for member in members { + out.push_str(""); + table_cell(out, value_str(member, "kind")); + table_cell(out, value_str(member, "path")); + table_cell(out, value_str(member, "version")); + out.push_str(""); + } + out.push_str("
KindPathVersion
"); +} + +fn render_participant_filter(out: &mut String, participants: &[String]) { + if participants.len() <= 1 { + return; } - if !source_url.is_empty() { + out.push_str("
"); } -fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { - let mut links = BTreeMap::::new(); - for report in reports { - collect_provenance_links_from_value(report, &mut links); +fn render_input_debug(out: &mut String, reports: &[serde_json::Value], show_participant_id: bool) { + if reports.is_empty() { + out.push_str("

No input metadata.

"); + return; } - if links.is_empty() { - out.push_str("

No provenance links.

"); + if reports.len() == 1 { + render_input_debug_key_values(out, &reports[0]); return; } - out.push_str("
    "); - for (url, label) in links { - let display = if label.is_empty() { &url } else { &label }; + out.push_str("
    "); + let mut headers = Vec::new(); + if show_participant_id { + headers.push("Participant"); + } + headers.extend([ + "File", + "Format", + "Source", + "Assembly", + "Inferred Sex", + "Evidence", + ]); + for (idx, header) in headers.iter().enumerate() { let _ = write!( out, - "
  • {}
    {}
  • ", - html_escape(&url), - html_escape(display), - html_escape(&url) + "", + idx, + html_escape(header) ); } - out.push_str(""); -} - -fn collect_provenance_links_from_value( - value: &serde_json::Value, - links: &mut BTreeMap, -) { - match value { - serde_json::Value::Object(object) => { - if let Some(url) = object.get("url").and_then(serde_json::Value::as_str) - && url.starts_with("http") - { - let label = object - .get("name") - .or_else(|| object.get("label")) - .or_else(|| object.get("source")) - .and_then(value_as_string) - .unwrap_or_default(); - links.entry(url.to_owned()).or_insert(label); - } - for child in object.values() { - collect_provenance_links_from_value(child, links); - } - } - serde_json::Value::Array(items) => { - for item in items { - collect_provenance_links_from_value(item, links); - } - } - _ => {} - } -} - -fn render_observation_table(out: &mut String, observations: &[serde_json::Value]) { - let headers = [ - "participant_id", - "rsid", - "ref", - "alt", - "genotype_display", - "genotype", - "zygosity", - "outcome", - "match_status", - "coverage_status", - "call_status", - "assembly", - "chrom", - "pos_start", - "pos_end", - "kind", - "ref_count", - "alt_count", - "depth", - "genotype_quality", - "allele_balance", - "evidence_type", - "evidence_raw", - "facets", - "assay_id", - "assay_version", - "variant_key", - ]; - render_table_start(out, "observations-table", &headers); - for observation in observations { - let _ = write!(out, "", observation_row_class(observation)); - for header in headers { - render_observation_cell(out, observation, header); + out.push_str(""); + for report in reports { + let participant = value_str(report, "participant_id"); + let input = report.get("input").unwrap_or(&serde_json::Value::Null); + let debug = input.get("debug").unwrap_or(&serde_json::Value::Null); + let source = debug.get("source").unwrap_or(&serde_json::Value::Null); + let sex = debug.get("inferred_sex").unwrap_or(&serde_json::Value::Null); + 
let _ = write!( + out, + "", + html_escape(participant) + ); + if show_participant_id { + table_cell(out, participant); } + table_cell(out, value_str(input, "file_name")); + table_cell( + out, + &compact_join(&[ + value_str(debug, "format"), + value_str(debug, "format_confidence"), + ]), + ); + table_cell( + out, + &compact_join(&[ + value_str(source, "vendor"), + value_str(source, "platform_version"), + ]), + ); + table_cell(out, value_str(debug, "assembly")); + table_cell( + out, + &compact_join(&[ + value_str(sex, "sex"), + value_str(sex, "confidence"), + value_str(sex, "method"), + ]), + ); + table_cell(out, &input_debug_evidence(debug)); out.push_str(""); } out.push_str("
    {}
    "); } -fn observation_row_class(observation: &serde_json::Value) -> &'static str { - match observation - .get("outcome") - .and_then(serde_json::Value::as_str) - .unwrap_or_default() - { - "variant" => "row-variant", - "reference" => "row-reference", - _ => "", - } +fn render_input_debug_key_values(out: &mut String, report: &serde_json::Value) { + let input = report.get("input").unwrap_or(&serde_json::Value::Null); + let debug = input.get("debug").unwrap_or(&serde_json::Value::Null); + let source = debug.get("source").unwrap_or(&serde_json::Value::Null); + let sex = debug.get("inferred_sex").unwrap_or(&serde_json::Value::Null); + out.push_str("
    "); + input_debug_kv(out, "File", value_str(input, "file_name")); + input_debug_kv( + out, + "Format", + &compact_join(&[ + value_str(debug, "format"), + value_str(debug, "format_confidence"), + ]), + ); + input_debug_kv( + out, + "Source", + &compact_join(&[ + value_str(source, "vendor"), + value_str(source, "platform_version"), + ]), + ); + input_debug_kv(out, "Assembly", value_str(debug, "assembly")); + input_debug_kv( + out, + "Inferred sex", + &compact_join(&[ + value_str(sex, "sex"), + value_str(sex, "confidence"), + value_str(sex, "method"), + ]), + ); + input_debug_kv(out, "Evidence", &input_debug_evidence(debug)); + out.push_str("
    "); } -fn render_observation_cell(out: &mut String, observation: &serde_json::Value, header: &str) { - if header == "genotype_display" { - let outcome = observation - .get("outcome") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let value = json_field_as_tsv(observation.get(header)); - if outcome == "variant" { - let alt = observation - .get("alt") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let _ = write!( - out, - "{}", - highlight_allele(&value, alt) - ); - return; - } - } +fn input_debug_kv(out: &mut String, key: &str, value: &str) { let _ = write!( out, - "{}", - html_escape(&json_field_as_tsv(observation.get(header))) + "
    {}
    {}
    ", + html_escape(key), + html_escape(value) ); } -fn highlight_allele(value: &str, allele: &str) -> String { - if value.is_empty() || allele.is_empty() { - return html_escape(value); +fn compact_join(values: &[&str]) -> String { + values + .iter() + .filter(|value| !value.is_empty()) + .copied() + .collect::>() + .join(" / ") +} + +fn input_debug_evidence(debug: &serde_json::Value) -> String { + let mut evidence = Vec::new(); + collect_string_array(debug.get("evidence"), &mut evidence); + collect_string_array(debug.get("warnings"), &mut evidence); + if let Some(source) = debug.get("source") { + collect_string_array(source.get("evidence"), &mut evidence); } - if allele.chars().count() == 1 { - let target = allele - .chars() - .next() - .unwrap_or_default() - .to_ascii_uppercase(); - let mut out = String::new(); - for ch in value.chars() { - let escaped = html_escape(&ch.to_string()); - if ch.to_ascii_uppercase() == target { - let _ = write!(out, "{escaped}"); - } else { - out.push_str(&escaped); + if let Some(sex) = debug.get("inferred_sex") { + collect_string_array(sex.get("evidence"), &mut evidence); + } + evidence.join(" | ") +} + +fn collect_string_array(value: Option<&serde_json::Value>, out: &mut Vec) { + if let Some(items) = value.and_then(serde_json::Value::as_array) { + for item in items.iter().filter_map(serde_json::Value::as_str) { + if !item.is_empty() && !out.iter().any(|existing| existing == item) { + out.push(item.to_owned()); } } - return out; } - let escaped_value = html_escape(value); - let escaped_allele = html_escape(allele); - escaped_value.replace( - &escaped_allele, - &format!("{escaped_allele}"), - ) } diff --git a/rust/bioscript-cli/src/report_matching.rs b/rust/bioscript-cli/src/report_matching.rs index 76f4f93..6695d52 100644 --- a/rust/bioscript-cli/src/report_matching.rs +++ b/rust/bioscript-cli/src/report_matching.rs @@ -67,6 +67,7 @@ fn app_variant_binding_match_observation<'a>( return observations .iter() .filter(|observation| 
!app_variant_ref_mismatch(binding, observation)) + .filter(|observation| app_binding_chromosome_count_matches(binding, observation)) .find(|observation| { let dosage = app_observation_allele_dosage(observation, allele); app_binding_matches_dosage(dosage, binding) @@ -80,9 +81,26 @@ fn app_variant_binding_match_observation<'a>( if key.is_empty() { return None; } + if key == "alt" { + let expected_alleles = app_binding_expected_values(binding); + return observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .filter(|observation| app_binding_chromosome_count_matches(binding, observation)) + .find(|observation| { + app_binding_matches_value(observation.get(key), binding) + && expected_alleles + .iter() + .any(|expected| { + app_observation_allele_dosage(observation, expected) + .is_some_and(|dosage| dosage > 0) + }) + }); + } observations .iter() .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .filter(|observation| app_binding_chromosome_count_matches(binding, observation)) .find(|observation| app_binding_matches_value(observation.get(key), binding)) } @@ -90,6 +108,7 @@ fn app_finding_observation_context(observation: &serde_json::Value) -> serde_jso serde_json::json!({ "participant_id": observation.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), "rsid": observation.get("rsid").cloned().unwrap_or(serde_json::Value::Null), + "gene": observation.get("gene").cloned().unwrap_or(serde_json::Value::Null), "ref": observation.get("ref").cloned().unwrap_or(serde_json::Value::Null), "alt": observation.get("alt").cloned().unwrap_or(serde_json::Value::Null), "genotype_display": observation.get("genotype_display").cloned().unwrap_or(serde_json::Value::Null), @@ -147,15 +166,15 @@ fn app_observation_allele_dosage(observation: &serde_json::Value, allele: &str) if allele == ref_allele { return match zygosity { "hom_ref" => Some(2), - "het" => Some(1), - "hom_alt" => Some(0), + "hem_ref" | "het" => 
Some(1), + "hom_alt" | "hem_alt" => Some(0), _ => None, }; } if allele == alt_allele { return match zygosity { - "hom_ref" => Some(0), - "het" => Some(1), + "hom_ref" | "hem_ref" => Some(0), + "het" | "hem_alt" => Some(1), "hom_alt" => Some(2), _ => None, }; @@ -176,6 +195,31 @@ fn app_observation_allele_dosage(observation: &serde_json::Value, allele: &str) None } +fn app_binding_chromosome_count_matches( + binding: &serde_json::Value, + observation: &serde_json::Value, +) -> bool { + let Some(expected) = binding + .get("chromosome_count") + .and_then(serde_json::Value::as_i64) + else { + return true; + }; + app_observation_chromosome_count(observation).is_some_and(|actual| actual == expected) +} + +fn app_observation_chromosome_count(observation: &serde_json::Value) -> Option { + match observation + .get("zygosity") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "hem_ref" | "hem_alt" => Some(1), + "hom_ref" | "het" | "hom_alt" => Some(2), + _ => None, + } +} + fn app_binding_matches_value( actual: Option<&serde_json::Value>, binding: &serde_json::Value, @@ -203,6 +247,17 @@ fn app_binding_matches_value( } } +fn app_binding_expected_values(binding: &serde_json::Value) -> Vec { + let mut values = Vec::new(); + if let Some(value) = binding.get("value").and_then(value_as_string) { + values.push(value.clone()); + } + if let Some(array) = binding.get("values").and_then(serde_json::Value::as_array) { + values.extend(array.iter().filter_map(value_as_string)); + } + values +} + fn app_binding_matches_dosage(dosage: Option, binding: &serde_json::Value) -> bool { let Some(dosage) = dosage else { return false; @@ -290,3 +345,122 @@ fn app_finding_dedupe_key(finding: &serde_json::Value) -> String { ) } +#[cfg(test)] +mod report_matching_tests { + use super::*; + + #[test] + fn alt_binding_requires_observed_allele_dosage() { + let binding = serde_json::json!({ + "source": "variant", + "key": "alt", + "value": "G" + }); + let observations = vec![ + 
serde_json::json!({ + "variant_path": "rs1.yaml", + "ref": "A", + "alt": "G", + "genotype_display": "AA", + "zygosity": "hom_ref" + }), + serde_json::json!({ + "variant_path": "rs2.yaml", + "ref": "A", + "alt": "G", + "genotype_display": "AG", + "zygosity": "het" + }), + ]; + + let matched = app_variant_binding_match_observation(&binding, &observations) + .expect("het alt observation should match"); + assert_eq!( + matched + .get("genotype_display") + .and_then(serde_json::Value::as_str), + Some("AG") + ); + } + + #[test] + fn alt_in_binding_requires_observed_allele_dosage() { + let binding = serde_json::json!({ + "source": "variant", + "key": "alt", + "operator": "in", + "values": ["G", "T"] + }); + let observations = vec![serde_json::json!({ + "variant_path": "rs1.yaml", + "ref": "A", + "alt": "T", + "genotype_display": "AT", + "zygosity": "het" + })]; + + assert!(app_variant_binding_match_observation(&binding, &observations).is_some()); + } + + #[test] + fn hemizygous_observations_count_as_single_allele_dosage() { + let observations = vec![serde_json::json!({ + "variant_path": "rs3813929.yaml", + "ref": "C", + "alt": "T", + "genotype": "1", + "genotype_display": "T", + "zygosity": "hem_alt" + })]; + + let include_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "key": "alt", + "value": "T" + }); + assert!(app_variant_binding_match_observation(&include_binding, &observations).is_some()); + + let effect_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "allele": "T", + "operator": "dosage_equals", + "value": 1, + "chromosome_count": 1 + }); + assert!(app_variant_binding_match_observation(&effect_binding, &observations).is_some()); + } + + #[test] + fn chromosome_count_binding_separates_one_x_and_two_x_rows() { + let observations = vec![serde_json::json!({ + "variant_path": "rs3813929.yaml", + "ref": "C", + "alt": "T", + "genotype": "0/1", + "genotype_display": "CT", + "zygosity": "het" + })]; 
+ + let one_x_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "allele": "T", + "operator": "dosage_equals", + "value": 1, + "chromosome_count": 1 + }); + assert!(app_variant_binding_match_observation(&one_x_binding, &observations).is_none()); + + let two_x_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "allele": "T", + "operator": "dosage_equals", + "value": 1, + "chromosome_count": 2 + }); + assert!(app_variant_binding_match_observation(&two_x_binding, &observations).is_some()); + } +} diff --git a/rust/bioscript-cli/src/report_observations.rs b/rust/bioscript-cli/src/report_observations.rs index 73ddd46..dd155fa 100644 --- a/rust/bioscript-cli/src/report_observations.rs +++ b/rust/bioscript-cli/src/report_observations.rs @@ -2,6 +2,8 @@ fn app_observation_from_manifest_row( runtime_root: &Path, row: &BTreeMap, assay_id: &str, + inferred_sex: Option<&SexInference>, + fallback_assembly: Option, ) -> Result { let row_path = row.get("path").cloned().unwrap_or_default(); let manifest_path = if Path::new(&row_path).is_absolute() { @@ -10,13 +12,16 @@ fn app_observation_from_manifest_row( runtime_root.join(&row_path) }; let manifest = load_variant_manifest(&manifest_path)?; + let gene = variant_manifest_gene(&manifest_path)?; let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); - let genotype_display = row.get("genotype").cloned().unwrap_or_default(); - let alt_alleles = variant_alt_alleles(&manifest_path)?; - let alt_allele = observed_alt_allele(&genotype_display, &ref_allele, &alt_alleles) - .or_else(|| manifest.spec.alternate.clone()) + let reportable_alt = manifest.spec.alternate.clone().unwrap_or_default(); + let observed_alt_alleles = variant_observed_alt_alleles(&manifest_path)?; + let genotype_display = row + .get("genotype") + .filter(|value| !value.is_empty()) + .cloned() + .or_else(|| genotype_display_from_raw_counts(row.get("raw_counts")?)) .unwrap_or_default(); - 
let (genotype, zygosity) = normalize_app_genotype(&genotype_display, &ref_allele, &alt_allele); let depth = parse_optional_u32(row.get("depth")); let ref_count = parse_optional_u32(row.get("ref_count")); let alt_count = parse_optional_u32(row.get("alt_count")); @@ -26,7 +31,12 @@ fn app_observation_from_manifest_row( } _ => None, }; - let assembly = row.get("assembly").cloned().unwrap_or_default(); + let assembly = row + .get("assembly") + .filter(|value| !value.is_empty()) + .cloned() + .or_else(|| fallback_assembly.map(assembly_row_value)) + .unwrap_or_default(); let locus = if assembly.eq_ignore_ascii_case("grch37") { manifest.spec.grch37.as_ref() } else { @@ -36,16 +46,25 @@ fn app_observation_from_manifest_row( .as_ref() .or(manifest.spec.grch37.as_ref()) }; - let outcome = if genotype == "./." { - "no_call" - } else if zygosity == "hom_ref" { - "reference" - } else if zygosity == "het" || zygosity == "hom_alt" { - "variant" - } else { - "unknown" - }; - let evidence_raw = row.get("evidence").cloned().unwrap_or_default(); + let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); + let (genotype, zygosity) = normalize_app_genotype( + &genotype_display, + &ref_allele, + &reportable_alt, + &chrom, + inferred_sex, + ); + let non_reportable_status = + classify_non_reportable_alleles(&genotype_display, &ref_allele, &reportable_alt, &observed_alt_alleles); + let call = observation_call_values( + depth, + non_reportable_status, + &genotype, + &zygosity, + &genotype_display, + ); + let evidence_raw = observation_evidence_raw(row, &chrom, inferred_sex); + let source = variant_primary_source(&manifest_path)?; Ok(serde_json::json!({ "participant_id": row.get("participant_id").cloned().unwrap_or_default(), "assay_id": assay_id, @@ -53,32 +72,143 @@ fn app_observation_from_manifest_row( "variant_key": manifest.name, "variant_path": row_path, "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| 
manifest.spec.rsids.first().cloned()), + "gene": gene, "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, - "chrom": locus.map_or(String::new(), |locus| locus.chrom.clone()), + "chrom": chrom, "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), "ref": ref_allele, - "alt": alt_allele, + "alt": reportable_alt, "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), - "call_status": if genotype == "./." { "no_call" } else { "called" }, + "call_status": call.status, "genotype": genotype, - "genotype_display": genotype_display, + "genotype_display": call.reported_genotype_display, "zygosity": zygosity, "ref_count": ref_count, "alt_count": alt_count, "depth": depth, "genotype_quality": serde_json::Value::Null, "allele_balance": allele_balance, - "outcome": outcome, + "outcome": call.outcome, "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, "evidence_raw": evidence_raw, - "facets": serde_json::Value::Null, + "source": source, + "facets": observation_facets(non_reportable_status, &observed_alt_alleles), })) } -fn variant_alt_alleles(path: &Path) -> Result, String> { +struct ObservationCallValues { + outcome: &'static str, + status: &'static str, + reported_genotype_display: String, +} + +fn observation_call_values( + depth: Option, + non_reportable_status: Option<&'static str>, + genotype: &str, + zygosity: &str, + genotype_display: &str, +) -> ObservationCallValues { + let outcome = if depth == Some(0) { + 
"not_covered" + } else if non_reportable_status == Some("observed_alt") { + "observed_alt" + } else if non_reportable_status == Some("unknown_alt") { + "unknown_alt" + } else if genotype == "./." { + "no_call" + } else if zygosity == "hom_ref" || zygosity == "hem_ref" { + "reference" + } else if zygosity == "het" || zygosity == "hom_alt" || zygosity == "hem_alt" { + "variant" + } else { + "unknown" + }; + let status = if matches!(outcome, "observed_alt" | "unknown_alt") { + outcome + } else if genotype == "./." { + "no_call" + } else { + "called" + }; + let reported_genotype_display = if matches!(zygosity, "hem_ref" | "hem_alt") { + hemizygous_display_genotype(genotype_display) + } else if genotype_display.is_empty() && matches!(outcome, "no_call" | "not_covered") { + "??".to_owned() + } else { + genotype_display.to_owned() + }; + ObservationCallValues { + outcome, + status, + reported_genotype_display, + } +} + +fn assembly_row_value(assembly: bioscript_core::Assembly) -> String { + match assembly { + bioscript_core::Assembly::Grch37 => "grch37".to_owned(), + bioscript_core::Assembly::Grch38 => "grch38".to_owned(), + } +} + +fn hemizygous_display_genotype(display: &str) -> String { + display + .chars() + .find(char::is_ascii_alphabetic) + .map_or_else(|| display.to_owned(), |allele| allele.to_string()) +} + +fn variant_primary_source(path: &Path) -> Result { + let value = load_yaml_value(path)?; + let mut links = BTreeMap::::new(); + collect_manifest_provenance_entries(&value, &mut links)?; + if let Some(source) = links + .values() + .find(|source| source_url_contains(source, "ncbi.nlm.nih.gov/snp/rs")) + { + return Ok(source.clone()); + } + if let Some(rsid) = value + .get("identifiers") + .and_then(|identifiers| identifiers.get("rsids")) + .and_then(serde_yaml::Value::as_sequence) + .and_then(|items| items.iter().find_map(serde_yaml::Value::as_str)) + { + return Ok(serde_json::json!({ + "kind": "database", + "label": "dbSNP / NCBI SNP", + "url": 
format!("https://www.ncbi.nlm.nih.gov/snp/{rsid}"), + "fields": ["identifiers.rsids"], + })); + } + Ok(links.into_values().next().unwrap_or(serde_json::Value::Null)) +} + +fn source_url_contains(source: &serde_json::Value, needle: &str) -> bool { + source + .get("url") + .and_then(serde_json::Value::as_str) + .is_some_and(|url| url.contains(needle)) +} + +fn variant_manifest_gene(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read variant YAML {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse variant YAML {}: {err}", path.display()))?; + Ok(value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("gene".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .unwrap_or_default() + .to_owned()) +} + +fn variant_observed_alt_alleles(path: &Path) -> Result, String> { let text = fs::read_to_string(path) .map_err(|err| format!("failed to read variant YAML {}: {err}", path.display()))?; let value: serde_yaml::Value = serde_yaml::from_str(&text) @@ -90,7 +220,6 @@ fn variant_alt_alleles(path: &Path) -> Result, String> { .and_then(|mapping| { mapping .get(serde_yaml::Value::String("observed_alts".to_owned())) - .or_else(|| mapping.get(serde_yaml::Value::String("alts".to_owned()))) }) .and_then(serde_yaml::Value::as_sequence) else { @@ -103,35 +232,45 @@ fn variant_alt_alleles(path: &Path) -> Result, String> { .collect()) } -fn observed_alt_allele( - genotype_display: &str, +fn normalize_app_genotype( + display: &str, ref_allele: &str, - alts: &[String], -) -> Option { - if ref_allele.len() != 1 { - return None; - } - let ref_ch = ref_allele.chars().next()?; - genotype_display - .chars() - .filter(|ch| ch.is_ascii_alphabetic() && *ch != ref_ch) - .find_map(|ch| { - alts.iter() - .find(|alt| alt.len() == 1 && alt.starts_with(ch)) - .cloned() - }) -} - -fn normalize_app_genotype(display: &str, ref_allele: &str, 
alt_allele: &str) -> (String, String) { + alt_allele: &str, + chrom: &str, + inferred_sex: Option<&SexInference>, +) -> (String, String) { if display.is_empty() { return ("./.".to_owned(), "unknown".to_owned()); } let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); - if alleles.len() != 2 || ref_allele.len() != 1 || alt_allele.len() != 1 { + if ref_allele.len() != 1 || alt_allele.len() != 1 { return (display.to_owned(), "unknown".to_owned()); } let ref_ch = ref_allele.chars().next().unwrap_or_default(); let alt_ch = alt_allele.chars().next().unwrap_or_default(); + if alleles.len() == 1 && is_haploid_sex_chromosome(chrom) { + let allele = alleles[0]; + if allele == ref_ch { + return ("0".to_owned(), "hem_ref".to_owned()); + } + if allele == alt_ch { + return ("1".to_owned(), "hem_alt".to_owned()); + } + return (display.to_owned(), "unknown".to_owned()); + } + if alleles.len() != 2 { + return (display.to_owned(), "unknown".to_owned()); + } + if is_confident_male_sex_chromosome(chrom, inferred_sex) && alleles[0] == alleles[1] { + let allele = alleles[0]; + if allele == ref_ch { + return ("0".to_owned(), "hem_ref".to_owned()); + } + if allele == alt_ch { + return ("1".to_owned(), "hem_alt".to_owned()); + } + return (display.to_owned(), "unknown".to_owned()); + } let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); match (ref_count, alt_count) { @@ -142,250 +281,155 @@ fn normalize_app_genotype(display: &str, ref_allele: &str, alt_allele: &str) -> } } -fn parse_optional_u32(value: Option<&String>) -> Option { - value.and_then(|value| value.parse::().ok()) +fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { + is_haploid_sex_chromosome(chrom) + && inferred_sex.is_some_and(|sex| { + sex.sex == InferredSex::Male + && matches!( + sex.confidence, + SexDetectionConfidence::High | 
SexDetectionConfidence::Medium + ) + }) } -fn load_manifest_findings( - root: &Path, - manifest_path: &Path, -) -> Result, String> { - let value = load_yaml_value(manifest_path)?; - let schema = value - .get("schema") - .and_then(serde_yaml::Value::as_str) - .unwrap_or_default(); - let mut findings = Vec::new(); +fn is_haploid_sex_chromosome(chrom: &str) -> bool { + matches!( + chrom + .trim() + .trim_start_matches("chr") + .trim_start_matches("CHR") + .to_ascii_uppercase() + .as_str(), + "X" | "Y" | "23" | "24" + ) +} - if matches!( - schema, - "bioscript:variant:1.0" - | "bioscript:variant" - | "bioscript:assay:1.0" - | "bioscript:panel:1.0" - | "bioscript:pgx-findings:1.0" - ) && let Some(items) = value - .get("findings") - .and_then(serde_yaml::Value::as_sequence) - { - for item in items { - let json_item = yaml_to_json(item.clone())?; - let include = json_item - .get("include") - .and_then(serde_json::Value::as_str) - .map(str::to_owned); - if let Some(include) = include { - let include_path = resolve_manifest_path(root, manifest_path, &include)?; - let mut included = load_manifest_findings(root, &include_path)?; - let inherited_binding = json_item.get("binding").cloned(); - for included_item in &mut included { - if inherited_binding.is_some() - && included_item.get("binding").is_none() - && included_item.get("effects").is_none() - && let Some(object) = included_item.as_object_mut() - { - object.insert( - "binding".to_owned(), - inherited_binding.clone().unwrap_or(serde_json::Value::Null), - ); - } - } - findings.extend(included); - continue; - } - if json_item.get("include").is_none() { - findings.push(json_item); - } - } +fn observation_evidence_raw( + row: &BTreeMap, + chrom: &str, + inferred_sex: Option<&SexInference>, +) -> String { + let mut evidence_raw = row.get("evidence").cloned().unwrap_or_default(); + if !is_haploid_sex_chromosome(chrom) { + return evidence_raw; } - - if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") - && let 
Some(items) = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - { - for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { - continue; - }; - if !matches!(kind, "variant" | "assay") { - continue; - } - let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { - continue; - }; - let member_path = resolve_manifest_path(root, manifest_path, path)?; - findings.extend(load_manifest_findings(root, &member_path)?); - } + let Some(inferred_sex) = inferred_sex else { + return evidence_raw; + }; + let sex_evidence = sex_inference_evidence_raw(inferred_sex); + if sex_evidence.is_empty() { + return evidence_raw; } - - Ok(findings) -} - -fn load_yaml_value(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; - serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) + if evidence_raw.is_empty() { + evidence_raw = sex_evidence; + } else { + evidence_raw.push_str(" | "); + evidence_raw.push_str(&sex_evidence); + } + evidence_raw } -fn yaml_to_json(value: serde_yaml::Value) -> Result { - serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) +fn sex_inference_evidence_raw(inferred_sex: &SexInference) -> String { + let sex = match inferred_sex.sex { + InferredSex::Male => "male", + InferredSex::Female => "female", + InferredSex::Unknown => "unknown", + }; + let confidence = match inferred_sex.confidence { + SexDetectionConfidence::High => "high", + SexDetectionConfidence::Medium => "medium", + SexDetectionConfidence::Low => "low", + }; + let mut fields = vec![ + format!("detected_sex={sex}"), + format!("sex_confidence={confidence}"), + format!("sex_method={}", inferred_sex.method), + ]; + fields.extend( + inferred_sex + .evidence + .iter() + .map(|item| format!("sex_{item}")), + ); + fields.join(" ") } -fn 
load_manifest_provenance_links( - root: &Path, - manifest_path: &Path, -) -> Result, String> { - let value = load_yaml_value(manifest_path)?; - let schema = value - .get("schema") - .and_then(serde_yaml::Value::as_str) - .unwrap_or_default(); - let mut links = BTreeMap::::new(); - collect_manifest_provenance_entries(&value, &mut links)?; - - if matches!( - schema, - "bioscript:variant:1.0" - | "bioscript:variant" - | "bioscript:assay:1.0" - | "bioscript:panel:1.0" - | "bioscript:pgx-findings:1.0" - ) && let Some(items) = value - .get("findings") - .and_then(serde_yaml::Value::as_sequence) - { - for item in items { - let json_item = yaml_to_json(item.clone())?; - let Some(include) = json_item.get("include").and_then(serde_json::Value::as_str) else { - continue; - }; - let include_path = resolve_manifest_path(root, manifest_path, include)?; - for item in load_manifest_provenance_links(root, &include_path)? { - if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(item); - } +fn genotype_display_from_raw_counts(raw_counts: &str) -> Option { + let counts: serde_json::Map = serde_json::from_str(raw_counts).ok()?; + let mut items = counts + .into_iter() + .filter_map(|(base, count)| { + let base = base.chars().next()?.to_ascii_uppercase(); + let count = count.as_u64()?; + if matches!(base, 'A' | 'C' | 'G' | 'T') && count > 0 { + Some((base, count)) + } else { + None } - } + }) + .collect::>(); + if items.is_empty() { + return None; } - - if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") - && let Some(items) = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - { - for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { - continue; - }; - if !matches!(kind, "variant" | "assay") { - continue; - } - let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { - continue; - }; - let member_path = 
resolve_manifest_path(root, manifest_path, path)?; - for item in load_manifest_provenance_links(root, &member_path)? { - if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(item); - } - } - } + items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); + let total = items.iter().map(|(_, count)| *count).sum::(); + let (top_base, top_count) = items[0]; + if total == 0 || items.len() == 1 || top_count.saturating_mul(10) >= total.saturating_mul(8) { + return Some(format!("{top_base}{top_base}")); } - - Ok(links.into_values().collect()) + Some(format!("{}{}", top_base, items[1].0)) } -fn collect_manifest_provenance_entries( - value: &serde_yaml::Value, - links: &mut BTreeMap, -) -> Result<(), String> { - if let Some(sources) = value - .get("provenance") - .and_then(|provenance| provenance.get("sources")) - .and_then(serde_yaml::Value::as_sequence) - { - for source in sources { - let json = yaml_to_json(source.clone())?; - if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(json); - } - } +fn classify_non_reportable_alleles( + display: &str, + ref_allele: &str, + reportable_alt: &str, + observed_alts: &[String], +) -> Option<&'static str> { + if display.is_empty() || ref_allele.len() != 1 || reportable_alt.len() != 1 { + return None; } - if let Some(source) = value.get("source") { - let json = yaml_to_json(source.clone())?; - if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(json); - } + let ref_ch = ref_allele.chars().next()?.to_ascii_uppercase(); + let alt_ch = reportable_alt.chars().next()?.to_ascii_uppercase(); + let non_reportable = display + .chars() + .filter(char::is_ascii_alphabetic) + .map(|ch| ch.to_ascii_uppercase()) + .filter(|ch| *ch != ref_ch && *ch != alt_ch) + .collect::>(); + if non_reportable.is_empty() { + return None; + } + if non_reportable.iter().all(|ch| { + 
observed_alts.iter().any(|alt| { + alt.len() == 1 + && alt + .chars() + .next() + .is_some_and(|alt_ch| alt_ch.to_ascii_uppercase() == *ch) + }) + }) { + Some("observed_alt") + } else { + Some("unknown_alt") } - Ok(()) } -fn match_app_findings( - findings: &[serde_json::Value], - observations: &[serde_json::Value], - analyses: &[serde_json::Value], -) -> Vec { - let mut matched = Vec::new(); - let mut seen = std::collections::BTreeSet::new(); - for finding in findings { - if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { - for effect in effects { - if let Some(observation) = app_finding_match_observation(effect, observations) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.remove("effects"); - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_effect".to_owned(), effect.clone()); - object.insert( - "matched_observation".to_owned(), - app_finding_observation_context(observation), - ); - } - let key = app_finding_dedupe_key(&item); - if seen.insert(key) { - matched.push(item); - } - } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.remove("effects"); - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_effect".to_owned(), effect.clone()); - object.insert("matched_analysis".to_owned(), analysis); - } - let key = app_finding_dedupe_key(&item); - if seen.insert(key) { - matched.push(item); - } - } - } - } else if let Some(observation) = app_finding_match_observation(finding, observations) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert( - "matched_observation".to_owned(), - app_finding_observation_context(observation), - ); - } - let key = app_finding_dedupe_key(&item); - if 
seen.insert(key) { - matched.push(item); - } - } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_analysis".to_owned(), analysis); - } - let key = app_finding_dedupe_key(&item); - if seen.insert(key) { - matched.push(item); - } - } +fn observation_facets( + non_reportable_status: Option<&str>, + observed_alts: &[String], +) -> serde_json::Value { + let Some(status) = non_reportable_status else { + return serde_json::Value::Null; + }; + let mut facets = vec![status.to_owned()]; + if status == "observed_alt" && !observed_alts.is_empty() { + facets.push(format!("known_observed_alts={}", observed_alts.join(","))); } - matched + serde_json::Value::String(facets.join(";")) +} + +fn parse_optional_u32(value: Option<&String>) -> Option { + value.and_then(|value| value.parse::().ok()) } diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs index b988a08..ec560d6 100644 --- a/rust/bioscript-cli/src/report_options.rs +++ b/rust/bioscript-cli/src/report_options.rs @@ -12,116 +12,178 @@ struct AppReportOptions { output_dir: PathBuf, root: PathBuf, html: bool, + open_report: bool, observations_format: AppOutputFormat, reports_format: AppOutputFormat, loader: GenotypeLoadOptions, filters: Vec, + analysis_max_duration_ms: u64, + detect_sex: bool, + sample_sex: Option, } -fn run_app_report(args: Vec) -> Result<(), String> { - let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; - let mut manifest_path: Option = None; - let mut input_files: Vec = Vec::new(); - let mut output_dir: Option = None; - let mut root: Option = None; - let mut html = false; - let mut observations_format = AppOutputFormat::Tsv; - let mut reports_format = AppOutputFormat::Jsonl; - let mut filters = Vec::new(); - let mut loader = 
GenotypeLoadOptions::default(); +struct AppReportCliState { + cwd: PathBuf, + manifest_path: Option, + input_files: Vec, + output_dir: Option, + root: Option, + html: bool, + open_report: bool, + observations_format: AppOutputFormat, + reports_format: AppOutputFormat, + loader: GenotypeLoadOptions, + filters: Vec, + analysis_max_duration_ms: u64, + detect_sex: bool, + sample_sex: Option, +} +fn run_app_report(args: Vec) -> Result<(), String> { + let mut state = AppReportCliState::new()?; let mut iter = args.into_iter(); while let Some(arg) = iter.next() { - match arg.as_str() { - "--input-file" => input_files.push(PathBuf::from( - iter.next().ok_or("--input-file requires a path")?, - )), - "--output-dir" => { - output_dir = Some(PathBuf::from( - iter.next().ok_or("--output-dir requires a path")?, - )); + state.consume_arg(&arg, &mut iter)?; + } + generate_app_report(&state.finish()?) +} + +impl AppReportCliState { + fn new() -> Result { + Ok(Self { + cwd: env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?, + manifest_path: None, + input_files: Vec::new(), + output_dir: None, + root: None, + html: false, + open_report: false, + observations_format: AppOutputFormat::Tsv, + reports_format: AppOutputFormat::Jsonl, + loader: GenotypeLoadOptions::default(), + filters: Vec::new(), + analysis_max_duration_ms: 1_000, + detect_sex: false, + sample_sex: None, + }) + } + + fn consume_arg( + &mut self, + arg: &str, + iter: &mut std::vec::IntoIter, + ) -> Result<(), String> { + match arg { + "--input-file" => self.input_files.push(PathBuf::from(next_arg(iter, "--input-file")?)), + "--output-dir" => self.output_dir = Some(PathBuf::from(next_arg(iter, "--output-dir")?)), + "--root" => self.root = Some(PathBuf::from(next_arg(iter, "--root")?)), + "--html" => self.html = true, + "--open" => { + self.html = true; + self.open_report = true; } - "--root" => { - root = Some(PathBuf::from( - iter.next().ok_or("--root requires a directory")?, - )); + "--filter" => 
self.filters.push(next_arg(iter, "--filter")?), + "--detect-sex" => self.detect_sex = true, + "--sample-sex" => { + self.sample_sex = Some(parse_sample_sex(&next_arg(iter, "--sample-sex")?)?); } - "--html" => html = true, - "--filter" => filters.push(iter.next().ok_or("--filter requires key=value")?), "--observations-format" => { - observations_format = parse_app_output_format( - &iter - .next() - .ok_or("--observations-format requires a value")?, - )?; + self.observations_format = + parse_app_output_format(&next_arg(iter, "--observations-format")?)?; } "--reports-format" => { - reports_format = parse_app_output_format( - &iter.next().ok_or("--reports-format requires a value")?, - )?; - } - "--input-format" => { - let value = iter.next().ok_or("--input-format requires a value")?; - if value.eq_ignore_ascii_case("auto") { - loader.format = None; - } else { - loader.format = - Some(value.parse::().map_err(|err| { - format!("invalid --input-format value {value}: {err}") - })?); - } + self.reports_format = + parse_app_output_format(&next_arg(iter, "--reports-format")?)?; } + "--analysis-max-duration-ms" => self.parse_analysis_timeout(iter)?, + "--input-format" => self.parse_input_format(iter)?, "--input-index" => { - loader.input_index = Some(PathBuf::from( - iter.next().ok_or("--input-index requires a path")?, - )); + self.loader.input_index = Some(PathBuf::from(next_arg(iter, "--input-index")?)); } "--reference-file" => { - loader.reference_file = Some(PathBuf::from( - iter.next().ok_or("--reference-file requires a path")?, - )); + self.loader.reference_file = + Some(PathBuf::from(next_arg(iter, "--reference-file")?)); } "--reference-index" => { - loader.reference_index = Some(PathBuf::from( - iter.next().ok_or("--reference-index requires a path")?, - )); + self.loader.reference_index = + Some(PathBuf::from(next_arg(iter, "--reference-index")?)); } value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), - value => { - if 
manifest_path.is_none() { - manifest_path = Some(PathBuf::from(value)); - } else { - input_files.push(PathBuf::from(value)); - } - } + value => self.consume_path(value), } + Ok(()) } - let Some(manifest_path) = manifest_path else { - return Err("usage: bioscript report --input-file [--input-file ...] --output-dir [--html]".to_owned()); - }; - if input_files.is_empty() { - return Err("bioscript report requires at least one --input-file".to_owned()); + fn parse_analysis_timeout( + &mut self, + iter: &mut std::vec::IntoIter, + ) -> Result<(), String> { + let value = next_arg(iter, "--analysis-max-duration-ms")?; + self.analysis_max_duration_ms = value + .parse::() + .map_err(|err| format!("invalid --analysis-max-duration-ms value {value}: {err}"))?; + Ok(()) } - let output_dir = output_dir.ok_or("bioscript report requires --output-dir")?; - let root = root.unwrap_or(cwd); - normalize_loader_paths(&root, &mut loader); - let options = AppReportOptions { - manifest_path: absolutize(&root, &manifest_path), - input_files: input_files - .iter() - .map(|path| absolutize(&root, path)) - .collect(), - output_dir: absolutize(&root, &output_dir), - root, - html, - observations_format, - reports_format, - loader, - filters, - }; - generate_app_report(&options) + fn parse_input_format(&mut self, iter: &mut std::vec::IntoIter) -> Result<(), String> { + let value = next_arg(iter, "--input-format")?; + self.loader.format = if value.eq_ignore_ascii_case("auto") { + None + } else { + Some(value.parse::().map_err(|err| { + format!("invalid --input-format value {value}: {err}") + })?) + }; + Ok(()) + } + + fn consume_path(&mut self, value: &str) { + if self.manifest_path.is_none() { + self.manifest_path = Some(PathBuf::from(value)); + } else { + self.input_files.push(PathBuf::from(value)); + } + } + + fn finish(mut self) -> Result { + let Some(manifest_path) = self.manifest_path else { + return Err("usage: bioscript report --input-file [--input-file ...] 
--output-dir [--html]".to_owned()); + }; + if self.input_files.is_empty() { + return Err("bioscript report requires at least one --input-file".to_owned()); + } + let output_dir = self.output_dir.ok_or("bioscript report requires --output-dir")?; + let root = self.root.unwrap_or(self.cwd); + normalize_loader_paths(&root, &mut self.loader); + let manifest_path = if is_package_url(&manifest_path.to_string_lossy()) { + prepare_package_entrypoint_from_arg(&root, &manifest_path)? + } else { + prepare_package_entrypoint_from_arg(&root, &absolutize(&root, &manifest_path))? + }; + Ok(AppReportOptions { + manifest_path, + input_files: self + .input_files + .iter() + .map(|path| absolutize(&root, path)) + .collect(), + output_dir: absolutize(&root, &output_dir), + root, + html: self.html, + open_report: self.open_report, + observations_format: self.observations_format, + reports_format: self.reports_format, + loader: self.loader, + filters: self.filters, + analysis_max_duration_ms: self.analysis_max_duration_ms, + detect_sex: self.detect_sex, + sample_sex: self.sample_sex, + }) + } +} + +fn next_arg(iter: &mut std::vec::IntoIter, flag: &str) -> Result { + iter.next().ok_or_else(|| format!("{flag} requires a value")) } fn parse_app_output_format(value: &str) -> Result { @@ -136,6 +198,26 @@ fn parse_app_output_format(value: &str) -> Result { } } +fn parse_sample_sex(value: &str) -> Result { + match value.to_ascii_lowercase().as_str() { + "male" | "m" => Ok(InferredSex::Male), + "female" | "f" => Ok(InferredSex::Female), + "unknown" | "u" => Ok(InferredSex::Unknown), + other => Err(format!( + "unsupported --sample-sex value '{other}'; expected male, female, or unknown" + )), + } +} + +fn explicit_sample_sex_inference(sex: InferredSex) -> SexInference { + SexInference { + sex, + confidence: SexDetectionConfidence::High, + method: "explicit_sample_sex".to_owned(), + evidence: vec!["source=sample_sex_cli".to_owned()], + } +} + fn absolutize(root: &Path, path: &Path) -> PathBuf { if 
path.is_absolute() { path.to_path_buf() @@ -153,6 +235,7 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { })?; let assay_id = app_assay_id(&options.manifest_path)?; + let manifest_metadata = report_manifest_metadata(&options.manifest_path)?; let findings = load_manifest_findings(&options.root, &options.manifest_path)?; let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; let mut observations = Vec::new(); @@ -161,6 +244,17 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { for input_file in &options.input_files { let participant_id = participant_id_from_path(input_file); + let inspect_options = InspectOptions { + input_index: options.loader.input_index.clone(), + reference_file: options.loader.reference_file.clone(), + reference_index: options.loader.reference_index.clone(), + detect_sex: options.detect_sex, + }; + let mut input_inspection = + inspect_file(input_file, &inspect_options).map_err(|err| err.to_string())?; + if let Some(sample_sex) = options.sample_sex { + input_inspection.inferred_sex = Some(explicit_sample_sex_inference(sample_sex)); + } let rows = run_manifest_rows_for_report( &options.root, &options.manifest_path, @@ -171,28 +265,41 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { )?; let input_observations = rows .iter() - .map(|row| app_observation_from_manifest_row(&options.root, row, &assay_id)) + .map(|row| { + app_observation_from_manifest_row( + &options.root, + row, + &assay_id, + input_inspection.inferred_sex.as_ref(), + input_inspection.assembly, + ) + }) .collect::, _>>()?; observations.extend(input_observations.clone()); - let input_analyses = run_manifest_analyses_for_report( - &options.root, - &options.manifest_path, + let analysis_options = ReportAnalysisOptions { + runtime_root: &options.root, input_file, - &participant_id, - &options.loader, - &options.output_dir, - )?; + participant_id: &participant_id, + loader: 
&options.loader, + output_dir: &options.output_dir, + filters: &options.filters, + max_duration_ms: options.analysis_max_duration_ms, + }; + let input_analyses = + run_manifest_analyses_for_report(&options.manifest_path, &analysis_options)?; analyses.extend(input_analyses.clone()); let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); - reports.push(app_report_json( - &assay_id, - &participant_id, + reports.push(app_report_json(AppReportJsonInput { + assay_id: &assay_id, + participant_id: &participant_id, input_file, - &input_observations, - &input_analyses, - &matched_findings, - &provenance, - )); + observations: &input_observations, + analyses: &input_analyses, + findings: &matched_findings, + provenance: &provenance, + input_inspection: Some(&input_inspection), + manifest_metadata: &manifest_metadata, + })); } write_app_observations( @@ -205,21 +312,51 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { if options.html { write_app_html(&options.output_dir, &observations, &reports)?; } + open_app_html_report_if_requested(options); + print_app_report_paths(&options.output_dir, options.html); + Ok(()) +} - println!( - "observations: {}", - options.output_dir.join("observations.tsv").display() - ); - println!( - "analysis: {}", - options.output_dir.join("analysis.jsonl").display() - ); - println!( - "reports: {}", - options.output_dir.join("reports.jsonl").display() - ); - if options.html { - println!("html: {}", options.output_dir.join("index.html").display()); +fn open_app_html_report_if_requested(options: &AppReportOptions) { + if options.open_report + && let Err(err) = open_html_report(&options.output_dir.join("index.html")) + { + eprintln!("warning: {err}"); + } +} + +fn print_app_report_paths(output_dir: &Path, include_html: bool) { + println!("observations: {}", output_dir.join("observations.tsv").display()); + println!("analysis: {}", output_dir.join("analysis.jsonl").display()); + 
println!("reports: {}", output_dir.join("reports.jsonl").display()); + if include_html { + println!("html: {}", output_dir.join("index.html").display()); + } +} + +fn open_html_report(path: &Path) -> Result<(), String> { + let opener = if cfg!(target_os = "macos") { + "open" + } else if cfg!(target_os = "windows") { + "cmd" + } else { + "xdg-open" + }; + let status = if cfg!(target_os = "windows") { + let path_text = path.display().to_string(); + std::process::Command::new(opener) + .args(["/C", "start", "", &path_text]) + .status() + } else { + std::process::Command::new(opener).arg(path).status() + } + .map_err(|err| format!("failed to open html report {}: {err}", path.display()))?; + if status.success() { + Ok(()) + } else { + Err(format!( + "failed to open html report {}: opener exited with {status}", + path.display() + )) } - Ok(()) } diff --git a/rust/bioscript-cli/src/report_output.rs b/rust/bioscript-cli/src/report_output.rs index 63fea7c..6766d91 100644 --- a/rust/bioscript-cli/src/report_output.rs +++ b/rust/bioscript-cli/src/report_output.rs @@ -1,13 +1,19 @@ -fn app_report_json( - assay_id: &str, - participant_id: &str, - input_file: &Path, - observations: &[serde_json::Value], - analyses: &[serde_json::Value], - findings: &[serde_json::Value], - provenance: &[serde_json::Value], -) -> serde_json::Value { - let called = observations +#[derive(Clone, Copy)] +struct AppReportJsonInput<'a> { + assay_id: &'a str, + participant_id: &'a str, + input_file: &'a Path, + observations: &'a [serde_json::Value], + analyses: &'a [serde_json::Value], + findings: &'a [serde_json::Value], + provenance: &'a [serde_json::Value], + input_inspection: Option<&'a bioscript_formats::FileInspection>, + manifest_metadata: &'a serde_json::Value, +} + +fn app_report_json(input: AppReportJsonInput<'_>) -> serde_json::Value { + let called = input + .observations .iter() .filter(|item| { item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") @@ -16,28 +22,170 
@@ fn app_report_json( serde_json::json!({ "schema": "bioscript:report:1.0", "version": "1.0", - "participant_id": participant_id, - "assay_id": assay_id, + "participant_id": input.participant_id, + "assay_id": input.assay_id, "assay_version": "1.0", + "manifest": input.manifest_metadata, "input": { - "file_name": input_file.file_name().and_then(|value| value.to_str()).unwrap_or_default(), - "file_path": input_file.display().to_string(), + "file_name": input.input_file.file_name().and_then(|value| value.to_str()).unwrap_or_default(), + "file_path": input.input_file.display().to_string(), + "debug": input.input_inspection.map(input_inspection_json), }, - "report_status": if called == observations.len() { "complete" } else { "partial" }, - "derived_from": observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), - "analyses": analyses, - "findings": findings, - "provenance": provenance, + "report_status": if called == input.observations.len() { "complete" } else { "partial" }, + "derived_from": input.observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), + "analyses": input.analyses, + "findings": input.findings, + "provenance": input.provenance, "metrics": { - "n_sites_tested": observations.len(), + "n_sites_tested": input.observations.len(), "n_sites_called": called, - "n_sites_missing": observations.len().saturating_sub(called), - "n_analyses": analyses.len(), - "n_findings_matched": findings.len(), + "n_sites_missing": input.observations.len().saturating_sub(called), + "n_analyses": input.analyses.len(), + "n_findings_matched": input.findings.len(), } }) } +fn report_manifest_metadata(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read manifest metadata {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse manifest metadata {}: {err}", path.display()))?; + let schema = 
yaml_string_at(&value, "schema"); + let version = yaml_string_at(&value, "version"); + let name = yaml_string_at(&value, "name"); + let label = yaml_string_at(&value, "label").or_else(|| name.clone()); + let tags = value + .get("tags") + .and_then(serde_yaml::Value::as_sequence) + .map(|items| { + items + .iter() + .filter_map(serde_yaml::Value::as_str) + .map(serde_json::Value::from) + .collect::>() + }) + .unwrap_or_default(); + let members = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + .map(|items| { + items + .iter() + .filter_map(serde_yaml::Value::as_mapping) + .map(|mapping| { + serde_json::json!({ + "kind": yaml_mapping_string(mapping, "kind"), + "path": yaml_mapping_string(mapping, "path"), + "version": yaml_mapping_string(mapping, "version"), + }) + }) + .collect::>() + }) + .unwrap_or_default(); + Ok(serde_json::json!({ + "schema": schema, + "version": version, + "name": name, + "label": label, + "tags": tags, + "members": members, + })) +} + +fn yaml_string_at(value: &serde_yaml::Value, key: &str) -> Option { + value + .get(key) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) +} + +fn yaml_mapping_string(mapping: &serde_yaml::Mapping, key: &str) -> Option { + mapping + .get(serde_yaml::Value::String(key.to_owned())) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) +} + +fn input_inspection_json(inspection: &bioscript_formats::FileInspection) -> serde_json::Value { + serde_json::json!({ + "container": file_container_name(inspection.container), + "format": detected_kind_name(inspection.detected_kind), + "format_confidence": detection_confidence_name(inspection.confidence), + "assembly": inspection.assembly.map(assembly_name), + "phased": inspection.phased, + "selected_entry": inspection.selected_entry, + "has_index": inspection.has_index, + "index_path": inspection.index_path.as_ref().map(|path| path.display().to_string()), + "reference_matches": inspection.reference_matches, + "source": 
inspection.source.as_ref().map(|source| serde_json::json!({ + "vendor": source.vendor, + "platform_version": source.platform_version, + "confidence": detection_confidence_name(source.confidence), + "evidence": source.evidence, + })), + "inferred_sex": inspection.inferred_sex.as_ref().map(|sex| serde_json::json!({ + "sex": inferred_sex_name(sex.sex), + "confidence": sex_detection_confidence_name(sex.confidence), + "method": sex.method, + "evidence": sex.evidence, + })), + "evidence": inspection.evidence, + "warnings": inspection.warnings, + "duration_ms": inspection.duration_ms, + }) +} + +fn file_container_name(value: bioscript_formats::FileContainer) -> &'static str { + match value { + bioscript_formats::FileContainer::Plain => "plain", + bioscript_formats::FileContainer::Zip => "zip", + } +} + +fn detected_kind_name(value: bioscript_formats::DetectedKind) -> &'static str { + match value { + bioscript_formats::DetectedKind::GenotypeText => "genotype_text", + bioscript_formats::DetectedKind::Vcf => "vcf", + bioscript_formats::DetectedKind::AlignmentCram => "alignment_cram", + bioscript_formats::DetectedKind::AlignmentBam => "alignment_bam", + bioscript_formats::DetectedKind::ReferenceFasta => "reference_fasta", + bioscript_formats::DetectedKind::Unknown => "unknown", + } +} + +fn detection_confidence_name(value: bioscript_formats::DetectionConfidence) -> &'static str { + match value { + bioscript_formats::DetectionConfidence::Authoritative => "authoritative", + bioscript_formats::DetectionConfidence::StrongHeuristic => "strong_heuristic", + bioscript_formats::DetectionConfidence::WeakHeuristic => "weak_heuristic", + bioscript_formats::DetectionConfidence::Unknown => "unknown", + } +} + +fn assembly_name(value: bioscript_core::Assembly) -> &'static str { + match value { + bioscript_core::Assembly::Grch37 => "grch37", + bioscript_core::Assembly::Grch38 => "grch38", + } +} + +fn inferred_sex_name(value: InferredSex) -> &'static str { + match value { + 
InferredSex::Male => "male", + InferredSex::Female => "female", + InferredSex::Unknown => "unknown", + } +} + +fn sex_detection_confidence_name(value: SexDetectionConfidence) -> &'static str { + match value { + SexDetectionConfidence::High => "high", + SexDetectionConfidence::Medium => "medium", + SexDetectionConfidence::Low => "low", + } +} + fn write_app_observations( output_dir: &Path, observations: &[serde_json::Value], @@ -124,11 +272,13 @@ fn write_app_html( reports: &[serde_json::Value], ) -> Result<(), String> { let mut out = String::from( - r##"BioScript report

    BioScript Report

    "##, + r##"BioScript report
    "##, ); let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); let analysis_outputs = collect_report_analyses(reports); + let participants = collect_report_participants(reports); + render_report_manifest_header(&mut out, reports); let _ = write!( out, "
    {} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
    ", @@ -137,29 +287,32 @@ fn write_app_html( label_findings.len(), summary_findings.len() ); - out.push_str(""); + render_participant_filter(&mut out, &participants); + out.push_str(""); + out.push_str("

    Input

    "); + render_input_debug(&mut out, reports, participants.len() > 1); + out.push_str("
    "); out.push_str("

    Observations

    "); - render_observation_table(&mut out, observations); + render_observation_table(&mut out, observations, participants.len() > 1); out.push_str("
    "); out.push_str("

    Analysis

    "); - render_analysis_tables(&mut out, &analysis_outputs); - out.push_str("
    "); - out.push_str("

    PGx Label Annotations

    "); - render_pgx_label_table(&mut out, &label_findings); + render_analysis_tables(&mut out, &analysis_outputs, participants.len() > 1); out.push_str("
    "); - out.push_str("

    PGx Summary Annotations

    "); - render_pgx_summary_table(&mut out, &summary_findings); + out.push_str("

    PGx

    "); + render_pgx_table(&mut out, &label_findings, &summary_findings); out.push_str("
    "); out.push_str("

    Provenance

    "); render_provenance_links(&mut out, reports); out.push_str("
    "); - out.push_str("

    Raw Reports JSON

    "); + out.push_str("

    Source

    "); + render_report_source_section(&mut out, reports); + out.push_str("
    "); + out.push_str("

    Raw Reports JSON

    Show raw report JSON"); for report in reports { let text = serde_json::to_string_pretty(report).map_err(|err| err.to_string())?; let _ = write!(out, "
    {}
    ", html_escape(&text)); } - out.push_str("
    "); + out.push_str("
    "); fs::write(output_dir.join("index.html"), out) .map_err(|err| format!("failed to write index.html: {err}")) } - diff --git a/rust/bioscript-cli/src/report_review.rs b/rust/bioscript-cli/src/report_review.rs new file mode 100644 index 0000000..ae05dbf --- /dev/null +++ b/rust/bioscript-cli/src/report_review.rs @@ -0,0 +1,290 @@ +struct ReviewReportOptions { + manifest_path: PathBuf, + cases_path: PathBuf, + output_dir: PathBuf, + root: PathBuf, + html: bool, + filters: Vec, +} + +struct ReviewCase { + id: String, + label: String, + genotypes: BTreeMap>, +} + +fn run_review_report(args: Vec) -> Result<(), String> { + let cwd = env::current_dir().map_err(|err| format!("failed to get cwd: {err}"))?; + let mut manifest_path: Option = None; + let mut cases_path: Option = None; + let mut output_dir: Option = None; + let mut root: Option = None; + let mut html = false; + let mut filters = Vec::new(); + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--cases" => { + cases_path = Some(PathBuf::from(iter.next().ok_or("--cases requires a path")?)); + } + "--output-dir" => { + output_dir = Some(PathBuf::from( + iter.next().ok_or("--output-dir requires a path")?, + )); + } + "--root" => { + root = Some(PathBuf::from( + iter.next().ok_or("--root requires a directory")?, + )); + } + "--html" => html = true, + "--filter" => filters.push(iter.next().ok_or("--filter requires key=value")?), + value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), + value => { + if manifest_path.is_none() { + manifest_path = Some(PathBuf::from(value)); + } else { + return Err(format!("unexpected argument: {value}")); + } + } + } + } + + let Some(manifest_path) = manifest_path else { + return Err("usage: bioscript review --cases --output-dir [--html]".to_owned()); + }; + let cases_path = cases_path.ok_or("bioscript review requires --cases")?; + let output_dir = output_dir.ok_or("bioscript review requires 
--output-dir")?; + let root = root.unwrap_or(cwd); + let manifest_path = if is_package_url(&manifest_path.to_string_lossy()) { + prepare_package_entrypoint_from_arg(&root, &manifest_path)? + } else { + prepare_package_entrypoint_from_arg(&root, &absolutize(&root, &manifest_path))? + }; + let options = ReviewReportOptions { + manifest_path, + cases_path: absolutize(&root, &cases_path), + output_dir: absolutize(&root, &output_dir), + root, + html, + filters, + }; + generate_review_report(&options) +} + +fn generate_review_report(options: &ReviewReportOptions) -> Result<(), String> { + fs::create_dir_all(&options.output_dir).map_err(|err| { + format!( + "failed to create output dir {}: {err}", + options.output_dir.display() + ) + })?; + + let assay_id = app_assay_id(&options.manifest_path)?; + let manifest_metadata = report_manifest_metadata(&options.manifest_path)?; + let findings = load_manifest_findings(&options.root, &options.manifest_path)?; + let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; + let cases = load_review_cases(&options.cases_path)?; + let mut observations = Vec::new(); + let mut analyses = Vec::new(); + let mut reports = Vec::new(); + + for case in cases { + let input_bytes = review_case_genotype_text(&case); + let store = GenotypeStore::from_bytes(&format!("{}.txt", case.id), input_bytes.as_bytes()) + .map_err(|err| err.to_string())?; + let input_observations = run_manifest_rows_with_store( + &options.root, + &options.manifest_path, + &store, + &case.id, + &options.filters, + )? 
+ .iter() + .map(|row| app_observation_from_manifest_row(&options.root, row, &assay_id, None, None)) + .collect::, _>>()?; + observations.extend(input_observations.clone()); + + let input_analyses = run_review_analyses(options, &case, &input_bytes)?; + analyses.extend(input_analyses.clone()); + let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); + let synthetic_input = PathBuf::from(format!("review://{}", case.id)); + let mut report = app_report_json(AppReportJsonInput { + assay_id: &assay_id, + participant_id: &case.id, + input_file: &synthetic_input, + observations: &input_observations, + analyses: &input_analyses, + findings: &matched_findings, + provenance: &provenance, + input_inspection: None, + manifest_metadata: &manifest_metadata, + }); + if let Some(object) = report.as_object_mut() { + object.insert( + "review_case".to_owned(), + serde_json::json!({ + "id": case.id, + "label": case.label, + }), + ); + } + reports.push(report); + } + let review_temp_dir = options.output_dir.join(".review-temp"); + if review_temp_dir.exists() { + fs::remove_dir_all(&review_temp_dir).map_err(|err| { + format!( + "failed to remove review temp dir {}: {err}", + review_temp_dir.display() + ) + })?; + } + + write_app_observations(&options.output_dir, &observations, AppOutputFormat::Tsv)?; + write_app_analyses(&options.output_dir, &analyses)?; + write_app_reports(&options.output_dir, &reports, AppOutputFormat::Jsonl)?; + if options.html { + write_app_html(&options.output_dir, &observations, &reports)?; + } + println!( + "review reports: {}", + options.output_dir.join("reports.jsonl").display() + ); + if options.html { + println!("review html: {}", options.output_dir.join("index.html").display()); + } + Ok(()) +} + +fn run_manifest_rows_with_store( + runtime_root: &Path, + manifest_path: &Path, + store: &GenotypeStore, + participant_id: &str, + filters: &[String], +) -> Result>, String> { + match manifest_schema(manifest_path)?.as_str() { + 
"bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + Ok(vec![run_variant_manifest_with_store( + runtime_root, + &manifest, + store, + Some(participant_id), + )?]) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + run_panel_manifest_with_store(runtime_root, &manifest, store, Some(participant_id), filters) + } + "bioscript:assay:1.0" => { + let manifest = load_assay_manifest(manifest_path)?; + run_assay_manifest_with_store(runtime_root, &manifest, store, Some(participant_id), filters) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_review_analyses( + options: &ReviewReportOptions, + case: &ReviewCase, + input_bytes: &str, +) -> Result, String> { + let temp_dir = options.output_dir.join(".review-temp"); + fs::create_dir_all(&temp_dir).map_err(|err| { + format!( + "failed to create review temp dir {}: {err}", + temp_dir.display() + ) + })?; + let temp_path = temp_dir.join(format!("{}.txt", case.id)); + fs::write(&temp_path, input_bytes) + .map_err(|err| format!("failed to write review temp input {}: {err}", temp_path.display()))?; + let loader = GenotypeLoadOptions { + format: Some(GenotypeSourceFormat::Text), + ..GenotypeLoadOptions::default() + }; + let analysis_options = ReportAnalysisOptions { + runtime_root: &options.root, + input_file: &temp_path, + participant_id: &case.id, + loader: &loader, + output_dir: &options.output_dir, + filters: &options.filters, + max_duration_ms: 1_000, + }; + let result = run_manifest_analyses_for_report(&options.manifest_path, &analysis_options); + let cleanup = fs::remove_file(&temp_path); + if let Err(err) = cleanup { + return Err(format!("failed to remove review temp input {}: {err}", temp_path.display())); + } + result +} + +fn load_review_cases(path: &Path) -> Result, String> { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read review cases {}: {err}", 
path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse review cases {}: {err}", path.display()))?; + let cases = value + .get("cases") + .and_then(serde_yaml::Value::as_sequence) + .ok_or("review cases missing cases list")?; + cases + .iter() + .map(review_case_from_yaml) + .collect::, _>>() +} + +fn review_case_from_yaml(value: &serde_yaml::Value) -> Result { + let id = value + .get("id") + .and_then(serde_yaml::Value::as_str) + .ok_or("review case missing id")? + .to_owned(); + let label = value + .get("label") + .and_then(serde_yaml::Value::as_str) + .unwrap_or(&id) + .to_owned(); + let genotypes_value = value + .get("genotypes") + .or_else(|| value.get("variants")) + .and_then(serde_yaml::Value::as_mapping) + .ok_or_else(|| format!("review case {id} missing genotypes"))?; + let mut genotypes = BTreeMap::new(); + for (key, value) in genotypes_value { + let Some(rsid) = key.as_str() else { + return Err(format!("review case {id} has non-string genotype key")); + }; + let genotype = if value.is_null() { + None + } else { + Some( + value + .as_str() + .ok_or_else(|| format!("review case {id} genotype {rsid} must be string or null"))? 
+ .to_owned(), + ) + }; + genotypes.insert(rsid.to_owned(), genotype); + } + Ok(ReviewCase { + id, + label, + genotypes, + }) +} + +fn review_case_genotype_text(case: &ReviewCase) -> String { + let mut out = String::from("rsid\tgenotype\n"); + for (rsid, genotype) in &case.genotypes { + let Some(genotype) = genotype else { + continue; + }; + let _ = writeln!(out, "{}\t{}", rsid.replace('\t', " "), genotype.replace('\t', " ")); + } + out +} diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index e7e4bb8..17613ed 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -116,7 +116,7 @@ fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { ); let stdout = String::from_utf8_lossy(&output.stdout); assert!(stdout.contains("AG")); - assert!(stdout.contains("TC")); + assert!(stdout.contains("CT")); assert!(stdout.contains("II")); } diff --git a/rust/bioscript-cli/tests/cli/manifests.rs b/rust/bioscript-cli/tests/cli/manifests.rs index a14d379..cbac633 100644 --- a/rust/bioscript-cli/tests/cli/manifests.rs +++ b/rust/bioscript-cli/tests/cli/manifests.rs @@ -239,6 +239,7 @@ members: assert!(!stdout.contains("example-rs60910145")); } + #[test] fn panel_manifest_filters_by_kind_tag_path_and_rejects_unknown_filter_keys() { let root = repo_root(); diff --git a/rust/bioscript-formats/Cargo.toml b/rust/bioscript-formats/Cargo.toml index c657916..98b9e6d 100644 --- a/rust/bioscript-formats/Cargo.toml +++ b/rust/bioscript-formats/Cargo.toml @@ -8,6 +8,7 @@ crate-type = ["rlib"] [dependencies] bioscript-core = { path = "../bioscript-core" } +flate2 = "1.1.9" noodles = { version = "0.109.0", features = ["bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] } zip = { version = "2.2.0", default-features = false, features = ["deflate"] } diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 754545c..d040e36 100644 --- a/rust/bioscript-formats/src/genotype.rs 
+++ b/rust/bioscript-formats/src/genotype.rs @@ -556,6 +556,10 @@ mod tests { infer_snp_genotype('A', 'G', 5, 5, 10).as_deref(), Some("AG") ); + assert_eq!( + infer_snp_genotype('G', 'A', 5, 5, 10).as_deref(), + Some("AG") + ); assert!(describe_snp_decision_rule('A', 'G', 0, 0, 0).contains("no covering reads")); assert!(describe_snp_decision_rule('A', 'G', 0, 0, 3).contains("no reads matched")); assert!(describe_snp_decision_rule('A', 'G', 2, 8, 10).contains("alt_fraction=0.800")); @@ -604,6 +608,9 @@ mod tests { assert_eq!(strip_inline_comment("AG // note"), "AG"); assert_eq!(normalize_genotype("n/a"), "--"); assert_eq!(normalize_genotype("a / g"), "AG"); + assert_eq!(normalize_genotype("g / a"), "AG"); + assert_eq!(normalize_genotype("T|C"), "CT"); + assert_eq!(normalize_genotype("TC"), "CT"); assert_eq!(normalize_genotype("A/-"), "ID"); assert_eq!(split_csv_line(r#"rs1,"1,2",AG"#), vec!["rs1", "1,2", "AG"]); diff --git a/rust/bioscript-formats/src/genotype/common.rs b/rust/bioscript-formats/src/genotype/common.rs index 95b1338..5135c2e 100644 --- a/rust/bioscript-formats/src/genotype/common.rs +++ b/rust/bioscript-formats/src/genotype/common.rs @@ -54,12 +54,21 @@ pub(crate) fn normalize_genotype(value: &str) -> String { if cleaned.is_empty() || matches!(cleaned.as_str(), "NA" | "N/A" | "#N/A" | "NONE") { return "--".to_owned(); } - if cleaned.contains('/') { - let parts: Vec<&str> = cleaned.split('/').collect(); + if cleaned.contains('/') || cleaned.contains('|') { + let parts: Vec<&str> = cleaned.split(['/', '|']).collect(); if parts.iter().any(|part| part.is_empty() || *part == "-") { return "ID".to_owned(); } - return parts.concat(); + return sorted_genotype_parts(parts); } - cleaned + sorted_genotype_parts(cleaned.chars().map(String::from).collect()) +} + +fn sorted_genotype_parts(parts: Vec) -> String +where + T: AsRef, +{ + let mut parts = parts; + parts.sort_by(|left, right| left.as_ref().cmp(right.as_ref())); + parts.iter().map(AsRef::as_ref).collect() 
} diff --git a/rust/bioscript-formats/src/genotype/cram_backend.rs b/rust/bioscript-formats/src/genotype/cram_backend.rs index 383bb46..a712127 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend.rs @@ -117,7 +117,25 @@ pub(crate) fn infer_snp_genotype( } else if alt_fraction <= 0.2 { Some(format!("{reference}{reference}")) } else { - Some(format!("{reference}{alternate}")) + Some(unphased_allele_pair(reference, alternate)) + } +} + +fn unphased_allele_pair(left: char, right: char) -> String { + let mut alleles = [left.to_ascii_uppercase(), right.to_ascii_uppercase()]; + alleles.sort_by_key(|allele| allele_sort_rank(*allele)); + alleles.iter().collect() +} + +fn allele_sort_rank(allele: char) -> u8 { + match allele.to_ascii_uppercase() { + 'A' => 0, + 'C' => 1, + 'G' => 2, + 'T' => 3, + 'I' => 4, + 'D' => 5, + _ => 99, } } @@ -161,10 +179,19 @@ pub(crate) fn infer_copy_number_genotype( } else if alt_fraction <= 0.2 { Some(format!("{reference}{reference}")) } else { - Some(format!("{reference}{alternate}")) + Some(sorted_unphased_tokens(reference, alternate)) } } +fn sorted_unphased_tokens(reference: &str, alternate: &str) -> String { + let mut alleles = [ + reference.to_ascii_uppercase(), + alternate.to_ascii_uppercase(), + ]; + alleles.sort_by_key(|allele| allele.chars().next().map_or(u8::MAX, allele_sort_rank)); + alleles.concat() +} + pub(crate) fn describe_copy_number_decision_rule( reference: &str, alternate: &str, diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index add8e70..40135f3 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -34,11 +34,13 @@ use bioscript_core::{Assembly, RuntimeError}; mod heuristics; mod io; mod render; +mod sex; pub(crate) use heuristics::*; pub(crate) use io::*; #[cfg(test)] pub(crate) use render::*; +pub use sex::{InferredSex, SexDetectionConfidence, SexInference}; 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum FileContainer { @@ -77,6 +79,7 @@ pub struct InspectOptions { pub input_index: Option, pub reference_file: Option, pub reference_index: Option, + pub detect_sex: bool, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -92,6 +95,7 @@ pub struct FileInspection { pub has_index: Option, pub index_path: Option, pub reference_matches: Option, + pub inferred_sex: Option, pub evidence: Vec, pub warnings: Vec, pub duration_ms: u128, @@ -125,6 +129,13 @@ pub fn inspect_bytes( &sample_lines, options, ); + if options.detect_sex { + inspection.inferred_sex = Some(sex::infer_sex_from_zip_bytes( + bytes, + &selected_entry, + inspection.detected_kind, + )?); + } inspection.duration_ms = started.elapsed().as_millis(); return Ok(inspection); } @@ -163,8 +174,9 @@ pub fn inspect_bytes( } _ => read_plain_sample_lines_from_bytes(&lower, bytes)?, }; - let source = detect_source(&lower, &sample_lines, detected_kind); - let assembly = detect_assembly(&lower, &sample_lines); + let inspection_context = inspect_context_name(&lower, options); + let source = detect_source(&inspection_context, &sample_lines, detected_kind); + let assembly = detect_assembly(&inspection_context, &sample_lines); let phased = (detected_kind == DetectedKind::Vcf) .then(|| detect_vcf_phasing(&sample_lines)) .flatten(); @@ -181,6 +193,10 @@ pub fn inspect_bytes( .clone() .or_else(|| options.reference_index.clone()); let confidence = classify_confidence(detected_kind, &sample_lines, source.as_ref()); + let inferred_sex = options + .detect_sex + .then(|| sex::infer_sex_from_bytes(name, bytes, detected_kind)) + .transpose()?; Ok(FileInspection { path: path.to_path_buf(), @@ -194,6 +210,7 @@ pub fn inspect_bytes( has_index, index_path, reference_matches: None, + inferred_sex, evidence, warnings, duration_ms: started.elapsed().as_millis(), @@ -216,6 +233,10 @@ pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result Result read_plain_sample_lines(path)?, }; - let 
source = detect_source(&lower, &sample_lines, detected_kind); - let assembly = detect_assembly(&lower, &sample_lines); + let inspection_context = inspect_context_name(&lower, options); + let source = detect_source(&inspection_context, &sample_lines, detected_kind); + let assembly = detect_assembly(&inspection_context, &sample_lines); let phased = (detected_kind == DetectedKind::Vcf) .then(|| detect_vcf_phasing(&sample_lines)) .flatten(); let (has_index, index_path) = detect_index(path, detected_kind, options); let confidence = classify_confidence(detected_kind, &sample_lines, source.as_ref()); + let inferred_sex = options + .detect_sex + .then(|| sex::infer_sex_from_path(path, detected_kind)) + .transpose()?; Ok(FileInspection { path: path.to_path_buf(), @@ -274,6 +300,7 @@ pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result String { + let mut context = lower_name.to_owned(); + for path in [ + options.input_index.as_ref(), + options.reference_file.as_ref(), + options.reference_index.as_ref(), + ] + .into_iter() + .flatten() + { + context.push('\n'); + context.push_str(&path.display().to_string().to_ascii_lowercase()); + } + context +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust/bioscript-formats/src/inspect/heuristics.rs b/rust/bioscript-formats/src/inspect/heuristics.rs index 84f396a..a26a5f1 100644 --- a/rust/bioscript-formats/src/inspect/heuristics.rs +++ b/rust/bioscript-formats/src/inspect/heuristics.rs @@ -181,10 +181,14 @@ pub(crate) fn detect_source( vendor = Some("MyHeritage".to_owned()); confidence = DetectionConfidence::StrongHeuristic; evidence.push("MyHeritage header/export name".to_owned()); - } else if normalized.contains("sequencing com") && kind == DetectedKind::Vcf { + } else if normalized.contains("sequencing com") { vendor = Some("Sequencing.com".to_owned()); - confidence = DetectionConfidence::WeakHeuristic; - evidence.push("sequencing.com header text".to_owned()); + confidence = if kind == DetectedKind::Vcf { + 
DetectionConfidence::WeakHeuristic + } else { + DetectionConfidence::StrongHeuristic + }; + evidence.push("sequencing.com path/header text".to_owned()); } else if normalized.contains("carigenetics") || normalized.contains("cari genetics") { vendor = Some("CariGenetics".to_owned()); confidence = DetectionConfidence::StrongHeuristic; @@ -235,6 +239,8 @@ pub(crate) fn detect_assembly(lower_name: &str, sample_lines: &[String]) -> Opti let header = sample_lines.join("\n").to_ascii_lowercase(); let combined = format!("{lower_name}\n{header}"); let looks_like_grch38 = combined.contains("build 38") + || combined.contains("assembly38") + || combined.contains("assembly 38") || combined.contains("grch38") || combined.contains("hg38") || combined.contains("gca_000001405.15") diff --git a/rust/bioscript-formats/src/inspect/render.rs b/rust/bioscript-formats/src/inspect/render.rs index 0cf700f..13180f6 100644 --- a/rust/bioscript-formats/src/inspect/render.rs +++ b/rust/bioscript-formats/src/inspect/render.rs @@ -1,6 +1,9 @@ use bioscript_core::Assembly; -use super::{DetectedKind, DetectionConfidence, FileContainer, FileInspection}; +use super::{ + DetectedKind, DetectionConfidence, FileContainer, FileInspection, InferredSex, + SexDetectionConfidence, +}; impl FileInspection { #[must_use] @@ -31,6 +34,23 @@ impl FileInspection { "reference_matches\t{}", render_bool(self.reference_matches) )); + if let Some(inferred) = &self.inferred_sex { + lines.push(format!( + "inferred_sex\t{}", + render_inferred_sex(inferred.sex) + )); + lines.push(format!( + "sex_confidence\t{}", + render_sex_confidence(inferred.confidence) + )); + lines.push(format!("sex_method\t{}", inferred.method)); + lines.push(format!("sex_evidence\t{}", inferred.evidence.join(" | "))); + } else { + lines.push("inferred_sex\t".to_owned()); + lines.push("sex_confidence\t".to_owned()); + lines.push("sex_method\t".to_owned()); + lines.push("sex_evidence\t".to_owned()); + } if let Some(source) = &self.source { 
lines.push(format!( "vendor\t{}", @@ -100,3 +120,19 @@ pub(crate) fn render_bool(value: Option) -> &'static str { None => "", } } + +pub(crate) fn render_inferred_sex(value: InferredSex) -> &'static str { + match value { + InferredSex::Male => "male", + InferredSex::Female => "female", + InferredSex::Unknown => "unknown", + } +} + +pub(crate) fn render_sex_confidence(value: SexDetectionConfidence) -> &'static str { + match value { + SexDetectionConfidence::High => "high", + SexDetectionConfidence::Medium => "medium", + SexDetectionConfidence::Low => "low", + } +} diff --git a/rust/bioscript-formats/src/inspect/sex.rs b/rust/bioscript-formats/src/inspect/sex.rs new file mode 100644 index 0000000..4c931ac --- /dev/null +++ b/rust/bioscript-formats/src/inspect/sex.rs @@ -0,0 +1,494 @@ +use std::{ + io::{BufRead, BufReader, Cursor, Read}, + path::Path, +}; + +use bioscript_core::RuntimeError; +use flate2::read::MultiGzDecoder; +use zip::ZipArchive; + +use super::{DetectedKind, split_fields}; + +const MAX_SEX_DETECTION_LINES: usize = 50_000_000; +const MAX_ZIP_ENTRY_BYTES: u64 = 256 * 1024 * 1024; +const MALE_SPECIFIC_Y_MARKERS: &[&str] = &[ + "rs11575897", + "rs2534636", + "i3000043", + "i3000045", + "i4000162", + "rs13303871", + "rs35284970", + "rs3895", + "i4000120", + "i4000121", + "i4000123", + "rs13447361", + "rs2267801", + "rs2267802", + "rs9786142", + "i4000099", + "i4000174", + "i4000095", + "i4000052", + "i4000102", +]; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum InferredSex { + Male, + Female, + Unknown, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SexDetectionConfidence { + High, + Medium, + Low, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SexInference { + pub sex: InferredSex, + pub confidence: SexDetectionConfidence, + pub method: String, + pub evidence: Vec, +} + +#[derive(Debug, Default)] +struct SexStats { + total_y_snps: usize, + called_y_snps: usize, + male_markers_found: usize, + male_markers_called: usize, 
+ y_examples: Vec, + x_non_par_sites: usize, + x_haploid_gt_sites: usize, + x_diploid_gt_sites: usize, + x_het_gt_sites: usize, +} + +pub(crate) fn infer_sex_from_path( + path: &Path, + kind: DetectedKind, +) -> Result { + if !supports_sex_detection(kind) { + return Ok(unsupported_sex_inference()); + } + let lower = path.to_string_lossy().to_ascii_lowercase(); + if lower.ends_with(".zip") { + let file = std::fs::File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open {}: {err}", path.display())))?; + let mut archive = ZipArchive::new(file).map_err(|err| { + RuntimeError::Io(format!("failed to read zip {}: {err}", path.display())) + })?; + let entry_name = select_sex_detection_zip_entry(&mut archive)?; + let mut entry = archive.by_name(&entry_name).map_err(|err| { + RuntimeError::Io(format!( + "failed to open zip entry {entry_name} in {}: {err}", + path.display() + )) + })?; + let mut bytes = Vec::new(); + std::io::Read::by_ref(&mut entry) + .take(MAX_ZIP_ENTRY_BYTES.saturating_add(1)) + .read_to_end(&mut bytes) + .map_err(|err| { + RuntimeError::Io(format!("failed to read zip entry {entry_name}: {err}")) + })?; + if u64::try_from(bytes.len()).unwrap_or(u64::MAX) > MAX_ZIP_ENTRY_BYTES { + return Err(RuntimeError::InvalidArguments(format!( + "zip entry {entry_name} exceeds sex detection limit of {MAX_ZIP_ENTRY_BYTES} bytes" + ))); + } + return infer_sex_from_bytes(&entry_name, &bytes, kind); + } + + let file = std::fs::File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open {}: {err}", path.display())))?; + if lower.ends_with(".vcf.gz") { + return infer_sex_from_reader(BufReader::new(MultiGzDecoder::new(file)), kind); + } + infer_sex_from_reader(BufReader::new(file), kind) +} + +pub(crate) fn infer_sex_from_bytes( + name: &str, + bytes: &[u8], + kind: DetectedKind, +) -> Result { + if !supports_sex_detection(kind) { + return Ok(unsupported_sex_inference()); + } + let lower = name.to_ascii_lowercase(); + if 
lower.ends_with(".vcf.gz") { + return infer_sex_from_reader( + BufReader::new(MultiGzDecoder::new(Cursor::new(bytes))), + kind, + ); + } + infer_sex_from_reader(BufReader::new(Cursor::new(bytes)), kind) +} + +pub(crate) fn infer_sex_from_zip_bytes( + bytes: &[u8], + selected_entry: &str, + kind: DetectedKind, +) -> Result { + let mut archive = ZipArchive::new(Cursor::new(bytes)) + .map_err(|err| RuntimeError::Io(format!("failed to read zip bytes: {err}")))?; + let mut entry = archive.by_name(selected_entry).map_err(|err| { + RuntimeError::Io(format!( + "failed to open zip entry {selected_entry} from bytes: {err}" + )) + })?; + let mut entry_bytes = Vec::new(); + Read::by_ref(&mut entry) + .take(MAX_ZIP_ENTRY_BYTES.saturating_add(1)) + .read_to_end(&mut entry_bytes) + .map_err(|err| { + RuntimeError::Io(format!("failed to read zip entry {selected_entry}: {err}")) + })?; + if u64::try_from(entry_bytes.len()).unwrap_or(u64::MAX) > MAX_ZIP_ENTRY_BYTES { + return Err(RuntimeError::InvalidArguments(format!( + "zip entry {selected_entry} exceeds sex detection limit of {MAX_ZIP_ENTRY_BYTES} bytes" + ))); + } + infer_sex_from_bytes(selected_entry, &entry_bytes, kind) +} + +pub(crate) fn infer_sex_from_text_lines( + lines: &[String], + kind: DetectedKind, +) -> Result { + let mut stats = SexStats::default(); + for line in lines { + update_stats_from_line(&mut stats, line, kind); + } + Ok(classify_stats(&stats, kind)) +} + +fn infer_sex_from_reader( + mut reader: R, + kind: DetectedKind, +) -> Result { + let mut stats = SexStats::default(); + let mut line = String::new(); + for _ in 0..MAX_SEX_DETECTION_LINES { + line.clear(); + let bytes = reader + .read_line(&mut line) + .map_err(|err| RuntimeError::Io(format!("failed to scan sex markers: {err}")))?; + if bytes == 0 { + break; + } + update_stats_from_line(&mut stats, line.trim_end_matches(['\n', '\r']), kind); + } + Ok(classify_stats(&stats, kind)) +} + +fn update_stats_from_line(stats: &mut SexStats, line: &str, kind: 
DetectedKind) { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("//") { + return; + } + match kind { + DetectedKind::Vcf => update_vcf_stats(stats, trimmed), + DetectedKind::GenotypeText | DetectedKind::Unknown => { + update_genotype_text_stats(stats, trimmed); + } + _ => {} + } +} + +fn update_genotype_text_stats(stats: &mut SexStats, line: &str) { + let fields = split_fields(line); + if fields.len() < 4 { + return; + } + let rsid = fields[0].trim(); + let chrom = normalize_chrom(fields.get(1).map(String::as_str).unwrap_or_default()); + let genotype = genotype_text_field(&fields); + if chrom != "Y" { + return; + } + stats.total_y_snps += 1; + let called = is_called_genotype_text(genotype); + if called { + stats.called_y_snps += 1; + if stats.y_examples.len() < 5 { + stats.y_examples.push(format!("{rsid}:{genotype}")); + } + } + if MALE_SPECIFIC_Y_MARKERS.contains(&rsid) { + stats.male_markers_found += 1; + if called { + stats.male_markers_called += 1; + } + } +} + +fn update_vcf_stats(stats: &mut SexStats, line: &str) { + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() < 10 { + return; + } + let chrom = normalize_chrom(fields[0]); + let Ok(pos) = fields[1].parse::() else { + return; + }; + let gt = fields[9].split(':').next().unwrap_or_default(); + if chrom == "Y" { + stats.total_y_snps += 1; + if is_called_vcf_gt(gt) { + stats.called_y_snps += 1; + } + return; + } + if chrom != "X" || !is_non_par_x(pos) || !is_called_vcf_gt(gt) { + return; + } + stats.x_non_par_sites += 1; + let allele_count = vcf_gt_allele_count(gt); + if allele_count == 1 { + stats.x_haploid_gt_sites += 1; + } else if allele_count == 2 { + stats.x_diploid_gt_sites += 1; + if is_vcf_gt_het(gt) { + stats.x_het_gt_sites += 1; + } + } +} + +fn classify_stats(stats: &SexStats, kind: DetectedKind) -> SexInference { + match kind { + DetectedKind::Vcf => classify_vcf_stats(stats), + DetectedKind::GenotypeText | 
DetectedKind::Unknown => classify_y_fingerprint_stats(stats), + _ => unsupported_sex_inference(), + } +} + +fn supports_sex_detection(kind: DetectedKind) -> bool { + matches!( + kind, + DetectedKind::Vcf | DetectedKind::GenotypeText | DetectedKind::Unknown + ) +} + +fn unsupported_sex_inference() -> SexInference { + SexInference { + sex: InferredSex::Unknown, + confidence: SexDetectionConfidence::Low, + method: "unsupported_source_type".to_owned(), + evidence: vec!["sex detection currently supports genotype text and VCF inputs".to_owned()], + } +} + +fn classify_y_fingerprint_stats(stats: &SexStats) -> SexInference { + let (sex, confidence) = if stats.called_y_snps > 500 { + (InferredSex::Male, SexDetectionConfidence::High) + } else if stats.called_y_snps > 100 && stats.male_markers_called > 10 { + (InferredSex::Male, SexDetectionConfidence::Medium) + } else if stats.total_y_snps > 1000 && stats.called_y_snps < 10 && stats.male_markers_called < 3 + { + ( + InferredSex::Female, + if stats.called_y_snps == 0 { + SexDetectionConfidence::High + } else { + SexDetectionConfidence::Medium + }, + ) + } else if stats.called_y_snps > 50 || stats.male_markers_called > 5 { + (InferredSex::Male, SexDetectionConfidence::Medium) + } else if stats.called_y_snps < 10 && stats.total_y_snps > 0 { + (InferredSex::Female, SexDetectionConfidence::Medium) + } else { + (InferredSex::Unknown, SexDetectionConfidence::Low) + }; + SexInference { + sex, + confidence, + method: "y_fingerprint".to_owned(), + evidence: y_fingerprint_evidence(stats), + } +} + +fn classify_vcf_stats(stats: &SexStats) -> SexInference { + let (sex, confidence) = if stats.called_y_snps > 500 + || (stats.x_haploid_gt_sites >= 20 && stats.x_diploid_gt_sites == 0) + { + (InferredSex::Male, SexDetectionConfidence::High) + } else if stats.x_non_par_sites >= 50 + && stats.x_diploid_gt_sites > 0 + && stats.x_het_gt_sites * 100 / stats.x_diploid_gt_sites.max(1) >= 2 + { + (InferredSex::Female, SexDetectionConfidence::Medium) 
/// Canonicalises a chromosome label for comparison against `"X"` / `"Y"`.
///
/// Surrounding whitespace is removed, any leading `chr` prefix is dropped, and
/// the remainder is upper-cased (ASCII only). The prefix match is
/// case-insensitive, so `chrY`, `CHRY`, `ChrY` and `y` all normalise to `"Y"`.
/// Repeated prefixes are stripped as well, matching the behaviour of the
/// previous `trim_start_matches`-based implementation.
fn normalize_chrom(value: &str) -> String {
    let mut name = value.trim();
    // `get(..3)` yields `None` for short input or when byte 3 is not a char
    // boundary, so the slice below can never split a multi-byte character.
    while let Some(prefix) = name.get(..3) {
        if !prefix.eq_ignore_ascii_case("chr") {
            break;
        }
        name = &name[3..];
    }
    name.to_ascii_uppercase()
}
/// Returns `true` when a VCF `GT` value is a complete call.
///
/// A GT is treated as called when, after trimming whitespace, every allele
/// separated by `/` or `|` is a non-empty decimal allele index (`0`, `1`,
/// `0/1`, `1|2`, ...). Missing alleles (`.`), partially missing calls
/// (`0/.`), empty strings and malformed values such as `0/` or `A/T` are all
/// reported as not called, so they cannot leak into the haploid/diploid
/// tallies built from this predicate.
fn is_called_vcf_gt(value: &str) -> bool {
    // Splitting an empty string yields a single empty allele, which the
    // non-empty check below rejects, so no separate emptiness test is needed.
    value.trim().split(['/', '|']).all(|allele| {
        !allele.is_empty() && allele.bytes().all(|byte| byte.is_ascii_digit())
    })
}
infer_sex_from_text_lines(&male, DetectedKind::GenotypeText).unwrap(); + assert_eq!(result.sex, InferredSex::Male); + assert_eq!(result.confidence, SexDetectionConfidence::Medium); + + let female: Vec = (0..1001) + .map(|idx| format!("rs{idx}\tY\t{idx}\t--")) + .collect(); + let result = infer_sex_from_text_lines(&female, DetectedKind::GenotypeText).unwrap(); + assert_eq!(result.sex, InferredSex::Female); + assert_eq!(result.confidence, SexDetectionConfidence::High); + } + + #[test] + fn y_fingerprint_uses_genotype_column_not_array_metrics() { + let lines: Vec = (0..1001) + .map(|idx| format!("rs{idx}\tY\t{idx}\t--\t0\t0.5\t-4.0")) + .chain([ + "rs11575897\tY\t2001\t--\t0\t0.5\t-4.2".to_owned(), + "rs2534636\tY\t2002\t--\t0\t0.9\t-2.6".to_owned(), + ]) + .collect(); + let result = infer_sex_from_text_lines(&lines, DetectedKind::GenotypeText).unwrap(); + assert_eq!(result.sex, InferredSex::Female); + assert_eq!(result.confidence, SexDetectionConfidence::High); + assert!(result.evidence.iter().any(|item| item == "called_y_snps=0")); + } + + #[test] + fn vcf_non_par_x_gt_detects_diploid_het_signal() { + let lines = vec![ + "chrX\t3000000\t.\tC\tT\t.\tPASS\t.\tGT\t0/1".to_owned(), + "chrX\t4000000\t.\tC\tT\t.\tPASS\t.\tGT\t0/0".to_owned(), + "chrX\t5000000\t.\tC\tT\t.\tPASS\t.\tGT\t1/1".to_owned(), + ]; + let result = infer_sex_from_text_lines(&lines, DetectedKind::Vcf).unwrap(); + assert_eq!(result.method, "vcf_non_par_x_gt_y_count"); + assert!( + result + .evidence + .iter() + .any(|item| item == "x_het_gt_sites=1") + ); + } +} diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index ae050f8..e38057b 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -18,7 +18,7 @@ pub use genotype::{ observe_cram_indel_with_reader, observe_cram_snp_with_reader, observe_vcf_snp_with_reader, }; pub use inspect::{ - DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions, - SourceMetadata, 
inspect_bytes, inspect_file, + DetectedKind, DetectionConfidence, FileContainer, FileInspection, InferredSex, InspectOptions, + SexDetectionConfidence, SexInference, SourceMetadata, inspect_bytes, inspect_file, }; pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; diff --git a/rust/bioscript-formats/tests/file_formats/delimited.rs b/rust/bioscript-formats/tests/file_formats/delimited.rs index 77448d8..fd9a733 100644 --- a/rust/bioscript-formats/tests/file_formats/delimited.rs +++ b/rust/bioscript-formats/tests/file_formats/delimited.rs @@ -89,7 +89,7 @@ fn delimited_parser_handles_space_delimited_rows_without_headers_and_inline_comm let store = GenotypeStore::from_file(&path).unwrap(); - assert_eq!(store.get("rsSpace").unwrap().as_deref(), Some("TC")); + assert_eq!(store.get("rsSpace").unwrap().as_deref(), Some("CT")); let observation = store .lookup_variant(&VariantSpec { grch38: Some(bioscript_core::GenomicLocus { diff --git a/rust/bioscript-formats/tests/file_formats/vcf.rs b/rust/bioscript-formats/tests/file_formats/vcf.rs index fe16072..6eb9a4f 100644 --- a/rust/bioscript-formats/tests/file_formats/vcf.rs +++ b/rust/bioscript-formats/tests/file_formats/vcf.rs @@ -29,7 +29,7 @@ fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { }) .unwrap(); - assert_eq!(observation.genotype.as_deref(), Some("GC")); + assert_eq!(observation.genotype.as_deref(), Some("CG")); assert_eq!(observation.assembly, Some(bioscript_core::Assembly::Grch38)); assert_eq!(observation.evidence[0], "resolved by locus chr1:1000"); assert!( @@ -69,7 +69,7 @@ fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { ..VariantSpec::default() }) .unwrap(); - assert_eq!(deletion.genotype.as_deref(), Some("ID")); + assert_eq!(deletion.genotype.as_deref(), Some("DI")); assert_eq!(deletion.assembly, Some(bioscript_core::Assembly::Grch37)); assert_eq!(deletion.evidence[0], "resolved by locus 1:99"); assert!( diff --git 
a/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs index 825c72d..d814223 100644 --- a/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs +++ b/rust/bioscript-formats/tests/file_formats/zip_and_fixtures.rs @@ -75,7 +75,7 @@ fn zip_genotype_file_is_auto_detected_and_readable() { let store = GenotypeStore::from_file(&zip_path).unwrap(); assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); - assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("TG")); + assert_eq!(store.get("rs60910145").unwrap().as_deref(), Some("GT")); assert_eq!(store.get("rs71785313").unwrap().as_deref(), Some("II")); } @@ -154,7 +154,7 @@ fn zip_vcf_gz_entry_is_selected_and_read_as_vcf() { let store = GenotypeStore::from_file(&zip_path).unwrap(); assert_eq!(store.backend_name(), "vcf"); - assert_eq!(store.get("rsZipVcfGz").unwrap().as_deref(), Some("GA")); + assert_eq!(store.get("rsZipVcfGz").unwrap().as_deref(), Some("AG")); } #[test] diff --git a/rust/bioscript-schema/src/validator_load.rs b/rust/bioscript-schema/src/validator_load.rs index f3532ca..d34013e 100644 --- a/rust/bioscript-schema/src/validator_load.rs +++ b/rust/bioscript-schema/src/validator_load.rs @@ -159,6 +159,7 @@ pub fn load_panel_manifest(path: &Path) -> Result { Ok(PanelManifest { path: path.to_path_buf(), name: required_non_empty_string(&value, &["name"])?, + label: scalar_at(&value, &["label"]), tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), permissions, downloads, diff --git a/rust/bioscript-schema/src/validator_panel.rs b/rust/bioscript-schema/src/validator_panel.rs index 4ef3c43..e1daccb 100644 --- a/rust/bioscript-schema/src/validator_panel.rs +++ b/rust/bioscript-schema/src/validator_panel.rs @@ -397,9 +397,7 @@ fn variant_spec_from_root(root: &Value) -> Result { let grch37 = locus_from_root(root, "grch37")?; let grch38 = locus_from_root(root, "grch38")?; let reference = scalar_at(root, 
&["alleles", "ref"]); - let alternate = seq_of_strings(root, &["alleles", "observed_alts"]) - .or_else(|| seq_of_strings(root, &["alleles", "alts"])) - .and_then(|alts| alts.first().cloned()); + let alternate = seq_of_strings(root, &["alleles", "alts"]).and_then(|alts| alts.first().cloned()); let deletion_length = value_at(root, &["alleles", "deletion_length"]) .and_then(Value::as_u64) .and_then(|value| usize::try_from(value).ok()); @@ -447,4 +445,3 @@ fn locus_from_root(root: &Value, assembly: &str) -> Result, end, })) } - diff --git a/rust/bioscript-schema/src/validator_parse.rs b/rust/bioscript-schema/src/validator_parse.rs index f31f28f..b959518 100644 --- a/rust/bioscript-schema/src/validator_parse.rs +++ b/rust/bioscript-schema/src/validator_parse.rs @@ -71,6 +71,10 @@ fn parse_panel_interpretations(root: &Value) -> Result, }; interpretations.push(PanelInterpretation { id: mapping_required_string(mapping, "id", idx, key)?, + label: mapping + .get(Value::String("label".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), kind: mapping_required_string(mapping, "kind", idx, key)?, path: mapping_required_string(mapping, "path", idx, key)?, output_format: mapping @@ -198,4 +202,3 @@ fn mapping_required_string( .map(ToOwned::to_owned) .ok_or_else(|| format!("{parent}[{idx}].{field} missing or empty")) } - diff --git a/rust/bioscript-schema/src/validator_types.rs b/rust/bioscript-schema/src/validator_types.rs index 13f7a96..a9466eb 100644 --- a/rust/bioscript-schema/src/validator_types.rs +++ b/rust/bioscript-schema/src/validator_types.rs @@ -109,6 +109,7 @@ pub struct VariantManifest { pub struct PanelManifest { pub path: PathBuf, pub name: String, + pub label: Option, pub tags: Vec, pub permissions: Permissions, pub downloads: Vec, @@ -151,6 +152,7 @@ pub struct PanelMember { #[derive(Debug, Clone, PartialEq, Eq)] pub struct PanelInterpretation { pub id: String, + pub label: Option, pub kind: String, pub path: String, pub output_format: Option,