From 0c36dc8cdf2983c97cabe3e85433374390926d73 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Thu, 7 May 2026 14:28:36 +1000 Subject: [PATCH] fixes for vcf and haploid X issues - more packaging changes --- rust/Cargo.lock | 1 + rust/bioscript-cli/Cargo.toml | 1 + rust/bioscript-cli/src/cli_bootstrap.rs | 2 +- rust/bioscript-cli/src/manifest_runner.rs | 67 +++-- rust/bioscript-cli/src/package.rs | 65 ++++- rust/bioscript-cli/src/package_release.rs | 94 +++++++ rust/bioscript-cli/src/report_options.rs | 19 +- rust/bioscript-formats/src/genotype.rs | 28 +- .../src/genotype/cram_backend.rs | 2 +- .../src/genotype/cram_backend/store.rs | 222 +++++++++++++++- rust/bioscript-formats/src/genotype/types.rs | 24 +- rust/bioscript-formats/src/genotype/vcf.rs | 178 ++++++------- .../src/genotype/vcf/matching.rs | 121 +++++++++ .../src/genotype/vcf_tokens.rs | 6 +- .../src/inspect/heuristics.rs | 11 +- rust/bioscript-formats/src/inspect/sex.rs | 245 +++++++++--------- .../src/inspect/sex/classify.rs | 115 ++++++++ .../tests/file_formats/cram.rs | 2 + .../tests/file_formats/vcf.rs | 125 +++++++++ 19 files changed, 1065 insertions(+), 263 deletions(-) create mode 100644 rust/bioscript-cli/src/package_release.rs create mode 100644 rust/bioscript-formats/src/genotype/vcf/matching.rs create mode 100644 rust/bioscript-formats/src/inspect/sex/classify.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 37a9374..86cd271 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -111,6 +111,7 @@ dependencies = [ "monty", "serde_json", "serde_yaml", + "sha2", "zip", ] diff --git a/rust/bioscript-cli/Cargo.toml b/rust/bioscript-cli/Cargo.toml index 25cb202..db5be65 100644 --- a/rust/bioscript-cli/Cargo.toml +++ b/rust/bioscript-cli/Cargo.toml @@ -15,6 +15,7 @@ bioscript-schema = { path = "../bioscript-schema" } monty = { path = "../../monty/crates/monty" } serde_json = { version = "1.0.133", features = ["preserve_order"] } serde_yaml = "0.9.34" +sha2 = "0.10" zip = { version = "2.2.0", default-features = false, features = ["deflate"] } [lints.clippy] diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs b/rust/bioscript-cli/src/cli_bootstrap.rs index ffd5398..b85e5a5 100644 --- a/rust/bioscript-cli/src/cli_bootstrap.rs +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -57,7 +57,7 @@ fn run_cli() -> Result<(), String> { Ok(()) } -const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; +const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; struct CliOptions { script_path: Option, diff --git a/rust/bioscript-cli/src/manifest_runner.rs b/rust/bioscript-cli/src/manifest_runner.rs index d750ff3..e691d08 100644 --- a/rust/bioscript-cli/src/manifest_runner.rs +++ b/rust/bioscript-cli/src/manifest_runner.rs @@ -122,9 +122,10 @@ fn run_panel_manifest_with_store( participant_id: Option<&str>, filters: &[String], ) -> Result>, String> { - let mut rows = Vec::new(); + let mut rows_by_member: Vec>> = vec![Vec::new(); panel.members.len()]; + let mut variant_entries = Vec::new(); - for member in &panel.members { + for (member_index, member) in panel.members.iter().enumerate() { let Some(path) = &member.path else { return Err("remote panel members are not executable yet".to_owned()); }; @@ -134,21 +135,16 @@ fn run_panel_manifest_with_store( if !matches_filters(&manifest, &resolved, filters) { continue; } - rows.push(run_variant_manifest_with_store( - runtime_root, - &manifest, - store, - participant_id, - )?); + variant_entries.push((member_index, resolved, manifest)); } else if member.kind == "assay" { let assay = load_assay_manifest(&resolved)?; - rows.extend(run_assay_manifest_with_store( + rows_by_member[member_index] = run_assay_manifest_with_store( runtime_root, &assay, store, participant_id, filters, - )?); + )?; } else { return Err(format!( "panel member kind '{}' is not executable", @@ -157,6 +153,29 @@ fn run_panel_manifest_with_store( } } + let observations = store + .lookup_variants( + &variant_entries + .iter() + .map(|(_, _, manifest)| manifest.spec.clone()) + .collect::>(), + ) + .map_err(|err| err.to_string())?; + + for ((member_index, resolved, manifest), observation) in + variant_entries.into_iter().zip(observations) + { + rows_by_member[member_index].push(variant_row( + runtime_root, + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } + + let rows = rows_by_member.into_iter().flatten().collect(); Ok(rows) } @@ -181,7 +200,7 @@ fn run_assay_manifest_with_store( participant_id: Option<&str>, filters: &[String], ) -> Result>, String> { - let mut rows = Vec::new(); + let mut entries = Vec::new(); for member in &assay.members { if member.kind != "variant" { @@ -198,18 +217,32 @@ fn run_assay_manifest_with_store( if !matches_filters(&manifest, &resolved, filters) { continue; } - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| err.to_string())?; - rows.push(variant_row( + entries.push((resolved, manifest)); + } + + let observations = store + .lookup_variants( + &entries + .iter() + .map(|(_, manifest)| manifest.spec.clone()) + .collect::>(), + ) + .map_err(|err| err.to_string())?; + + let rows = entries + .into_iter() + .zip(observations) + .map(|((resolved, manifest), observation)| { + variant_row( runtime_root, &resolved, &manifest.name, &manifest.tags, &observation, participant_id, - )); - } + ) + }) + .collect(); Ok(rows) } diff --git a/rust/bioscript-cli/src/package.rs b/rust/bioscript-cli/src/package.rs index 88c9fde..9bce723 100644 --- a/rust/bioscript-cli/src/package.rs +++ b/rust/bioscript-cli/src/package.rs @@ -6,19 +6,35 @@ const MAX_PACKAGE_FILES: usize = 1000; const MAX_PACKAGE_FILE_BYTES: u64 = 16 * 1024 * 1024; const MAX_PACKAGE_TOTAL_BYTES: u64 = 64 * 1024 * 1024; +include!("package_release.rs"); + fn prepare_package_entrypoint_from_arg( runtime_root: &Path, source: &Path, ) -> Result { let source_text = source.to_string_lossy(); - let package_path = if is_package_url(&source_text) { - download_package_url(runtime_root, &source_text)? + let source_url = if is_package_url(&source_text) { + Some(source_text.to_string()) + } else { + None + }; + let package_path = if let Some(url) = &source_url { + download_package_url(runtime_root, url)? } else { source.to_path_buf() }; if is_package_zip_path(&package_path) { let imported = import_package_zip(runtime_root, &package_path, None)?; Ok(imported.entrypoint) + } else if is_package_release_path(&package_path) { + match package_zip_from_release_manifest(runtime_root, &package_path, source_url.as_deref())? + { + Some(zip_path) => { + let imported = import_package_zip(runtime_root, &zip_path, None)?; + Ok(imported.entrypoint) + } + None => Ok(package_path), + } } else { Ok(package_path) } @@ -47,11 +63,19 @@ fn run_import_package(args: Vec) -> Result<(), String> { .map_or_else(env::current_dir, Ok) .map_err(|err| format!("failed to get current directory: {err}"))?; let source_text = source.to_string_lossy(); - let package_path = if is_package_url(&source_text) { - download_package_url(&runtime_root, &source_text)? + let source_url = if is_package_url(&source_text) { + Some(source_text.to_string()) + } else { + None + }; + let package_path = if let Some(url) = &source_url { + download_package_url(&runtime_root, url)? } else { absolutize(&runtime_root, &source) }; + let package_path = + package_zip_from_release_manifest(&runtime_root, &package_path, source_url.as_deref())? + .unwrap_or(package_path); let imported = import_package_zip(&runtime_root, &package_path, output_dir.as_deref())?; println!("root\t{}", imported.root.display()); println!("entrypoint\t{}", imported.entrypoint.display()); @@ -170,6 +194,23 @@ fn load_package_descriptor(root: &Path) -> Result { .ok_or_else(|| { format!("package descriptor {} is missing schema", path.display()) })?; + if matches!( + schema, + "bioscript:panel:1.0" + | "bioscript:assay:1.0" + | "bioscript:variant:1.0" + | "bioscript:variant" + ) { + let package_name = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("name".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned); + return Ok(PackageDescriptor { + entrypoint: PathBuf::from(PACKAGE_DESCRIPTOR), + name: package_name, + }); + } if schema != "bioscript:package:1.0" { return Err(format!( "package descriptor {} has unsupported schema '{schema}'", @@ -352,11 +393,13 @@ fn download_package_url(runtime_root: &Path, url: &str) -> Result bool { .and_then(|ext| ext.to_str()) .is_some_and(|ext| ext.eq_ignore_ascii_case("zip")) } + +fn is_package_release_path(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| matches!(ext.to_ascii_lowercase().as_str(), "yaml" | "yml")) +} diff --git a/rust/bioscript-cli/src/package_release.rs b/rust/bioscript-cli/src/package_release.rs new file mode 100644 index 0000000..5e34895 --- /dev/null +++ b/rust/bioscript-cli/src/package_release.rs @@ -0,0 +1,94 @@ +fn package_zip_from_release_manifest( + runtime_root: &Path, + path: &Path, + source_url: Option<&str>, +) -> Result, String> { + if !is_package_release_path(path) || !path.exists() { + return Ok(None); + } + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read package release {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse package release {}: {err}", path.display()))?; + let schema = yaml_string(&value, "schema"); + if schema.as_deref() != Some("bioscript:package-release:1.0") { + return Ok(None); + } + let artifact = value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("artifact".to_owned()))) + .and_then(serde_yaml::Value::as_mapping) + .ok_or_else(|| format!("package release {} is missing artifact", path.display()))?; + let artifact_path = artifact + .get(serde_yaml::Value::String("path".to_owned())) + .and_then(serde_yaml::Value::as_str); + let artifact_url = artifact + .get(serde_yaml::Value::String("url".to_owned())) + .and_then(serde_yaml::Value::as_str); + let zip_path = if let Some(url) = artifact_url { + download_package_url(runtime_root, url)? + } else if let Some(relative) = artifact_path { + if let Some(base_url) = source_url { + download_package_url(runtime_root, &join_url(base_url, relative))? + } else { + path.parent() + .ok_or_else(|| format!("package release has no parent: {}", path.display()))? + .join(checked_relative_package_path(relative)?) + } + } else { + return Err(format!( + "package release {} artifact needs path or url", + path.display() + )); + }; + if let Some(expected) = artifact + .get(serde_yaml::Value::String("sha256".to_owned())) + .and_then(serde_yaml::Value::as_str) + { + let actual = sha256_file(&zip_path)?; + if actual != expected { + return Err(format!( + "package artifact sha256 mismatch for {}: expected {expected}, got {actual}", + zip_path.display() + )); + } + } + Ok(Some(zip_path)) +} + +fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option { + value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String(key.to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) +} + +fn sha256_file(path: &Path) -> Result { + use sha2::{Digest, Sha256}; + + let mut file = fs::File::open(path) + .map_err(|err| format!("failed to open artifact {}: {err}", path.display()))?; + let mut digest = Sha256::new(); + let mut buffer = vec![0_u8; 1024 * 64]; + loop { + let n = std::io::Read::read(&mut file, &mut buffer) + .map_err(|err| format!("failed to read artifact {}: {err}", path.display()))?; + if n == 0 { + break; + } + digest.update(&buffer[..n]); + } + Ok(format!("{:x}", digest.finalize())) +} + +fn join_url(base_url: &str, relative: &str) -> String { + if relative.starts_with("https://") || relative.starts_with("http://") { + return relative.to_owned(); + } + let base = base_url.split('?').next().unwrap_or(base_url); + match base.rsplit_once('/') { + Some((prefix, _)) => format!("{prefix}/{relative}"), + None => relative.to_owned(), + } +} diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs index ec560d6..08d9101 100644 --- a/rust/bioscript-cli/src/report_options.rs +++ b/rust/bioscript-cli/src/report_options.rs @@ -255,12 +255,13 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { if let Some(sample_sex) = options.sample_sex { input_inspection.inferred_sex = Some(explicit_sample_sex_inference(sample_sex)); } + let input_loader = loader_with_inspection(&options.loader, &input_inspection); let rows = run_manifest_rows_for_report( &options.root, &options.manifest_path, input_file, &participant_id, - &options.loader, + &input_loader, &options.filters, )?; let input_observations = rows @@ -280,7 +281,7 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { runtime_root: &options.root, input_file, participant_id: &participant_id, - loader: &options.loader, + loader: &input_loader, output_dir: &options.output_dir, filters: &options.filters, max_duration_ms: options.analysis_max_duration_ms, @@ -317,6 +318,20 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { Ok(()) } +fn loader_with_inspection( + base: &GenotypeLoadOptions, + inspection: &bioscript_formats::FileInspection, +) -> GenotypeLoadOptions { + let mut loader = base.clone(); + loader.assembly = inspection.assembly.or(loader.assembly); + loader.inferred_sex = inspection + .inferred_sex + .as_ref() + .map(|inference| inference.sex) + .or(loader.inferred_sex); + loader +} + fn open_app_html_report_if_requested(options: &AppReportOptions) { if options.open_report && let Err(err) = open_html_report(&options.output_dir.join("index.html")) diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index d040e36..c082921 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -30,12 +30,12 @@ use cram_backend::{ infer_snp_genotype, len_as_i64, normalize_pileup_base, record_overlaps_locus, spans_position, }; pub use cram_backend::{observe_cram_indel_with_reader, observe_cram_snp_with_reader}; -#[cfg(test)] -use delimited::{ - Delimiter, GENOTYPE_ALIASES, parse_streaming_row, split_csv_line, strip_bom, - strip_inline_comment, +pub(crate) use delimited::{ + DelimitedColumnIndexes, Delimiter, detect_delimiter, parse_streaming_row, }; -use delimited::{RowParser, detect_delimiter, scan_delimited_variants}; +#[cfg(test)] +use delimited::{GENOTYPE_ALIASES, split_csv_line, strip_bom, strip_inline_comment}; +use delimited::{RowParser, scan_delimited_variants}; #[cfg(test)] use delimited::{ build_column_indexes, default_column_indexes, find_header_index, looks_like_header_fields, @@ -340,6 +340,9 @@ impl GenotypeStore { if let QueryBackend::Vcf(backend) = &self.backend { return backend.lookup_variants(variants); } + if let QueryBackend::Cram(backend) = &self.backend { + return backend.lookup_variants(variants); + } let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); @@ -718,6 +721,7 @@ mod tests { genotype_from_vcf_gt("./1", "A", &["G"]).as_deref(), Some("--") ); + assert_eq!(genotype_from_vcf_gt("1", "C", &["T"]).as_deref(), Some("T")); assert_eq!( genotype_from_vcf_gt("bad", "A", &["G"]).as_deref(), Some("--") @@ -740,6 +744,20 @@ mod tests { detect_vcf_assembly(Path::new("sample.vcf"), &["##assembly=GRCh38".to_owned()]), Some(Assembly::Grch38) ); + assert_eq!( + detect_vcf_assembly( + Path::new("sample.vcf"), + &["##contig=".to_owned()] + ), + Some(Assembly::Grch38) + ); + assert_eq!( + detect_vcf_assembly( + Path::new("sample.vcf"), + &["##contig=".to_owned()] + ), + Some(Assembly::Grch37) + ); assert_eq!( detect_vcf_assembly(Path::new("sample.b37.vcf"), &[]), Some(Assembly::Grch37) diff --git a/rust/bioscript-formats/src/genotype/cram_backend.rs b/rust/bioscript-formats/src/genotype/cram_backend.rs index a712127..01787f3 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend.rs @@ -287,7 +287,7 @@ fn observe_snp_pileup( ) } -fn snp_pileup_with_reader( +pub(super) fn snp_pileup_with_reader( reader: &mut cram::io::indexed_reader::IndexedReader, label: &str, locus: &GenomicLocus, diff --git a/rust/bioscript-formats/src/genotype/cram_backend/store.rs b/rust/bioscript-formats/src/genotype/cram_backend/store.rs index 2b6a27b..7af7aca 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend/store.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend/store.rs @@ -8,13 +8,15 @@ use bioscript_core::{ Assembly, GenomicLocus, RuntimeError, VariantKind, VariantObservation, VariantSpec, }; +use noodles::cram; + use crate::alignment::{self, AlignmentOpKind}; use super::{ anchor_window, choose_variant_locus, classify_expected_indel, describe_copy_number_decision_rule, describe_locus, describe_snp_decision_rule, first_base, indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, observe_snp_pileup, - record_overlaps_locus, spans_position, + record_overlaps_locus, snp_pileup_with_reader, spans_position, }; use crate::genotype::{describe_query, types::CramBackend}; @@ -75,6 +77,87 @@ impl CramBackend { Ok(observation) } + pub(crate) fn lookup_variants( + &self, + variants: &[VariantSpec], + ) -> Result, RuntimeError> { + let Some(reference_file) = self.options.reference_file.as_ref() else { + return Err(RuntimeError::Unsupported(format!( + "backend '{}' cannot satisfy CRAM variant queries for {} without --reference-file", + self.backend_name(), + self.path.display() + ))); + }; + + let repository = alignment::build_reference_repository(reference_file)?; + let mut reader = + alignment::build_cram_indexed_reader_from_path(&self.path, &self.options, repository)?; + let label = self.path.display().to_string(); + + let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); + indexed.sort_by_cached_key(|(_, variant)| crate::genotype::variant_sort_key(variant)); + + let mut results = vec![VariantObservation::default(); variants.len()]; + for (idx, variant) in indexed { + let Some((assembly, locus)) = choose_variant_locus(variant, reference_file) else { + results[idx] = self.unsupported_locus_observation(variant, reference_file); + continue; + }; + results[idx] = + self.observe_with_reader(&mut reader, &label, variant, assembly, &locus)?; + } + + Ok(results) + } + + fn unsupported_locus_observation( + &self, + variant: &VariantSpec, + reference_file: &Path, + ) -> VariantObservation { + let mut evidence = format!( + "backend '{}' cannot satisfy query '{}' for {} using reference {}", + self.backend_name(), + describe_query(variant), + self.path.display(), + reference_file.display() + ); + evidence.push_str(". This backend needs GRCh37/GRCh38 coordinates, not only rsIDs"); + VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + evidence: vec![evidence], + ..VariantObservation::default() + } + } + + fn observe_with_reader( + &self, + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + ) -> Result { + match variant.kind.unwrap_or(VariantKind::Other) { + VariantKind::Snp => { + self.observe_snp_with_reader(reader, label, variant, assembly, locus) + } + VariantKind::Deletion => { + self.observe_deletion_with_reader(reader, label, variant, assembly, locus) + } + VariantKind::Insertion | VariantKind::Indel => { + self.observe_indel_with_reader(reader, label, variant, assembly, locus) + } + VariantKind::Other => Err(RuntimeError::Unsupported(format!( + "backend '{}' does not yet support {:?} observation for {}", + self.backend_name(), + variant.kind.unwrap_or(VariantKind::Other), + self.path.display() + ))), + } + } + fn observe_snp( &self, variant: &VariantSpec, @@ -128,6 +211,58 @@ impl CramBackend { }) } + fn observe_snp_with_reader( + &self, + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + ) -> Result { + let reference = variant + .reference + .as_deref() + .and_then(first_base) + .ok_or_else(|| { + RuntimeError::InvalidArguments("SNP variant requires ref/reference".to_owned()) + })?; + let alternate = variant + .alternate + .as_deref() + .and_then(first_base) + .ok_or_else(|| { + RuntimeError::InvalidArguments("SNP variant requires alt/alternate".to_owned()) + })?; + + let pileup = snp_pileup_with_reader( + reader, + label, + locus, + reference, + alternate, + self.options.allow_reference_md5_mismatch, + )?; + let ref_count = pileup.filtered_ref_count; + let alt_count = pileup.filtered_alt_count; + let depth = pileup.filtered_depth; + let evidence = pileup.evidence_lines(&describe_locus(locus), locus.start); + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly: Some(assembly), + genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: pileup.raw_base_counts, + decision: Some(describe_snp_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence, + }) + } + fn observe_deletion( &self, variant: &VariantSpec, @@ -187,6 +322,65 @@ impl CramBackend { }) } + fn observe_deletion_with_reader( + &self, + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + ) -> Result { + let deletion_length = variant.deletion_length.ok_or_else(|| { + RuntimeError::InvalidArguments("deletion variant requires deletion_length".to_owned()) + })?; + let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned()); + let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned()); + let anchor_pos = locus.start.saturating_sub(1); + + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + + alignment::for_each_cram_record_with_reader( + reader, + label, + &anchor_window(locus), + |record| { + if record.is_unmapped || !spans_position(&record, anchor_pos) { + return Ok(true); + } + depth += 1; + match indel_at_anchor(&record, anchor_pos) { + Some((AlignmentOpKind::Deletion, len)) if len == deletion_length => { + alt_count += 1; + } + _ => ref_count += 1, + } + Ok(true) + }, + )?; + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly: Some(assembly), + genotype: infer_copy_number_genotype( + &reference, &alternate, ref_count, alt_count, depth, + ), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + &reference, &alternate, ref_count, alt_count, depth, + )), + evidence: vec![format!( + "observed deletion anchor {}:{} len={} depth={} ref_count={} alt_count={}", + locus.chrom, anchor_pos, deletion_length, depth, ref_count, alt_count + )], + }) + } + fn observe_indel( &self, variant: &VariantSpec, @@ -263,4 +457,30 @@ impl CramBackend { )], }) } + + fn observe_indel_with_reader( + &self, + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + variant: &VariantSpec, + assembly: Assembly, + locus: &GenomicLocus, + ) -> Result { + let reference = variant.reference.clone().ok_or_else(|| { + RuntimeError::InvalidArguments("indel variant requires ref/reference".to_owned()) + })?; + let alternate = variant.alternate.clone().ok_or_else(|| { + RuntimeError::InvalidArguments("indel variant requires alt/alternate".to_owned()) + })?; + + super::observe_cram_indel_with_reader( + reader, + label, + locus, + &reference, + &alternate, + variant.rsids.first().cloned(), + Some(assembly), + ) + } } diff --git a/rust/bioscript-formats/src/genotype/types.rs b/rust/bioscript-formats/src/genotype/types.rs index 756ce96..aad9abe 100644 --- a/rust/bioscript-formats/src/genotype/types.rs +++ b/rust/bioscript-formats/src/genotype/types.rs @@ -1,5 +1,9 @@ use std::{collections::HashMap, path::PathBuf, str::FromStr}; +use bioscript_core::Assembly; + +use crate::inspect::InferredSex; + #[derive(Debug, Clone)] pub struct GenotypeStore { pub(crate) backend: QueryBackend, @@ -72,11 +76,29 @@ impl FromStr for GenotypeSourceFormat { } } -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct GenotypeLoadOptions { pub format: Option, pub input_index: Option, pub reference_file: Option, pub reference_index: Option, + pub assembly: Option, + pub inferred_sex: Option, + pub impute_vcf_missing_as_reference: bool, pub allow_reference_md5_mismatch: bool, } + +impl Default for GenotypeLoadOptions { + fn default() -> Self { + Self { + format: None, + input_index: None, + reference_file: None, + reference_index: None, + assembly: None, + inferred_sex: None, + impute_vcf_missing_as_reference: true, + allow_reference_md5_mismatch: false, + } + } +} diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index 8ef2fcb..bf001ee 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -8,18 +8,22 @@ use std::{ use noodles::bgzf; use noodles::csi; -use bioscript_core::{ - Assembly, GenomicLocus, RuntimeError, VariantKind, VariantObservation, VariantSpec, -}; +use bioscript_core::{Assembly, RuntimeError, VariantKind, VariantObservation, VariantSpec}; use crate::alignment; +use crate::inspect::detect_assembly; use super::{ describe_query, genotype_from_vcf_gt, is_bgzf_path, types::VcfBackend, variant_sort_key, }; +mod matching; mod reader; +pub(crate) use matching::{ + choose_variant_locus_for_assembly, normalize_chromosome_name, vcf_row_matches_variant, +}; +use matching::{first_single_base_allele, imputed_reference_observation}; pub use reader::observe_vcf_snp_with_reader; #[derive(Debug, Clone)] @@ -40,8 +44,10 @@ pub(crate) fn scan_vcf_variants( let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); - let mut probe_lines = Vec::new(); - let detected_assembly = { + let detected_assembly = if let Some(assembly) = backend.options.assembly { + Some(assembly) + } else { + let mut probe_lines = Vec::new(); let file = File::open(&backend.path).map_err(|err| { RuntimeError::Io(format!( "failed to open VCF file {}: {err}", @@ -81,6 +87,7 @@ pub(crate) fn scan_vcf_variants( let mut coord_targets: HashMap<(String, i64), Vec> = HashMap::new(); let mut results = vec![VariantObservation::default(); variants.len()]; let mut unresolved = variants.len(); + let label = backend.path.display().to_string(); for (idx, variant) in &indexed { for rsid in &variant.rsids { @@ -141,14 +148,41 @@ pub(crate) fn scan_vcf_variants( for (idx, variant) in indexed { if results[idx].genotype.is_none() { - results[idx] = VariantObservation { - backend: backend.backend_name().to_owned(), - assembly: detected_assembly, - evidence: vec![format!( - "no matching rsid or locus found for {}", - describe_query(variant) - )], - ..VariantObservation::default() + results[idx] = if backend.options.impute_vcf_missing_as_reference { + choose_variant_locus_for_assembly(variant, detected_assembly) + .and_then(|locus| { + imputed_reference_observation( + backend.backend_name(), + &label, + variant, + &locus, + detected_assembly, + backend.options.inferred_sex, + &format!( + "no matching rsid or locus found for {}", + describe_query(variant) + ), + ) + }) + .unwrap_or_else(|| VariantObservation { + backend: backend.backend_name().to_owned(), + assembly: detected_assembly, + evidence: vec![format!( + "no matching rsid or locus found for {}", + describe_query(variant) + )], + ..VariantObservation::default() + }) + } else { + VariantObservation { + backend: backend.backend_name().to_owned(), + assembly: detected_assembly, + evidence: vec![format!( + "no matching rsid or locus found for {}", + describe_query(variant) + )], + ..VariantObservation::default() + } }; } } @@ -163,7 +197,10 @@ pub(crate) fn lookup_indexed_vcf_variants( let Some(input_index) = backend.options.input_index.as_ref() else { return Ok(None); }; - let detected_assembly = detect_vcf_assembly_from_path(&backend.path)?; + let detected_assembly = match backend.options.assembly { + Some(assembly) => Some(assembly), + None => detect_vcf_assembly_from_path(&backend.path)?, + }; let mut indexed_variants = Vec::with_capacity(variants.len()); for (idx, variant) in variants.iter().enumerate() { let Some(locus) = choose_variant_locus_for_assembly(variant, detected_assembly) else { @@ -199,7 +236,7 @@ pub(crate) fn lookup_indexed_vcf_variants( let mut results = vec![VariantObservation::default(); variants.len()]; for (idx, variant, locus, reference, alternate) in indexed_variants { - results[idx] = observe_vcf_snp_with_reader( + let observation = observe_vcf_snp_with_reader( &mut indexed, &backend.path.display().to_string(), &locus, @@ -208,6 +245,26 @@ pub(crate) fn lookup_indexed_vcf_variants( variant.rsids.first().cloned(), detected_assembly, )?; + results[idx] = if backend.options.impute_vcf_missing_as_reference + && observation.genotype.is_none() + && observation + .evidence + .iter() + .any(|line| line.contains("no VCF record at")) + { + imputed_reference_observation( + backend.backend_name(), + &backend.path.display().to_string(), + variant, + &locus, + detected_assembly, + backend.options.inferred_sex, + &observation.evidence.join(" | "), + ) + .unwrap_or(observation) + } else { + observation + }; } Ok(Some(results)) } @@ -242,13 +299,6 @@ pub(crate) fn detect_vcf_assembly_from_path(path: &Path) -> Result) -> Option { - let value = value?; - let mut chars = value.chars(); - let base = chars.next()?; - chars.next().is_none().then_some(base) -} - struct VcfResolutionTargets<'a> { variants: &'a [VariantSpec], detected_assembly: Option, @@ -393,87 +443,5 @@ pub(crate) fn extract_vcf_sample_genotype( } pub(crate) fn detect_vcf_assembly(path: &Path, probe_lines: &[String]) -> Option { - let combined = probe_lines.join("\n").to_ascii_lowercase(); - if combined.contains("assembly=b37") - || combined.contains("assembly=grch37") - || combined.contains("assembly=hg19") - || combined.contains("reference=grch37") - || combined.contains("reference=hg19") - { - return Some(Assembly::Grch37); - } - if combined.contains("assembly=b38") - || combined.contains("assembly=grch38") - || combined.contains("assembly=hg38") - || combined.contains("reference=grch38") - || combined.contains("reference=hg38") - { - return Some(Assembly::Grch38); - } - - let lower = path.to_string_lossy().to_ascii_lowercase(); - if lower.contains("grch37") || lower.contains("hg19") || lower.contains("b37") { - Some(Assembly::Grch37) - } else if lower.contains("grch38") || lower.contains("hg38") || lower.contains("b38") { - Some(Assembly::Grch38) - } else { - None - } -} - -pub(crate) fn choose_variant_locus_for_assembly( - variant: &VariantSpec, - assembly: Option, -) -> Option { - match assembly { - Some(Assembly::Grch37) => variant.grch37.clone().or_else(|| variant.grch38.clone()), - Some(Assembly::Grch38) => variant.grch38.clone().or_else(|| variant.grch37.clone()), - None => variant.grch37.clone().or_else(|| variant.grch38.clone()), - } -} - -pub(crate) fn normalize_chromosome_name(value: &str) -> String { - value.trim().trim_start_matches("chr").to_ascii_lowercase() -} - -pub(crate) fn vcf_row_matches_variant( - row: &ParsedVcfRow, - variant: &VariantSpec, - assembly: Option, -) -> bool { - let Some(locus) = choose_variant_locus_for_assembly(variant, assembly) else { - return false; - }; - - if normalize_chromosome_name(&row.chrom) != normalize_chromosome_name(&locus.chrom) { - return false; - } - - match variant.kind.unwrap_or(VariantKind::Other) { - VariantKind::Snp => { - row.position == locus.start - && variant - .reference - .as_ref() - .is_none_or(|reference| reference.eq_ignore_ascii_case(&row.reference)) - && variant.alternate.as_ref().is_none_or(|alternate| { - row.alternates - .iter() - .any(|candidate| candidate.eq_ignore_ascii_case(alternate)) - }) - } - VariantKind::Deletion => { - let expected_len = variant.deletion_length.unwrap_or(0); - row.position == locus.start.saturating_sub(1) - && row.alternates.iter().any(|alternate| { - let actual_len = row.reference.len().saturating_sub(alternate.len()); - (expected_len == 0 || actual_len == expected_len) - && alternate.len() < row.reference.len() - }) - } - VariantKind::Insertion | VariantKind::Indel => { - row.position == locus.start.saturating_sub(1) - } - VariantKind::Other => row.position == locus.start, - } + detect_assembly(&path.to_string_lossy().to_ascii_lowercase(), probe_lines) } diff --git a/rust/bioscript-formats/src/genotype/vcf/matching.rs b/rust/bioscript-formats/src/genotype/vcf/matching.rs new file mode 100644 index 0000000..5a65a7e --- /dev/null +++ b/rust/bioscript-formats/src/genotype/vcf/matching.rs @@ -0,0 +1,121 @@ +use bioscript_core::{Assembly, GenomicLocus, VariantKind, VariantObservation, VariantSpec}; + +use crate::inspect::InferredSex; + +use super::ParsedVcfRow; + +pub(crate) fn choose_variant_locus_for_assembly( + variant: &VariantSpec, + assembly: Option, +) -> Option { + match assembly { + Some(Assembly::Grch37) => variant.grch37.clone().or_else(|| variant.grch38.clone()), + Some(Assembly::Grch38) => variant.grch38.clone().or_else(|| variant.grch37.clone()), + None => variant.grch37.clone().or_else(|| variant.grch38.clone()), + } +} + +pub(super) fn first_single_base_allele(value: Option<&str>) -> Option { + let value = value?; + let mut chars = value.chars(); + let base = chars.next()?; + chars.next().is_none().then_some(base) +} + +pub(super) fn imputed_reference_observation( + backend_name: &str, + label: &str, + variant: &VariantSpec, + locus: &GenomicLocus, + assembly: Option, + inferred_sex: Option, + missing_evidence: &str, +) -> Option { + if !matches!(variant.kind, None | Some(VariantKind::Snp)) { + return None; + } + let reference = first_single_base_allele(variant.reference.as_deref())?; + first_single_base_allele(variant.alternate.as_deref())?; + let genotype = reference_genotype_for_locus(reference, locus, inferred_sex); + Some(VariantObservation { + backend: backend_name.to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly, + genotype: Some(genotype), + evidence: vec![format!( + "{label}: {missing_evidence} | imputed reference genotype from absent variant-only VCF record" + )], + ..VariantObservation::default() + }) +} + +fn reference_genotype_for_locus( + reference: char, + locus: &GenomicLocus, + inferred_sex: Option, +) -> String { + let reference = reference.to_ascii_uppercase(); + if inferred_sex == Some(InferredSex::Male) && is_sex_chromosome(&locus.chrom) { + reference.to_string() + } else { + format!("{reference}{reference}") + } +} + +fn is_sex_chromosome(chrom: &str) -> bool { + matches!( + chrom + .trim() + .trim_start_matches("chr") + .trim_start_matches("CHR") + .to_ascii_uppercase() + .as_str(), + "X" | "Y" | "23" | "24" + ) +} + +pub(crate) fn normalize_chromosome_name(value: &str) -> String { + value.trim().trim_start_matches("chr").to_ascii_lowercase() +} + +pub(crate) fn vcf_row_matches_variant( + row: &ParsedVcfRow, + variant: &VariantSpec, + assembly: Option, +) -> bool { + let Some(locus) = choose_variant_locus_for_assembly(variant, assembly) else { + return false; + }; + + if normalize_chromosome_name(&row.chrom) != normalize_chromosome_name(&locus.chrom) { + return false; + } + + match variant.kind.unwrap_or(VariantKind::Other) { + VariantKind::Snp => { + row.position == locus.start + && variant + .reference + .as_ref() + .is_none_or(|reference| reference.eq_ignore_ascii_case(&row.reference)) + && variant.alternate.as_ref().is_none_or(|alternate| { + row.alternates + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(alternate)) + }) + } + VariantKind::Deletion => { + let expected_len = variant.deletion_length.unwrap_or(0); + row.position == locus.start.saturating_sub(1) + && row.alternates.iter().any(|alternate| { + let actual_len = row.reference.len().saturating_sub(alternate.len()); + (expected_len == 0 || actual_len == expected_len) + && alternate.len() < row.reference.len() + }) + } + VariantKind::Insertion | VariantKind::Indel => { + row.position == locus.start.saturating_sub(1) + } + VariantKind::Other => row.position == locus.start, + } +} diff --git a/rust/bioscript-formats/src/genotype/vcf_tokens.rs b/rust/bioscript-formats/src/genotype/vcf_tokens.rs index 5cd5650..c1aae58 100644 --- a/rust/bioscript-formats/src/genotype/vcf_tokens.rs +++ b/rust/bioscript-formats/src/genotype/vcf_tokens.rs @@ -11,14 +11,16 @@ pub(crate) fn genotype_from_vcf_gt( let cleaned = gt.trim().replace('|', "/"); let parts: Vec<&str> = cleaned.split('/').collect(); - if parts.len() != 2 || parts.contains(&".") { + if !(parts.len() == 1 || parts.len() == 2) || parts.contains(&".") { return Some("--".to_owned()); } let ref_token = vcf_reference_token(reference, alternates); let mut out = String::new(); for part in parts { - let idx = part.parse::().ok()?; + let Ok(idx) = part.parse::() else { + return Some("--".to_owned()); + }; if idx == 0 { out.push_str(&ref_token); } else { diff --git a/rust/bioscript-formats/src/inspect/heuristics.rs b/rust/bioscript-formats/src/inspect/heuristics.rs index a26a5f1..2dd9490 100644 --- a/rust/bioscript-formats/src/inspect/heuristics.rs +++ b/rust/bioscript-formats/src/inspect/heuristics.rs @@ -243,19 +243,28 @@ pub(crate) fn detect_assembly(lower_name: &str, sample_lines: &[String]) -> Opti || combined.contains("assembly 38") || combined.contains("grch38") || combined.contains("hg38") + || combined.contains("b38") || combined.contains("gca_000001405.15") || combined.contains("grch38_no_alt_analysis_set") - || combined.contains("##contig="); + || combined.contains("##contig= Result { let mut stats = SexStats::default(); + let delimiter = detect_delimiter(lines); + let mut column_indexes = None; + let mut comment_header = None; for line in lines { - update_stats_from_line(&mut stats, line, kind); + update_stats_from_line( + &mut stats, + line, + kind, + delimiter, + &mut column_indexes, + &mut comment_header, + )?; } Ok(classify_stats(&stats, kind)) } @@ -174,8 +190,45 @@ fn infer_sex_from_reader( kind: DetectedKind, ) -> Result { let mut stats = SexStats::default(); + let mut probe_lines = Vec::new(); let mut line = String::new(); - for _ in 0..MAX_SEX_DETECTION_LINES { + for _ in 0..64 { + line.clear(); + let bytes = reader + .read_line(&mut line) + .map_err(|err| RuntimeError::Io(format!("failed to scan sex markers: {err}")))?; + if bytes == 0 { + let delimiter = detect_delimiter(&probe_lines); + let mut column_indexes = None; + let mut comment_header = None; + for probe_line in &probe_lines { + update_stats_from_line( + &mut stats, + probe_line, + kind, + delimiter, + &mut column_indexes, + &mut comment_header, + )?; + } + return Ok(classify_stats(&stats, kind)); + } + probe_lines.push(line.trim_end_matches(['\n', '\r']).to_owned()); + } + let delimiter = detect_delimiter(&probe_lines); + let mut column_indexes = None; + let mut comment_header = None; + for probe_line in &probe_lines { + update_stats_from_line( + &mut stats, + probe_line, + kind, + delimiter, + &mut column_indexes, + &mut comment_header, + )?; + } + for _ in probe_lines.len()..MAX_SEX_DETECTION_LINES { line.clear(); let bytes = reader .read_line(&mut line) @@ -183,35 +236,55 @@ fn infer_sex_from_reader( if bytes == 0 { break; } - update_stats_from_line(&mut stats, line.trim_end_matches(['\n', '\r']), kind); + update_stats_from_line( + &mut stats, + line.trim_end_matches(['\n', '\r']), + kind, + delimiter, + &mut column_indexes, + &mut comment_header, + )?; } Ok(classify_stats(&stats, kind)) } -fn update_stats_from_line(stats: &mut SexStats, line: &str, kind: DetectedKind) { +fn update_stats_from_line( + stats: &mut SexStats, + line: &str, + kind: DetectedKind, + delimiter: Delimiter, + column_indexes: &mut Option, + comment_header: &mut Option>, +) -> Result<(), RuntimeError> { let trimmed = line.trim(); - if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("//") { - return; + if trimmed.is_empty() { + return Ok(()); } match kind { DetectedKind::Vcf => update_vcf_stats(stats, trimmed), DetectedKind::GenotypeText | DetectedKind::Unknown => { - update_genotype_text_stats(stats, trimmed); + update_genotype_text_stats(stats, trimmed, delimiter, column_indexes, comment_header)?; } _ => {} } + Ok(()) } -fn update_genotype_text_stats(stats: &mut SexStats, line: &str) { - let fields = split_fields(line); - if fields.len() < 4 { - return; - } - let rsid = fields[0].trim(); - let chrom = normalize_chrom(fields.get(1).map(String::as_str).unwrap_or_default()); - let genotype = genotype_text_field(&fields); +fn update_genotype_text_stats( + stats: &mut SexStats, + line: &str, + delimiter: Delimiter, + column_indexes: &mut Option, + comment_header: &mut Option>, +) -> Result<(), RuntimeError> { + let Some(row) = parse_streaming_row(line, delimiter, column_indexes, comment_header)? else { + return Ok(()); + }; + let rsid = row.rsid.as_deref().unwrap_or_default(); + let chrom = normalize_chrom(row.chrom.as_deref().unwrap_or_default()); + let genotype = row.genotype.as_str(); if chrom != "Y" { - return; + return Ok(()); } stats.total_y_snps += 1; let called = is_called_genotype_text(genotype); @@ -227,6 +300,7 @@ fn update_genotype_text_stats(stats: &mut SexStats, line: &str) { stats.male_markers_called += 1; } } + Ok(()) } fn update_vcf_stats(stats: &mut SexStats, line: &str) { @@ -261,100 +335,6 @@ fn update_vcf_stats(stats: &mut SexStats, line: &str) { } } -fn classify_stats(stats: &SexStats, kind: DetectedKind) -> SexInference { - match kind { - DetectedKind::Vcf => classify_vcf_stats(stats), - DetectedKind::GenotypeText | DetectedKind::Unknown => classify_y_fingerprint_stats(stats), - _ => unsupported_sex_inference(), - } -} - -fn supports_sex_detection(kind: DetectedKind) -> bool { - matches!( - kind, - DetectedKind::Vcf | DetectedKind::GenotypeText | DetectedKind::Unknown - ) -} - -fn unsupported_sex_inference() -> SexInference { - SexInference { - sex: InferredSex::Unknown, - confidence: SexDetectionConfidence::Low, - method: "unsupported_source_type".to_owned(), - evidence: vec!["sex detection currently supports genotype text and VCF inputs".to_owned()], - } -} - -fn classify_y_fingerprint_stats(stats: &SexStats) -> SexInference { - let (sex, confidence) = if stats.called_y_snps > 500 { - (InferredSex::Male, SexDetectionConfidence::High) - } else if stats.called_y_snps > 100 && stats.male_markers_called > 10 { - (InferredSex::Male, SexDetectionConfidence::Medium) - } else if stats.total_y_snps > 1000 && stats.called_y_snps < 10 && stats.male_markers_called < 3 - { - ( - InferredSex::Female, - if stats.called_y_snps == 0 { - SexDetectionConfidence::High - } else { - SexDetectionConfidence::Medium - }, - ) - } else if stats.called_y_snps > 50 || stats.male_markers_called > 5 { - (InferredSex::Male, SexDetectionConfidence::Medium) - } else if stats.called_y_snps < 10 && stats.total_y_snps > 0 { - (InferredSex::Female, SexDetectionConfidence::Medium) - } else { - (InferredSex::Unknown, SexDetectionConfidence::Low) - }; - SexInference { - sex, - confidence, - method: "y_fingerprint".to_owned(), - evidence: y_fingerprint_evidence(stats), - } -} - -fn classify_vcf_stats(stats: &SexStats) -> SexInference { - let (sex, confidence) = if stats.called_y_snps > 500 - || (stats.x_haploid_gt_sites >= 20 && stats.x_diploid_gt_sites == 0) - { - (InferredSex::Male, SexDetectionConfidence::High) - } else if stats.x_non_par_sites >= 50 - && stats.x_diploid_gt_sites > 0 - && stats.x_het_gt_sites * 100 / stats.x_diploid_gt_sites.max(1) >= 2 - { - (InferredSex::Female, SexDetectionConfidence::Medium) - } else { - (InferredSex::Unknown, SexDetectionConfidence::Low) - }; - SexInference { - sex, - confidence, - method: "vcf_non_par_x_gt_y_count".to_owned(), - evidence: vec![ - format!("x_non_par_sites={}", stats.x_non_par_sites), - format!("x_haploid_gt_sites={}", stats.x_haploid_gt_sites), - format!("x_diploid_gt_sites={}", stats.x_diploid_gt_sites), - format!("x_het_gt_sites={}", stats.x_het_gt_sites), - format!("called_y_snps={}", stats.called_y_snps), - ], - } -} - -fn y_fingerprint_evidence(stats: &SexStats) -> Vec { - let mut evidence = vec![ - format!("total_y_snps={}", stats.total_y_snps), - format!("called_y_snps={}", stats.called_y_snps), - format!("male_markers_found={}", stats.male_markers_found), - format!("male_markers_called={}", stats.male_markers_called), - ]; - if !stats.y_examples.is_empty() { - evidence.push(format!("y_examples={}", stats.y_examples.join(","))); - } - evidence -} - fn normalize_chrom(value: &str) -> String { value .trim() @@ -363,14 +343,6 @@ fn normalize_chrom(value: &str) -> String { .to_ascii_uppercase() } -fn genotype_text_field(fields: &[String]) -> &str { - if fields.len() >= 4 { - fields[3].trim() - } else { - fields.last().map(String::as_str).unwrap_or_default().trim() - } -} - fn is_called_genotype_text(value: &str) -> bool { let value = value.trim(); if value.is_empty() || matches!(value, "--" | "00" | "." | "./." | ".|.") { @@ -475,6 +447,41 @@ mod tests { assert!(result.evidence.iter().any(|item| item == "called_y_snps=0")); } + #[test] + fn y_fingerprint_respects_header_column_order() { + let mut lines = vec!["# rsid\tchromosome\tposition\tgs\tbaf\tlrr\tgenotype".to_owned()]; + lines.extend((0..1001).map(|idx| format!("rs{idx}\tY\t{idx}\t0\t0.5\t-4.0\t--"))); + lines.push("rs11575897\tY\t2001\t0\t0.5\t-4.2\t--".to_owned()); + let result = infer_sex_from_text_lines(&lines, DetectedKind::GenotypeText).unwrap(); + assert_eq!(result.sex, InferredSex::Female); + assert_eq!(result.confidence, SexDetectionConfidence::High); + assert!(result.evidence.iter().any(|item| item == "called_y_snps=0")); + } + + #[test] + fn y_fingerprint_treats_sparse_y_calls_without_male_markers_as_female() { + let mut lines: Vec = (0..5500) + .map(|idx| format!("rs{idx}\tY\t{idx}\t--\t0\t0.5\t-4.0")) + .collect(); + for idx in 0..900 { + lines.push(format!("noise{idx}\tY\t{}\tAA\t0.2\t0\t0.0", idx + 6000)); + } + for marker in [ + "rs11575897", + "rs2534636", + "rs35284970", + "rs13447361", + "rs2267801", + "rs2267802", + "rs9786142", + ] { + lines.push(format!("{marker}\tY\t9000\t--\t0\t0.5\t-4.0")); + } + let result = infer_sex_from_text_lines(&lines, DetectedKind::GenotypeText).unwrap(); + assert_eq!(result.sex, InferredSex::Female); + assert_eq!(result.confidence, SexDetectionConfidence::High); + } + #[test] fn vcf_non_par_x_gt_detects_diploid_het_signal() { let lines = vec![ diff --git a/rust/bioscript-formats/src/inspect/sex/classify.rs b/rust/bioscript-formats/src/inspect/sex/classify.rs new file mode 100644 index 0000000..e5c8052 --- /dev/null +++ b/rust/bioscript-formats/src/inspect/sex/classify.rs @@ -0,0 +1,115 @@ +use crate::inspect::DetectedKind; + +use super::{InferredSex, SexDetectionConfidence, SexInference, SexStats}; + +pub(super) fn classify_stats(stats: &SexStats, kind: DetectedKind) -> SexInference { + match kind { + DetectedKind::Vcf => classify_vcf_stats(stats), + DetectedKind::GenotypeText | DetectedKind::Unknown => classify_y_fingerprint_stats(stats), + _ => unsupported_sex_inference(), + } +} + +pub(super) fn supports_sex_detection(kind: DetectedKind) -> bool { + matches!( + kind, + DetectedKind::Vcf | DetectedKind::GenotypeText | DetectedKind::Unknown + ) +} + +pub(super) fn unsupported_sex_inference() -> SexInference { + SexInference { + sex: InferredSex::Unknown, + confidence: SexDetectionConfidence::Low, + method: "unsupported_source_type".to_owned(), + evidence: vec!["sex detection currently supports genotype text and VCF inputs".to_owned()], + } +} + +fn classify_y_fingerprint_stats(stats: &SexStats) -> SexInference { + let called_y_pct = called_y_percent(stats); + let (sex, confidence) = if stats.total_y_snps > 1000 + && stats.male_markers_found >= 3 + && stats.male_markers_called == 0 + && called_y_pct < 20.0 + { + (InferredSex::Female, SexDetectionConfidence::High) + } else if stats.called_y_snps > 500 && called_y_pct >= 50.0 { + (InferredSex::Male, SexDetectionConfidence::High) + } else if stats.called_y_snps > 100 && stats.male_markers_called > 10 { + (InferredSex::Male, SexDetectionConfidence::Medium) + } else if stats.total_y_snps > 1000 && stats.called_y_snps < 10 && stats.male_markers_called < 3 + { + ( + InferredSex::Female, + if stats.called_y_snps == 0 { + SexDetectionConfidence::High + } else { + SexDetectionConfidence::Medium + }, + ) + } else if stats.called_y_snps > 50 || stats.male_markers_called > 5 { + (InferredSex::Male, SexDetectionConfidence::Medium) + } else if stats.called_y_snps < 10 && stats.total_y_snps > 0 { + (InferredSex::Female, SexDetectionConfidence::Medium) + } else { + (InferredSex::Unknown, SexDetectionConfidence::Low) + }; + SexInference { + sex, + confidence, + method: "y_fingerprint".to_owned(), + evidence: y_fingerprint_evidence(stats), + } +} + +fn classify_vcf_stats(stats: &SexStats) -> SexInference { + let (sex, confidence) = if stats.called_y_snps > 500 + || (stats.x_haploid_gt_sites >= 20 && stats.x_diploid_gt_sites == 0) + { + (InferredSex::Male, SexDetectionConfidence::High) + } else if stats.x_non_par_sites >= 50 + && stats.x_diploid_gt_sites > 0 + && stats.x_het_gt_sites * 100 / stats.x_diploid_gt_sites.max(1) >= 2 + { + (InferredSex::Female, SexDetectionConfidence::Medium) + } else { + (InferredSex::Unknown, SexDetectionConfidence::Low) + }; + SexInference { + sex, + confidence, + method: "vcf_non_par_x_gt_y_count".to_owned(), + evidence: vec![ + format!("x_non_par_sites={}", stats.x_non_par_sites), + format!("x_haploid_gt_sites={}", stats.x_haploid_gt_sites), + format!("x_diploid_gt_sites={}", stats.x_diploid_gt_sites), + format!("x_het_gt_sites={}", stats.x_het_gt_sites), + format!("called_y_snps={}", stats.called_y_snps), + ], + } +} + +fn y_fingerprint_evidence(stats: &SexStats) -> Vec { + let mut evidence = vec![ + format!("total_y_snps={}", stats.total_y_snps), + format!("called_y_snps={}", stats.called_y_snps), + format!("called_y_pct={:.1}", called_y_percent(stats)), + format!("male_markers_found={}", stats.male_markers_found), + format!("male_markers_called={}", stats.male_markers_called), + ]; + if !stats.y_examples.is_empty() { + evidence.push(format!("y_examples={}", stats.y_examples.join(","))); + } + evidence +} + +fn called_y_percent(stats: &SexStats) -> f64 { + if stats.total_y_snps == 0 { + 0.0 + } else { + let called_y_snps = u32::try_from(stats.called_y_snps).unwrap_or(u32::MAX); + let total_y_snps = u32::try_from(stats.total_y_snps).unwrap_or(u32::MAX); + f64::from(called_y_snps) * 100.0 / f64::from(total_y_snps) + } +} diff --git a/rust/bioscript-formats/tests/file_formats/cram.rs b/rust/bioscript-formats/tests/file_formats/cram.rs index cb912a2..a0b38d6 100644 --- a/rust/bioscript-formats/tests/file_formats/cram.rs +++ b/rust/bioscript-formats/tests/file_formats/cram.rs @@ -16,6 +16,7 @@ fn forced_cram_store(dir: &std::path::Path, reference_name: &str) -> GenotypeSto reference_index: Some(dir.join(format!("{reference_name}.fai"))), input_index: Some(dir.join("missing.cram.crai")), allow_reference_md5_mismatch: false, + ..GenotypeLoadOptions::default() }, ) .unwrap() @@ -257,6 +258,7 @@ fn open_cram_store_with_md5_policy( reference_file: Some(fx.reference.clone()), reference_index: Some(fx.reference_index.clone()), allow_reference_md5_mismatch, + ..GenotypeLoadOptions::default() }, ) .expect("open cram store") diff --git a/rust/bioscript-formats/tests/file_formats/vcf.rs b/rust/bioscript-formats/tests/file_formats/vcf.rs index 6eb9a4f..cd81acf 100644 --- a/rust/bioscript-formats/tests/file_formats/vcf.rs +++ b/rust/bioscript-formats/tests/file_formats/vcf.rs @@ -39,6 +39,131 @@ fn vcf_coordinate_lookup_normalizes_chr_prefix_and_handles_multiallelic_gt() { ); } +#[test] +fn vcf_lookup_prefers_loader_assembly_over_file_name_or_header() { + let dir = temp_dir("vcf-loader-assembly"); + let path = dir.join("sample.hg19.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=hg19\n\ + ##FORMAT=\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + chr7\t87550285\t.\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let variant = VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "7".to_owned(), + start: 87_179_601, + end: 87_179_601, + }), + grch38: Some(bioscript_core::GenomicLocus { + chrom: "7".to_owned(), + start: 87_550_285, + end: 87_550_285, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }; + + let default_store = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + impute_vcf_missing_as_reference: false, + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + let default_observation = default_store.lookup_variant(&variant).unwrap(); + assert_eq!(default_observation.genotype, None); + assert_eq!( + default_observation.assembly, + Some(bioscript_core::Assembly::Grch37) + ); + + let inspected_store = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + assembly: Some(bioscript_core::Assembly::Grch38), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + let inspected_observation = inspected_store.lookup_variant(&variant).unwrap(); + assert_eq!(inspected_observation.genotype.as_deref(), Some("AG")); + assert_eq!( + inspected_observation.assembly, + Some(bioscript_core::Assembly::Grch38) + ); + assert_eq!( + inspected_observation.evidence[0], + "resolved by locus chr7:87550285" + ); +} + +#[test] +fn vcf_missing_locus_defaults_to_imputed_reference_with_sex_aware_ploidy() { + let dir = temp_dir("vcf-impute-reference"); + let path = dir.join("sample.vcf"); + fs::write( + &path, + "##fileformat=VCFv4.2\n\ + ##reference=GRCh38\n\ + ##FORMAT=\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file_with_options( + &path, + &GenotypeLoadOptions { + inferred_sex: Some(bioscript_formats::InferredSex::Male), + ..GenotypeLoadOptions::default() + }, + ) + .unwrap(); + + let observations = store + .lookup_variants(&[ + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 20, + end: 20, + }), + reference: Some("C".to_owned()), + alternate: Some("T".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "X".to_owned(), + start: 30, + end: 30, + }), + reference: Some("G".to_owned()), + alternate: Some("A".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + ]) + .unwrap(); + + assert_eq!(observations[0].genotype.as_deref(), Some("CC")); + assert_eq!(observations[1].genotype.as_deref(), Some("G")); + assert!( + observations[0].evidence[0].contains("imputed reference genotype"), + "{:?}", + observations[0].evidence + ); +} + #[test] fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { let dir = temp_dir("vcf-indel-locus");