diff --git a/docs/variant-schema.md b/docs/variant-schema.md index f6c9de6..91e53c8 100644 --- a/docs/variant-schema.md +++ b/docs/variant-schema.md @@ -32,9 +32,11 @@ coordinates: grch37: chrom: "12" pos: 112241766 + assembly_ref: "G" grch38: chrom: "12" pos: 111803962 + assembly_ref: "G" alleles: kind: "snv" @@ -57,6 +59,12 @@ At least one of these must also exist: - `identifiers` - `coordinates` +`coordinates..assembly_ref` is optional for SNVs. Use it when an +older assembly's reference base differs from the canonical `alleles.ref` used +by the catalogue. VCF inputs are matched against the assembly reference base +and then translated back to the canonical `alleles.ref` / `alleles.alts` +representation before findings are evaluated. + ## Top-Level `tags` Optional free-form classification tags for filtering and compatibility hints. diff --git a/docs/variant-schema.yaml b/docs/variant-schema.yaml index 3c2a26d..4568ff0 100644 --- a/docs/variant-schema.yaml +++ b/docs/variant-schema.yaml @@ -18,10 +18,12 @@ coordinates: grch37: chrom: "1" pos: 12345 + assembly_ref: "A" grch38: chrom: "1" start: 12345 end: 12346 + assembly_ref: "G" alleles: kind: "snv" diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs b/rust/bioscript-cli/src/cli_bootstrap.rs index b0cf393..63bae41 100644 --- a/rust/bioscript-cli/src/cli_bootstrap.rs +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -57,7 +57,7 @@ fn run_cli() -> Result<(), String> { Ok(()) } -const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; +const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--allow-md5-mismatch] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; struct CliOptions { script_path: Option, diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs index 08d9101..1a4bc99 100644 --- a/rust/bioscript-cli/src/report_options.rs +++ b/rust/bioscript-cli/src/report_options.rs @@ -108,6 +108,7 @@ impl AppReportCliState { self.loader.reference_index = Some(PathBuf::from(next_arg(iter, "--reference-index")?)); } + "--allow-md5-mismatch" => self.loader.allow_reference_md5_mismatch = true, value if value.starts_with('-') => return Err(format!("unexpected argument: {value}")), value => self.consume_path(value), } diff --git a/rust/bioscript-core/src/variant.rs b/rust/bioscript-core/src/variant.rs index 5d333ef..2ae9022 100644 --- a/rust/bioscript-core/src/variant.rs +++ b/rust/bioscript-core/src/variant.rs @@ -27,6 +27,8 @@ pub struct VariantSpec { pub rsids: Vec, pub grch37: Option, pub grch38: Option, + pub grch37_assembly_ref: Option, + pub grch38_assembly_ref: Option, pub reference: Option, pub alternate: Option, pub kind: Option, @@ -58,6 +60,15 @@ impl VariantSpec { pub fn has_coordinates(&self) -> bool { self.grch37.is_some() || self.grch38.is_some() } + + #[must_use] + pub fn assembly_reference(&self, assembly: Option) -> Option<&str> { + match assembly { + Some(Assembly::Grch37) => self.grch37_assembly_ref.as_deref(), + Some(Assembly::Grch38) => self.grch38_assembly_ref.as_deref(), + None => None, + } + } } #[cfg(test)] diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index deff0df..c1881b6 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -105,6 +105,26 @@ where cram_stream::for_each_cram_record_with_reader_inner(reader, label, locus, true, on_record) } +pub(crate) fn for_each_cram_record_with_reader_allow_md5_mismatch( + reader: &mut cram::io::indexed_reader::IndexedReader, + label: &str, + locus: &GenomicLocus, + allow_reference_md5_mismatch: bool, + on_record: F, +) -> Result<(), RuntimeError> +where + R: Read + Seek, + F: FnMut(AlignmentRecord) -> Result, +{ + cram_stream::for_each_cram_record_with_reader_inner( + reader, + label, + locus, + allow_reference_md5_mismatch, + on_record, + ) +} + /// Iterate raw CRAM records intersecting `locus`, streaming from an /// already-built CRAM `IndexedReader`. The raw variant preserves the /// `cram::Record` handle so callers can pull base+quality at a specific diff --git a/rust/bioscript-formats/src/alignment/cram_stream.rs b/rust/bioscript-formats/src/alignment/cram_stream.rs index 0b1facf..da4cc16 100644 --- a/rust/bioscript-formats/src/alignment/cram_stream.rs +++ b/rust/bioscript-formats/src/alignment/cram_stream.rs @@ -232,6 +232,7 @@ where let reference_sequence_repository = reader.reference_sequence_repository().clone(); let mut stop = false; + let mut callback_err: Option = None; for (index, slice_result) in container.slices().enumerate() { let slice = slice_result.map_err(|err| { @@ -256,30 +257,28 @@ where )) })?; - let records = slice.records( + let decode_result = slice.records_while( reference_sequence_repository.clone(), header, &compression_header, &core_data_src, &external_data_srcs, + !allow_reference_md5_mismatch, + |record| { + Ok(handle_decoded_cram_record( + label, + record, + interval, + locus_end, + &mut stop, + &mut callback_err, + on_record, + )) + }, ); - match records { - Ok(records) => { - let mut callback_err: Option = None; - for record in &records { - if !handle_decoded_cram_record( - label, - record, - interval, - locus_end, - &mut stop, - &mut callback_err, - on_record, - ) { - break; - } - } + match decode_result { + Ok(()) => { if let Some(err) = callback_err { return Err(err); } @@ -287,7 +286,7 @@ where Err(err) if allow_reference_md5_mismatch && is_reference_md5_mismatch(&err) => { eprintln!( "[bioscript] warning: CRAM reference MD5 mismatch for {label} slice landmark {landmark} — \ - this noodles version cannot retry without checksum validation. \ + decoding continued with checksum validation disabled for this slice. \ Details: {err}" ); } diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index eea489b..805c805 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -548,6 +548,8 @@ mod tests { rsids: vec!["rs1".to_owned()], grch37: Some(locus("1", 10, 10)), grch38: Some(locus("2", 20, 20)), + grch37_assembly_ref: None, + grch38_assembly_ref: None, reference: Some("A".to_owned()), alternate: Some("G".to_owned()), kind: Some(VariantKind::Snp), @@ -831,6 +833,22 @@ mod tests { ..VariantSpec::default() }; assert!(vcf_row_matches_variant(&row, &snp, Some(Assembly::Grch38))); + let assembly_ref_snp = VariantSpec { + grch37: Some(locus("1", 10, 10)), + grch37_assembly_ref: Some("G".to_owned()), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }; + let inverted_row = parse_vcf_record("1\t10\trs10\tG\tA\t.\tPASS\t.\tGT\t1/1") + .unwrap() + .unwrap(); + assert!(vcf_row_matches_variant( + &inverted_row, + &assembly_ref_snp, + Some(Assembly::Grch37) + )); let deletion_row = parse_vcf_record("1\t9\trsdel\tATC\tA\t.\tPASS\t.\tGT\t0/1") .unwrap() .unwrap(); @@ -880,6 +898,58 @@ mod tests { )); } + #[test] + fn vcf_scan_translates_assembly_ref_snv_rows_and_missing_reference() { + let dir = temp_dir("vcf-assembly-ref"); + let vcf_path = dir.join("sample.grch37.vcf"); + fs::write( + &vcf_path, + "##fileformat=VCFv4.3\n\ + ##reference=GRCh37\n\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ + 1\t10\trs10\tT\tC,A\t.\tPASS\t.\tGT\t1/1\n", + ) + .unwrap(); + + let backend = VcfBackend { + path: vcf_path, + options: GenotypeLoadOptions { + assembly: Some(Assembly::Grch37), + ..GenotypeLoadOptions::default() + }, + }; + let present = VariantSpec { + grch37: Some(locus("1", 10, 10)), + grch37_assembly_ref: Some("T".to_owned()), + reference: Some("C".to_owned()), + alternate: Some("T".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }; + let missing = VariantSpec { + grch37: Some(locus("1", 20, 20)), + grch37_assembly_ref: Some("T".to_owned()), + reference: Some("C".to_owned()), + alternate: Some("T".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }; + + let observations = scan_vcf_variants(&backend, &[present, missing]).unwrap(); + assert_eq!(observations[0].genotype.as_deref(), Some("CC")); + assert!( + observations[0].evidence[0].contains("resolved by locus"), + "{:?}", + observations[0].evidence + ); + assert_eq!(observations[1].genotype.as_deref(), Some("TT")); + assert!( + observations[1].evidence[0].contains("imputed reference genotype"), + "{:?}", + observations[1].evidence + ); + } + #[test] fn genotype_private_helpers_cover_indel_record_classification() { let record = AlignmentRecord { diff --git a/rust/bioscript-formats/src/genotype/cram_backend.rs b/rust/bioscript-formats/src/genotype/cram_backend.rs index 30d3747..a907d1c 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend.rs @@ -6,10 +6,7 @@ use std::{ use noodles::core::Position; use noodles::cram; -use noodles::sam::alignment::{ - Record as _, - record::{Cigar as _, QualityScores as _, Sequence as _, cigar::op::Kind as CigarOpKind}, -}; +use noodles::sam::alignment::Record as _; use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantSpec}; @@ -392,54 +389,7 @@ fn cram_base_quality_at_reference_position( target_position: Position, reference_base: u8, ) -> Result, RuntimeError> { - let Some(alignment_start) = record.alignment_start() else { - return Ok(None); - }; - let alignment_start = alignment_start - .map_err(|err| RuntimeError::Io(format!("failed to read CRAM alignment start: {err}")))?; - let mut reference_position = usize::from(alignment_start); - let target = usize::from(target_position); - let mut read_position = 0usize; - let sequence = record.sequence(); - let qualities = record.quality_scores(); - - for op in record.cigar().iter() { - let op = op.map_err(|err| RuntimeError::Io(format!("failed to read CRAM CIGAR: {err}")))?; - match op.kind() { - CigarOpKind::Match | CigarOpKind::SequenceMatch | CigarOpKind::SequenceMismatch => { - for offset in 0..op.len() { - if reference_position + offset == target { - let base = sequence - .get(read_position + offset) - .unwrap_or(reference_base); - let quality = qualities - .iter() - .nth(read_position + offset) - .transpose() - .map_err(|err| { - RuntimeError::Io(format!("failed to read CRAM base quality: {err}")) - })? - .unwrap_or(0); - return Ok(Some((base, quality))); - } - } - reference_position += op.len(); - read_position += op.len(); - } - CigarOpKind::Insertion | CigarOpKind::SoftClip => { - read_position += op.len(); - } - CigarOpKind::Deletion | CigarOpKind::Skip => { - if target >= reference_position && target < reference_position + op.len() { - return Ok(None); - } - reference_position += op.len(); - } - CigarOpKind::HardClip | CigarOpKind::Pad => {} - } - } - - Ok(None) + Ok(record.base_quality_at_reference_position(target_position, reference_base)) } pub(crate) fn normalize_pileup_base(base: u8) -> Option { diff --git a/rust/bioscript-formats/src/genotype/vcf/matching.rs b/rust/bioscript-formats/src/genotype/vcf/matching.rs index e633cb7..afecd10 100644 --- a/rust/bioscript-formats/src/genotype/vcf/matching.rs +++ b/rust/bioscript-formats/src/genotype/vcf/matching.rs @@ -33,7 +33,7 @@ pub fn imputed_reference_observation( ) -> Option { let genotype = match variant.kind { None | Some(VariantKind::Snp) => { - let reference = first_single_base_allele(variant.reference.as_deref())?; + let reference = reference_for_assembly(variant, assembly)?; first_single_base_allele(variant.alternate.as_deref())?; reference_genotype_for_locus(reference, locus, inferred_sex) } @@ -77,6 +77,16 @@ fn reference_genotype_for_locus( } } +pub(crate) fn reference_for_assembly( + variant: &VariantSpec, + assembly: Option, +) -> Option { + let value = variant + .assembly_reference(assembly) + .or(variant.reference.as_deref())?; + first_single_base_allele(Some(value)) +} + fn is_sex_chromosome(chrom: &str) -> bool { matches!( chrom @@ -108,16 +118,13 @@ pub(crate) fn vcf_row_matches_variant( match variant.kind.unwrap_or(VariantKind::Other) { VariantKind::Snp => { + let expected_reference = variant + .assembly_reference(assembly) + .or(variant.reference.as_deref()); row.position == locus.start - && variant - .reference - .as_ref() + && expected_reference .is_none_or(|reference| reference.eq_ignore_ascii_case(&row.reference)) - && variant.alternate.as_ref().is_none_or(|alternate| { - row.alternates - .iter() - .any(|candidate| candidate.eq_ignore_ascii_case(alternate)) - }) + && snp_row_has_catalog_allele(row, variant) } VariantKind::Deletion => { let expected_len = variant.deletion_length.unwrap_or(0); @@ -134,3 +141,21 @@ pub(crate) fn vcf_row_matches_variant( VariantKind::Other => row.position == locus.start, } } + +fn snp_row_has_catalog_allele(row: &ParsedVcfRow, variant: &VariantSpec) -> bool { + let Some(alternate) = variant.alternate.as_ref() else { + return true; + }; + if row + .alternates + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(alternate)) + { + return true; + } + variant.reference.as_ref().is_some_and(|reference| { + row.alternates + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(reference)) + }) +} diff --git a/rust/bioscript-formats/src/genotype/vcf/reader.rs b/rust/bioscript-formats/src/genotype/vcf/reader.rs index 6244b14..6a2c1b1 100644 --- a/rust/bioscript-formats/src/genotype/vcf/reader.rs +++ b/rust/bioscript-formats/src/genotype/vcf/reader.rs @@ -21,6 +21,31 @@ pub fn observe_vcf_snp_with_reader( matched_rsid: Option, assembly: Option, ) -> Result +where + R: Read + Seek, +{ + observe_vcf_snp_with_reader_and_assembly_ref( + indexed, + label, + locus, + reference, + alternate, + None, + matched_rsid, + assembly, + ) +} + +pub(crate) fn observe_vcf_snp_with_reader_and_assembly_ref( + indexed: &mut csi::io::IndexedReader, tabix::Index>, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, + assembly_reference: Option, + matched_rsid: Option, + assembly: Option, +) -> Result where R: Read + Seek, { @@ -57,8 +82,10 @@ where RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")) })?; - let reference_str = reference.to_ascii_uppercase().to_string(); + let expected_reference = assembly_reference.unwrap_or(reference); + let reference_str = expected_reference.to_ascii_uppercase().to_string(); let alternate_str = alternate.to_ascii_uppercase().to_string(); + let catalog_reference_str = reference.to_ascii_uppercase().to_string(); let mut saw_any = false; for record_result in query { @@ -79,6 +106,10 @@ where .alternates .iter() .any(|alt| alt.eq_ignore_ascii_case(&alternate_str)) + && !row + .alternates + .iter() + .any(|alt| alt.eq_ignore_ascii_case(&catalog_reference_str)) { continue; } @@ -95,7 +126,7 @@ where let evidence = if saw_any { vec![format!( - "{label}: {locus_label} present but ref={reference}/alt={alternate} did not match any record" + "{label}: {locus_label} present but ref={expected_reference}/alt={alternate} did not match any record" )] } else { vec![format!("{label}: no VCF record at {locus_label}")] diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index 40135f3..b19ef17 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -283,10 +283,19 @@ pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result = (0..100) + .map(|idx| { + let gt = if idx % 3 == 0 { "0/1" } else { "0/0" }; + format!("chrX\t{}\t.\tC\tT\t.\tPASS\t.\tGT\t{gt}", 3_000_000 + idx) + }) + .collect(); let result = infer_sex_from_text_lines(&lines, DetectedKind::Vcf).unwrap(); + assert_eq!(result.sex, InferredSex::Female); assert_eq!(result.method, "vcf_non_par_x_gt_y_count"); assert!( result .evidence .iter() - .any(|item| item == "x_het_gt_sites=1") + .any(|item| item == "x_het_gt_sites=34") + ); + } + + #[test] + fn vcf_non_par_x_gt_beats_low_y_call_presence_when_signals_conflict() { + let mut lines: Vec = (0..20) + .map(|idx| format!("chrY\t{}\t.\tC\tT\t.\tPASS\t.\tGT\t1", 3_000_000 + idx)) + .collect(); + lines.extend((0..1000).map(|idx| { + let gt = if idx % 2 == 0 { "0/1" } else { "0/0" }; + format!("chrX\t{}\t.\tC\tT\t.\tPASS\t.\tGT\t{gt}", 3_000_000 + idx) + })); + + let result = infer_sex_from_text_lines(&lines, DetectedKind::Vcf).unwrap(); + assert_eq!(result.sex, InferredSex::Female); + assert_eq!(result.confidence, SexDetectionConfidence::High); + assert!( + result + .evidence + .iter() + .any(|item| item == "called_y_snps=20") + ); + } + + #[test] + fn vcf_strong_y_signal_beats_diploid_x_het_from_wgs_callers() { + let mut lines: Vec = (0..600) + .map(|idx| format!("chrY\t{}\t.\tC\tT\t.\tPASS\t.\tGT\t1", 3_000_000 + idx)) + .collect(); + lines.extend((0..2000).map(|idx| { + let gt = if idx % 10 < 3 { "0/1" } else { "0/0" }; + format!("chrX\t{}\t.\tC\tT\t.\tPASS\t.\tGT\t{gt}", 3_000_000 + idx) + })); + + let result = infer_sex_from_text_lines(&lines, DetectedKind::Vcf).unwrap(); + assert_eq!(result.sex, InferredSex::Male); + assert_eq!(result.confidence, SexDetectionConfidence::High); + assert!(result.evidence.iter().any(|item| item == "x_het_pct=30.00")); + assert!( + result + .evidence + .iter() + .any(|item| item == "y_to_x_pct=30.00") ); } } diff --git a/rust/bioscript-formats/src/inspect/sex/alignment_depth.rs b/rust/bioscript-formats/src/inspect/sex/alignment_depth.rs new file mode 100644 index 0000000..d90d49d --- /dev/null +++ b/rust/bioscript-formats/src/inspect/sex/alignment_depth.rs @@ -0,0 +1,207 @@ +use std::{io::Read, path::Path}; + +use bioscript_core::RuntimeError; + +use crate::{alignment, genotype::GenotypeLoadOptions}; + +use super::{ + DetectedKind, InferredSex, InspectOptions, SexDetectionConfidence, SexInference, + unsupported_sex_inference, +}; + +const ALIGNMENT_SEX_WINDOW_LEN: i64 = 1000; +const ALIGNMENT_SEX_WINDOW_RECORD_CAP: usize = 400; +const ALIGNMENT_AUTOSOME_WINDOWS: &[(&str, i64)] = &[ + ("1", 50_000_000), + ("2", 50_000_000), + ("3", 50_000_000), + ("4", 50_000_000), + ("5", 50_000_000), + ("6", 50_000_000), + ("7", 50_000_000), + ("8", 50_000_000), +]; +const ALIGNMENT_X_NON_PAR_WINDOWS: &[i64] = &[ + 10_000_000, + 20_000_000, + 40_000_000, + 60_000_000, + 80_000_000, + 100_000_000, + 120_000_000, + 140_000_000, +]; +const ALIGNMENT_Y_WINDOWS: &[i64] = &[ + 3_500_000, 8_000_000, 12_000_000, 16_000_000, 20_000_000, 24_000_000, 28_000_000, 40_000_000, +]; + +#[derive(Debug, Default)] +struct AlignmentSexStats { + autosome_windows: usize, + autosome_records: usize, + x_windows: usize, + x_records: usize, + y_windows: usize, + y_records: usize, +} + +pub(crate) fn infer_sex_from_alignment_path( + path: &Path, + options: &InspectOptions, + kind: DetectedKind, +) -> Result { + if kind != DetectedKind::AlignmentCram { + return Ok(unsupported_sex_inference()); + } + let Some(reference_file) = options.reference_file.as_ref() else { + return Ok(SexInference { + sex: InferredSex::Unknown, + confidence: SexDetectionConfidence::Low, + method: "alignment_y_x_coverage".to_owned(), + evidence: vec!["CRAM sex detection requires --reference-file".to_owned()], + }); + }; + + let load_options = GenotypeLoadOptions { + input_index: options.input_index.clone(), + reference_file: options.reference_file.clone(), + reference_index: options.reference_index.clone(), + allow_reference_md5_mismatch: true, + ..GenotypeLoadOptions::default() + }; + + let stats = sample_alignment_sex_windows(path, &load_options, reference_file)?; + let autosome_mean = mean_records(stats.autosome_records, stats.autosome_windows); + let x_mean = mean_records(stats.x_records, stats.x_windows); + let y_mean = mean_records(stats.y_records, stats.y_windows); + let x_ratio = ratio_to_autosome(x_mean, autosome_mean); + let y_ratio = ratio_to_autosome(y_mean, autosome_mean); + + let (sex, confidence) = if autosome_mean < 5.0 { + (InferredSex::Unknown, SexDetectionConfidence::Low) + } else if x_ratio >= 0.75 && y_ratio < 0.15 { + (InferredSex::Female, SexDetectionConfidence::High) + } else if x_ratio < 0.75 && y_ratio >= 0.08 { + (InferredSex::Male, SexDetectionConfidence::High) + } else if x_ratio >= 0.75 && y_ratio < 0.25 { + (InferredSex::Female, SexDetectionConfidence::Medium) + } else if x_ratio < 0.85 && y_ratio >= 0.03 { + (InferredSex::Male, SexDetectionConfidence::Medium) + } else { + (InferredSex::Unknown, SexDetectionConfidence::Low) + }; + + let mut evidence = vec![ + format!("autosome_windows={}", stats.autosome_windows), + format!("autosome_records={}", stats.autosome_records), + format!("x_windows={}", stats.x_windows), + format!("x_records={}", stats.x_records), + format!("y_windows={}", stats.y_windows), + format!("y_records={}", stats.y_records), + format!("autosome_mean_records={autosome_mean:.2}"), + format!("x_mean_records={x_mean:.2}"), + format!("y_mean_records={y_mean:.2}"), + format!("x_to_autosome_ratio={x_ratio:.3}"), + format!("y_to_autosome_ratio={y_ratio:.3}"), + ]; + if options.input_index.is_none() { + evidence.push("CRAM sex detection ran without explicit --input-index".to_owned()); + } + + Ok(SexInference { + sex, + confidence, + method: "alignment_autosome_x_y_depth_ratio".to_owned(), + evidence, + }) +} + +fn sample_alignment_sex_windows( + path: &Path, + options: &GenotypeLoadOptions, + reference_file: &Path, +) -> Result { + let repository = alignment::build_reference_repository(reference_file)?; + let mut reader = alignment::build_cram_indexed_reader_from_path(path, options, repository)?; + let label = path.display().to_string(); + let mut stats = AlignmentSexStats::default(); + + for (chrom, center) in ALIGNMENT_AUTOSOME_WINDOWS { + stats.autosome_records += count_alignment_records_in_window( + &mut reader, + &label, + chrom, + *center, + options.allow_reference_md5_mismatch, + )?; + stats.autosome_windows += 1; + } + for center in ALIGNMENT_X_NON_PAR_WINDOWS { + stats.x_records += count_alignment_records_in_window( + &mut reader, + &label, + "X", + *center, + options.allow_reference_md5_mismatch, + )?; + stats.x_windows += 1; + } + for center in ALIGNMENT_Y_WINDOWS { + stats.y_records += count_alignment_records_in_window( + &mut reader, + &label, + "Y", + *center, + options.allow_reference_md5_mismatch, + )?; + stats.y_windows += 1; + } + + Ok(stats) +} + +fn count_alignment_records_in_window( + reader: &mut noodles::cram::io::indexed_reader::IndexedReader, + label: &str, + chrom: &str, + center: i64, + allow_reference_md5_mismatch: bool, +) -> Result { + let half_window = ALIGNMENT_SEX_WINDOW_LEN / 2; + let locus = bioscript_core::GenomicLocus { + chrom: chrom.to_owned(), + start: center.saturating_sub(half_window).max(1), + end: center.saturating_add(half_window), + }; + let mut count = 0usize; + alignment::for_each_cram_record_with_reader_allow_md5_mismatch( + reader, + label, + &locus, + allow_reference_md5_mismatch, + |record| { + if !record.is_unmapped { + count += 1; + } + Ok(count < ALIGNMENT_SEX_WINDOW_RECORD_CAP) + }, + )?; + Ok(count) +} + +fn mean_records(records: usize, windows: usize) -> f64 { + if windows == 0 { + 0.0 + } else { + f64::from(u32::try_from(records).unwrap_or(u32::MAX)) + / f64::from(u32::try_from(windows).unwrap_or(u32::MAX)) + } +} + +fn ratio_to_autosome(value: f64, autosome_mean: f64) -> f64 { + if autosome_mean <= f64::EPSILON { + 0.0 + } else { + value / autosome_mean + } +} diff --git a/rust/bioscript-formats/src/inspect/sex/classify.rs b/rust/bioscript-formats/src/inspect/sex/classify.rs index e5c8052..4427241 100644 --- a/rust/bioscript-formats/src/inspect/sex/classify.rs +++ b/rust/bioscript-formats/src/inspect/sex/classify.rs @@ -64,15 +64,38 @@ fn classify_y_fingerprint_stats(stats: &SexStats) -> SexInference { } fn classify_vcf_stats(stats: &SexStats) -> SexInference { - let (sex, confidence) = if stats.called_y_snps > 500 - || (stats.x_haploid_gt_sites >= 20 && stats.x_diploid_gt_sites == 0) - { + let x_het_pct = if stats.x_diploid_gt_sites == 0 { + 0.0 + } else { + count_as_f64(stats.x_het_gt_sites) * 100.0 / count_as_f64(stats.x_diploid_gt_sites) + }; + let y_to_x_pct = if stats.x_non_par_sites == 0 { + 0.0 + } else { + count_as_f64(stats.called_y_snps) * 100.0 / count_as_f64(stats.x_non_par_sites) + }; + let female_like_x = + stats.x_non_par_sites >= 50 && stats.x_diploid_gt_sites > 0 && x_het_pct >= 2.0; + let male_like_x = stats.x_haploid_gt_sites >= 20 + || (stats.x_non_par_sites >= 50 && stats.x_diploid_gt_sites > 0 && x_het_pct <= 0.2); + let male_like_y = stats.called_y_snps > 500; + let strong_y_signal = + stats.called_y_snps >= 10_000 || (stats.x_non_par_sites >= 1000 && y_to_x_pct >= 10.0); + let (sex, confidence) = if strong_y_signal { (InferredSex::Male, SexDetectionConfidence::High) - } else if stats.x_non_par_sites >= 50 - && stats.x_diploid_gt_sites > 0 - && stats.x_het_gt_sites * 100 / stats.x_diploid_gt_sites.max(1) >= 2 - { - (InferredSex::Female, SexDetectionConfidence::Medium) + } else if female_like_x { + ( + InferredSex::Female, + if stats.x_non_par_sites >= 1000 && x_het_pct >= 10.0 { + SexDetectionConfidence::High + } else { + SexDetectionConfidence::Medium + }, + ) + } else if male_like_x { + (InferredSex::Male, SexDetectionConfidence::High) + } else if male_like_y { + (InferredSex::Unknown, SexDetectionConfidence::Medium) } else { (InferredSex::Unknown, SexDetectionConfidence::Low) }; @@ -85,11 +108,17 @@ fn classify_vcf_stats(stats: &SexStats) -> SexInference { format!("x_haploid_gt_sites={}", stats.x_haploid_gt_sites), format!("x_diploid_gt_sites={}", stats.x_diploid_gt_sites), format!("x_het_gt_sites={}", stats.x_het_gt_sites), + format!("x_het_pct={x_het_pct:.2}"), format!("called_y_snps={}", stats.called_y_snps), + format!("y_to_x_pct={y_to_x_pct:.2}"), ], } } +fn count_as_f64(count: usize) -> f64 { + f64::from(u32::try_from(count).unwrap_or(u32::MAX)) +} + fn y_fingerprint_evidence(stats: &SexStats) -> Vec { let mut evidence = vec![ format!("total_y_snps={}", stats.total_y_snps), diff --git a/rust/bioscript-formats/tests/file_formats/cram.rs b/rust/bioscript-formats/tests/file_formats/cram.rs index a0b38d6..670e3d8 100644 --- a/rust/bioscript-formats/tests/file_formats/cram.rs +++ b/rust/bioscript-formats/tests/file_formats/cram.rs @@ -517,6 +517,55 @@ fn cram_md5_mismatch_is_tolerated_and_returns_correct_result() { ); } +#[test] +fn cram_md5_mismatch_still_decodes_chr6_pgx_loci() { + let Some(fx) = cram_fixture_or_skip("cram_md5_mismatch_still_decodes_chr6_pgx_loci") else { + return; + }; + let store = open_cram_store_with_md5_policy(&fx, true); + + for (rsid, start, reference, alternate, expected_ref_base) in [ + ("rs1360780", 35_639_794, "T", "C", "T"), + ("rs4713916", 35_702_206, "A", "G", "A"), + ("rs1799971", 154_039_662, "A", "G", "A"), + ] { + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec![rsid.to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "6".to_owned(), + start, + end: start, + }), + reference: Some(reference.to_owned()), + alternate: Some(alternate.to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .unwrap_or_else(|err| { + panic!("{rsid} lookup should decode despite MD5 mismatch: {err:?}") + }); + + let depth = observation.depth.unwrap_or(0); + assert!( + depth >= 10, + "{rsid} should have CRAM coverage after lenient MD5 decode, got depth {depth}: {:?}", + observation.evidence + ); + assert!( + !observation.raw_counts.contains_key("C") || rsid != "rs1799971", + "{rsid} should not decode reference-like rs1799971 bases as previous-base C: {:?}", + observation.raw_counts + ); + assert!( + observation.raw_counts.contains_key(expected_ref_base) + || observation.ref_count.unwrap_or(0) == 0, + "{rsid} should include expected reference base {expected_ref_base} when reference reads are present: {:?}", + observation.raw_counts + ); + } +} + #[test] fn cram_rs9357296_reports_heterozygous_counts_for_na06985() { let Some(fx) = cram_fixture_or_skip("cram_rs9357296_reports_heterozygous_counts_for_na06985") diff --git a/rust/bioscript-formats/tests/inspect.rs b/rust/bioscript-formats/tests/inspect.rs index dc81a42..fd10d36 100644 --- a/rust/bioscript-formats/tests/inspect.rs +++ b/rust/bioscript-formats/tests/inspect.rs @@ -6,7 +6,10 @@ use std::{ }; use bioscript_core::Assembly; -use bioscript_formats::{DetectedKind, FileContainer, InspectOptions, inspect_bytes, inspect_file}; +use bioscript_formats::{ + DetectedKind, FileContainer, InferredSex, InspectOptions, SexDetectionConfidence, + inspect_bytes, inspect_file, +}; use zip::write::SimpleFileOptions; fn repo_root() -> PathBuf { @@ -51,6 +54,15 @@ fn shared_fixture_or_skip(test_name: &str, relative: &str) -> Option { Some(path) } +fn require_large_tests(test_name: &str) -> bool { + if env::var_os("BIOSCRIPT_RUN_LARGE_TESTS").is_some() { + true + } else { + eprintln!("skipping {test_name}: set BIOSCRIPT_RUN_LARGE_TESTS=1 to enable"); + false + } +} + fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -481,6 +493,110 @@ fn chr_y_cram_fixture_reports_index_without_decoding_entire_file() { assert!(elapsed < 1000); } +#[test] +fn na06985_vcf_sex_detection_uses_non_par_x_over_y_calls() { + let test_name = "na06985_vcf_sex_detection_uses_non_par_x_over_y_calls"; + let Some(path) = shared_fixture_or_skip(test_name, "1k-genomes/vcf/NA06985.clean.vcf.gz") + else { + return; + }; + let index_path = shared_fixture_or_skip(test_name, "1k-genomes/vcf/NA06985.clean.vcf.gz.tbi"); + + let inspection = inspect_file( + &path, + &InspectOptions { + input_index: index_path, + detect_sex: true, + ..InspectOptions::default() + }, + ) + .unwrap(); + let inferred = inspection.inferred_sex.expect("sex inference"); + + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inferred.sex, InferredSex::Female); + assert_eq!(inferred.confidence, SexDetectionConfidence::High); + assert_eq!(inferred.method, "vcf_non_par_x_gt_y_count"); + assert!( + inferred + .evidence + .iter() + .any(|item| item.starts_with("x_het_pct=")), + "missing X heterozygosity evidence: {:?}", + inferred.evidence + ); + assert!( + inferred + .evidence + .iter() + .any(|item| item == "called_y_snps=2924"), + "missing conflicting Y evidence: {:?}", + inferred.evidence + ); +} + +#[test] +fn na06985_cram_sex_detection_uses_depth_ratios_not_y_presence() { + let test_name = "na06985_cram_sex_detection_uses_depth_ratios_not_y_presence"; + if !require_large_tests(test_name) { + return; + } + let Some(path) = shared_fixture_or_skip(test_name, "1k-genomes/aligned/NA06985.final.cram") + else { + return; + }; + let Some(input_index) = + shared_fixture_or_skip(test_name, "1k-genomes/aligned/NA06985.final.cram.crai") + else { + return; + }; + let Some(reference_file) = shared_fixture_or_skip( + test_name, + "1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa", + ) else { + return; + }; + let Some(reference_index) = shared_fixture_or_skip( + test_name, + "1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai", + ) else { + return; + }; + + let inspection = inspect_file( + &path, + &InspectOptions { + input_index: Some(input_index), + reference_file: Some(reference_file), + reference_index: Some(reference_index), + detect_sex: true, + }, + ) + .unwrap(); + let inferred = inspection.inferred_sex.expect("sex inference"); + + assert_eq!(inspection.detected_kind, DetectedKind::AlignmentCram); + assert_eq!(inferred.sex, InferredSex::Female); + assert_eq!(inferred.confidence, SexDetectionConfidence::High); + assert_eq!(inferred.method, "alignment_autosome_x_y_depth_ratio"); + assert!( + inferred + .evidence + .iter() + .any(|item| item.starts_with("x_to_autosome_ratio=")), + "missing X/autosome ratio evidence: {:?}", + inferred.evidence + ); + assert!( + inferred + .evidence + .iter() + .any(|item| item.starts_with("y_to_autosome_ratio=")), + "missing Y/autosome ratio evidence: {:?}", + inferred.evidence + ); +} + #[test] fn inspect_file_reports_bam_cram_and_reference_indexes_without_decoding() { let dir = temp_dir("index-detection"); diff --git a/rust/bioscript-schema/src/validator/spec.rs b/rust/bioscript-schema/src/validator/spec.rs index 0abc368..a778953 100644 --- a/rust/bioscript-schema/src/validator/spec.rs +++ b/rust/bioscript-schema/src/validator/spec.rs @@ -25,6 +25,8 @@ pub(crate) fn variant_spec_from_root(root: &Value) -> Result Result Option { + scalar_at(root, &["coordinates", assembly, "assembly_ref"]) +} + fn preferred_alternate_from_root(root: &Value) -> Option { let alts = seq_of_strings(root, &["alleles", "alts"])?; if let Some(finding_alt) = first_specific_finding_alt(root) diff --git a/rust/bioscript-schema/src/validator/variant.rs b/rust/bioscript-schema/src/validator/variant.rs index 4a5ced5..f5fedc0 100644 --- a/rust/bioscript-schema/src/validator/variant.rs +++ b/rust/bioscript-schema/src/validator/variant.rs @@ -132,6 +132,28 @@ pub(crate) fn validate_coordinates(root: &Value, issues: &mut Vec) { } else { validate_coordinate_range(coord, assembly, issues); } + validate_assembly_ref(coord, assembly, issues); + } +} + +fn validate_assembly_ref(coord: &Mapping, assembly: &str, issues: &mut Vec) { + let Some(value) = coord.get(Value::String("assembly_ref".to_owned())) else { + return; + }; + let Some(reference) = value.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.assembly_ref"), + message: "expected string".to_owned(), + }); + return; + }; + if !is_base_allele(reference) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.assembly_ref"), + message: "expected A, C, G, or T".to_owned(), + }); } } diff --git a/rust/bioscript-schema/src/validator_panel.rs b/rust/bioscript-schema/src/validator_panel.rs index e1daccb..ba6f9e5 100644 --- a/rust/bioscript-schema/src/validator_panel.rs +++ b/rust/bioscript-schema/src/validator_panel.rs @@ -414,6 +414,8 @@ fn variant_spec_from_root(root: &Value) -> Result { rsids, grch37, grch38, + grch37_assembly_ref: scalar_at(root, &["coordinates", "grch37", "assembly_ref"]), + grch38_assembly_ref: scalar_at(root, &["coordinates", "grch38", "assembly_ref"]), reference, alternate, kind,