diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index 2f585b0..c84d927 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -8,6 +8,7 @@ include!("package.rs"); include!("report_review.rs"); include!("report_execution.rs"); include!("report_observations.rs"); +include!("report_observation_json.rs"); include!("report_findings.rs"); include!("report_matching.rs"); include!("report_output.rs"); diff --git a/rust/bioscript-cli/src/report_findings.rs b/rust/bioscript-cli/src/report_findings.rs index 93bd872..c54e835 100644 --- a/rust/bioscript-cli/src/report_findings.rs +++ b/rust/bioscript-cli/src/report_findings.rs @@ -273,22 +273,70 @@ mod report_observations_tests { ); } + #[test] + fn deletion_copy_number_calls_are_normalized_from_insertion_deletion_tokens() { + assert_eq!( + normalize_app_genotype( + "DI", + "TTATAA", + "", + Some(bioscript_core::VariantKind::Deletion), + "22", + None + ), + ("0/1".to_owned(), "het".to_owned()) + ); + } + + #[test] + fn cram_long_deletion_copy_number_calls_are_displayed_as_insertion_deletion_tokens() { + let mut row = BTreeMap::new(); + row.insert("backend".to_owned(), "cram".to_owned()); + let manifest = bioscript_schema::VariantManifest { + path: std::path::PathBuf::new(), + name: "apol1_g2".to_owned(), + tags: Vec::new(), + spec: bioscript_core::VariantSpec { + reference: Some("TTATAA".to_owned()), + alternate: Some("".to_owned()), + kind: Some(bioscript_core::VariantKind::Deletion), + ..bioscript_core::VariantSpec::default() + }, + }; + + assert_eq!( + deletion_copy_number_display(&row, &manifest, Some(53), Some(0)).as_deref(), + Some("II") + ); + assert_eq!( + normalize_app_genotype( + "II", + "TTATAA", + "", + Some(bioscript_core::VariantKind::Deletion), + "22", + None + ), + ("0/0".to_owned(), "hom_ref".to_owned()) + ); + } + #[test] fn single_allele_sex_chromosome_calls_are_treated_as_hemizygous() { assert_eq!( - normalize_app_genotype("G", "C", "G", "X", None), + normalize_app_genotype("G", "C", "G", None, "X", None), ("1".to_owned(), "hem_alt".to_owned()) ); assert_eq!( - normalize_app_genotype("C", "C", "G", "chrX", None), + normalize_app_genotype("C", "C", "G", None, "chrX", None), ("0".to_owned(), "hem_ref".to_owned()) ); assert_eq!( - normalize_app_genotype("G", "C", "G", "1", None), + normalize_app_genotype("G", "C", "G", None, "1", None), ("G".to_owned(), "unknown".to_owned()) ); assert_eq!( - normalize_app_genotype("GG", "C", "G", "X", None), + normalize_app_genotype("GG", "C", "G", None, "X", None), ("1/1".to_owned(), "hom_alt".to_owned()) ); } @@ -302,11 +350,11 @@ mod report_observations_tests { evidence: vec!["called_y_snps=1200".to_owned()], }; assert_eq!( - normalize_app_genotype("GG", "C", "G", "X", Some(&inferred_sex)), + normalize_app_genotype("GG", "C", "G", None, "X", Some(&inferred_sex)), ("1".to_owned(), "hem_alt".to_owned()) ); assert_eq!( - normalize_app_genotype("CC", "C", "G", "chrX", Some(&inferred_sex)), + normalize_app_genotype("CC", "C", "G", None, "chrX", Some(&inferred_sex)), ("0".to_owned(), "hem_ref".to_owned()) ); } diff --git a/rust/bioscript-cli/src/report_html_analysis.rs b/rust/bioscript-cli/src/report_html_analysis.rs index b667dc5..573b77b 100644 --- a/rust/bioscript-cli/src/report_html_analysis.rs +++ b/rust/bioscript-cli/src/report_html_analysis.rs @@ -1,6 +1,7 @@ fn render_analysis_tables( out: &mut String, analyses: &[serde_json::Value], + observations: &[serde_json::Value], show_participant_id: bool, ) { if analyses.is_empty() { @@ -10,6 +11,7 @@ fn render_analysis_tables( for (index, analysis) in analyses.iter().enumerate() { let table_id = format!("analysis-table-{index}"); let title = analysis_title(analysis); + let weak_indel_dependency = analysis_depends_on_weak_observation(analysis, observations); let row_count = analysis .get("rows") .and_then(serde_json::Value::as_array) @@ -37,6 +39,7 @@ fn render_analysis_tables( out.push_str("

Results

"); render_analysis_key_values(out, analysis, &rows[0], &headers); render_analysis_notes(out, ¬es); + render_weak_indel_analysis_note(out, weak_indel_dependency); out.push_str(""); continue; } @@ -57,6 +60,7 @@ fn render_analysis_tables( } render_table_end(out); render_analysis_notes(out, ¬es); + render_weak_indel_analysis_note(out, weak_indel_dependency); out.push_str(""); } } @@ -182,7 +186,7 @@ fn render_analysis_value(key: &str, value: &str) -> String { html_escape(value) ); } - if key.ends_with("_status") || key.ends_with("_outcome") { + if is_analysis_badge_key(key) { format!( "{}", analysis_badge_class(value), @@ -193,6 +197,10 @@ fn render_analysis_value(key: &str, value: &str) -> String { } } +fn is_analysis_badge_key(key: &str) -> bool { + key.ends_with("_status") || key.ends_with("_outcome") +} + fn analysis_badge_class(value: &str) -> &'static str { match value { "normal" | "reference" => "analysis-badge-normal", @@ -214,6 +222,49 @@ fn render_analysis_notes(out: &mut String, notes: &[String]) { out.push_str(""); } +fn render_weak_indel_analysis_note(out: &mut String, weak_indel_dependency: bool) { + if !weak_indel_dependency { + return; + } + out.push_str("

Notes

* Result depends on a weak indel match from a consumer genotype file.

"); +} + +fn analysis_depends_on_weak_observation( + analysis: &serde_json::Value, + observations: &[serde_json::Value], +) -> bool { + let weak_paths = observations + .iter() + .filter(|observation| analysis_observation_is_weak_indel_match(observation)) + .filter_map(|observation| { + observation + .get("variant_path") + .and_then(serde_json::Value::as_str) + }) + .collect::>(); + if weak_paths.is_empty() { + return false; + } + analysis + .get("derived_from") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + .any(|derived| { + weak_paths + .iter() + .any(|path| path.ends_with(derived) || derived.ends_with(path)) + }) +} + +fn analysis_observation_is_weak_indel_match(observation: &serde_json::Value) -> bool { + observation + .get("match_quality") + .and_then(serde_json::Value::as_str) + == Some("weak") +} + fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { let Some(logic) = analysis.get("logic") else { return; @@ -249,4 +300,3 @@ fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { } out.push_str(""); } - diff --git a/rust/bioscript-cli/src/report_html_helpers.rs b/rust/bioscript-cli/src/report_html_helpers.rs index 65a6ae6..57bc8df 100644 --- a/rust/bioscript-cli/src/report_html_helpers.rs +++ b/rust/bioscript-cli/src/report_html_helpers.rs @@ -32,6 +32,8 @@ fn is_debug_column(header: &str) -> bool { | "genotype_quality" | "evidence_type" | "evidence_raw" + | "match_quality" + | "match_notes" | "assay_id" | "assay_version" | "variant_key" @@ -69,6 +71,8 @@ fn table_header_label(header: &str) -> String { "evidence_type" => "Evidence Type".to_owned(), "source" => "Source".to_owned(), "evidence_raw" => "Evidence Raw".to_owned(), + "match_quality" => "Match Quality".to_owned(), + "match_notes" => "Match Notes".to_owned(), "facets" => "Facets".to_owned(), "assay_id" => "Assay ID".to_owned(), "assay_version" => "Assay Version".to_owned(), diff --git a/rust/bioscript-cli/src/report_html_observations.rs b/rust/bioscript-cli/src/report_html_observations.rs index f5946b4..500e376 100644 --- a/rust/bioscript-cli/src/report_html_observations.rs +++ b/rust/bioscript-cli/src/report_html_observations.rs @@ -25,6 +25,8 @@ fn render_observation_table( "allele_balance", "evidence_type", "evidence_raw", + "match_quality", + "match_notes", "facets", "assay_id", "assay_version", @@ -41,7 +43,12 @@ fn render_observation_table( let show_facets = observations .iter() .any(|observation| !json_field_as_tsv(observation.get("facets")).is_empty()); + let show_match_quality = observations.iter().any(|observation| { + !json_field_as_tsv(observation.get("match_quality")).is_empty() + || !json_field_as_tsv(observation.get("match_notes")).is_empty() + }); let show_imputed_reference_note = observations.iter().any(observation_is_imputed_vcf_reference); + let show_weak_indel_note = observations.iter().any(observation_is_weak_indel_match); let headers = all_headers .iter() .copied() @@ -50,6 +57,9 @@ fn render_observation_table( show_counts || !matches!(*header, "ref_count" | "alt_count" | "depth" | "allele_balance") }) .filter(|header| show_genotype_quality || *header != "genotype_quality") + .filter(|header| { + show_match_quality || !matches!(*header, "match_quality" | "match_notes") + }) .filter(|header| show_facets || *header != "facets") .collect::>(); render_table_start(out, "observations-table", &headers); @@ -70,6 +80,9 @@ fn render_observation_table( if show_imputed_reference_note { out.push_str("

* In variant-only VCF inputs, absent queried variant rows are shown as imputed reference genotypes. This is usually appropriate for variant-only VCFs, but it may be wrong if the VCF omits loci for another reason.

"); } + if show_weak_indel_note { + out.push_str("

* Indel calls from consumer genotype files are weak matches: the file reports an insertion/deletion token at the marker, but does not provide sequence-resolved evidence for the exact deletion allele.

"); + } } fn observation_filter_group(observation: &serde_json::Value) -> &'static str { @@ -131,7 +144,9 @@ fn render_observation_cell(out: &mut String, observation: &serde_json::Value, he let cell_class = table_column_class(header); if header == "outcome" { let mut value = json_field_as_tsv(observation.get(header)); - if value == "reference" && observation_is_imputed_vcf_reference(observation) { + if (value == "reference" && observation_is_imputed_vcf_reference(observation)) + || (value == "variant" && observation_is_weak_indel_match(observation)) + { value.push('*'); } let _ = write!(out, "{}", cell_class, html_escape(&value)); @@ -205,6 +220,13 @@ fn observation_is_imputed_vcf_reference(observation: &serde_json::Value) -> bool }) } +fn observation_is_weak_indel_match(observation: &serde_json::Value) -> bool { + observation + .get("match_quality") + .and_then(serde_json::Value::as_str) + == Some("weak") +} + fn observation_ref_alt(observation: &serde_json::Value) -> String { let ref_allele = observation .get("ref") diff --git a/rust/bioscript-cli/src/report_observation_json.rs b/rust/bioscript-cli/src/report_observation_json.rs new file mode 100644 index 0000000..0c36bc3 --- /dev/null +++ b/rust/bioscript-cli/src/report_observation_json.rs @@ -0,0 +1,142 @@ +struct AppObservationJson { + allele_balance: Option, + alt_count: Option, + assay_id: String, + assembly: String, + call: ObservationCallValues, + chrom: String, + depth: Option, + evidence_raw: String, + gene: String, + genotype: String, + genotype_display: String, + kind: String, + locus: Option, + manifest: bioscript_schema::VariantManifest, + non_reportable_status: Option<&'static str>, + observed_alt_alleles: Vec, + ref_allele: String, + ref_count: Option, + reportable_alt: String, + row: BTreeMap, + row_path: String, + source: serde_json::Value, + weak_indel_match: bool, + zygosity: String, +} + +struct ObservationCallValues { + outcome: &'static str, + status: &'static str, + reported_genotype_display: String, +} + +fn observation_call_values( + depth: Option, + non_reportable_status: Option<&'static str>, + genotype: &str, + zygosity: &str, + genotype_display: &str, +) -> ObservationCallValues { + let outcome = if depth == Some(0) { + "not_covered" + } else if non_reportable_status == Some("observed_alt") { + "observed_alt" + } else if non_reportable_status == Some("unknown_alt") { + "unknown_alt" + } else if genotype == "./." { + "no_call" + } else if zygosity == "hom_ref" || zygosity == "hem_ref" { + "reference" + } else if zygosity == "het" || zygosity == "hom_alt" || zygosity == "hem_alt" { + "variant" + } else { + "unknown" + }; + let status = if matches!(outcome, "observed_alt" | "unknown_alt") { + outcome + } else if genotype == "./." { + "no_call" + } else { + "called" + }; + let reported_genotype_display = if matches!(zygosity, "hem_ref" | "hem_alt") { + hemizygous_display_genotype(genotype_display) + } else if genotype_display.is_empty() && matches!(outcome, "no_call" | "not_covered") { + "??".to_owned() + } else { + genotype_display.to_owned() + }; + ObservationCallValues { + outcome, + status, + reported_genotype_display, + } +} + +fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { + let AppObservationJson { + allele_balance, + alt_count, + assay_id, + assembly, + call, + chrom, + depth, + evidence_raw, + gene, + genotype, + genotype_display, + kind, + locus, + manifest, + non_reportable_status, + observed_alt_alleles, + ref_allele, + ref_count, + reportable_alt, + row, + row_path, + source, + weak_indel_match, + zygosity, + } = input; + serde_json::json!({ + "participant_id": row.get("participant_id").cloned().unwrap_or_default(), + "assay_id": assay_id, + "assay_version": "1.0", + "variant_key": manifest.name, + "variant_path": row_path, + "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), + "gene": gene, + "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, + "chrom": chrom, + "pos_start": locus.as_ref().map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), + "pos_end": locus.as_ref().map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), + "ref": ref_allele, + "alt": reportable_alt, + "kind": kind, + "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, + "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), + "call_status": call.status, + "genotype": genotype, + "genotype_display": call.reported_genotype_display, + "zygosity": zygosity, + "ref_count": ref_count, + "alt_count": alt_count, + "depth": depth, + "genotype_quality": serde_json::Value::Null, + "allele_balance": allele_balance, + "outcome": call.outcome, + "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, + "evidence_raw": evidence_raw, + "source": source, + "match_quality": if weak_indel_match { serde_json::Value::String("weak".to_owned()) } else { serde_json::Value::Null }, + "match_notes": if weak_indel_match { + serde_json::Value::String("consumer genotype file reported an insertion/deletion token at the marker, not sequence-resolved evidence for the exact deletion allele".to_owned()) + } else { + serde_json::Value::Null + }, + "facets": observation_facets(non_reportable_status, &observed_alt_alleles), + }) +} diff --git a/rust/bioscript-cli/src/report_observations.rs b/rust/bioscript-cli/src/report_observations.rs index dd155fa..50c59bb 100644 --- a/rust/bioscript-cli/src/report_observations.rs +++ b/rust/bioscript-cli/src/report_observations.rs @@ -16,7 +16,7 @@ fn app_observation_from_manifest_row( let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); let reportable_alt = manifest.spec.alternate.clone().unwrap_or_default(); let observed_alt_alleles = variant_observed_alt_alleles(&manifest_path)?; - let genotype_display = row + let mut genotype_display = row .get("genotype") .filter(|value| !value.is_empty()) .cloned() @@ -25,6 +25,12 @@ fn app_observation_from_manifest_row( let depth = parse_optional_u32(row.get("depth")); let ref_count = parse_optional_u32(row.get("ref_count")); let alt_count = parse_optional_u32(row.get("alt_count")); + if let Some(normalized_display) = + deletion_copy_number_display(row, &manifest, depth, alt_count) + { + genotype_display = normalized_display; + } + let weak_indel_match = is_weak_delimited_indel_match(row, &manifest, &genotype_display); let allele_balance = match (alt_count, depth) { (Some(alt_count), Some(depth)) if depth > 0 => { Some(f64::from(alt_count) / f64::from(depth)) @@ -51,6 +57,7 @@ fn app_observation_from_manifest_row( &genotype_display, &ref_allele, &reportable_alt, + manifest.spec.kind, &chrom, inferred_sex, ); @@ -65,89 +72,38 @@ fn app_observation_from_manifest_row( ); let evidence_raw = observation_evidence_raw(row, &chrom, inferred_sex); let source = variant_primary_source(&manifest_path)?; - Ok(serde_json::json!({ - "participant_id": row.get("participant_id").cloned().unwrap_or_default(), - "assay_id": assay_id, - "assay_version": "1.0", - "variant_key": manifest.name, - "variant_path": row_path, - "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), - "gene": gene, - "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, - "chrom": chrom, - "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), - "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), - "ref": ref_allele, - "alt": reportable_alt, - "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), - "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, - "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), - "call_status": call.status, - "genotype": genotype, - "genotype_display": call.reported_genotype_display, - "zygosity": zygosity, - "ref_count": ref_count, - "alt_count": alt_count, - "depth": depth, - "genotype_quality": serde_json::Value::Null, - "allele_balance": allele_balance, - "outcome": call.outcome, - "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, - "evidence_raw": evidence_raw, - "source": source, - "facets": observation_facets(non_reportable_status, &observed_alt_alleles), + let kind = manifest + .spec + .kind + .map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()); + Ok(render_app_observation_json(AppObservationJson { + allele_balance, + alt_count, + assay_id: assay_id.to_owned(), + assembly, + call, + chrom, + depth, + evidence_raw, + gene, + genotype, + genotype_display, + kind, + locus: locus.cloned(), + manifest, + non_reportable_status, + observed_alt_alleles, + ref_allele, + ref_count, + reportable_alt, + row: row.clone(), + row_path, + source, + weak_indel_match, + zygosity, })) } -struct ObservationCallValues { - outcome: &'static str, - status: &'static str, - reported_genotype_display: String, -} - -fn observation_call_values( - depth: Option, - non_reportable_status: Option<&'static str>, - genotype: &str, - zygosity: &str, - genotype_display: &str, -) -> ObservationCallValues { - let outcome = if depth == Some(0) { - "not_covered" - } else if non_reportable_status == Some("observed_alt") { - "observed_alt" - } else if non_reportable_status == Some("unknown_alt") { - "unknown_alt" - } else if genotype == "./." { - "no_call" - } else if zygosity == "hom_ref" || zygosity == "hem_ref" { - "reference" - } else if zygosity == "het" || zygosity == "hom_alt" || zygosity == "hem_alt" { - "variant" - } else { - "unknown" - }; - let status = if matches!(outcome, "observed_alt" | "unknown_alt") { - outcome - } else if genotype == "./." { - "no_call" - } else { - "called" - }; - let reported_genotype_display = if matches!(zygosity, "hem_ref" | "hem_alt") { - hemizygous_display_genotype(genotype_display) - } else if genotype_display.is_empty() && matches!(outcome, "no_call" | "not_covered") { - "??".to_owned() - } else { - genotype_display.to_owned() - }; - ObservationCallValues { - outcome, - status, - reported_genotype_display, - } -} - fn assembly_row_value(assembly: bioscript_core::Assembly) -> String { match assembly { bioscript_core::Assembly::Grch37 => "grch37".to_owned(), @@ -162,6 +118,35 @@ fn hemizygous_display_genotype(display: &str) -> String { .map_or_else(|| display.to_owned(), |allele| allele.to_string()) } +fn deletion_copy_number_display( + row: &BTreeMap, + manifest: &bioscript_schema::VariantManifest, + depth: Option, + alt_count: Option, +) -> Option { + if !matches!(manifest.spec.kind, Some(bioscript_core::VariantKind::Deletion)) { + return None; + } + if row.get("backend").map(String::as_str) != Some("cram") { + return None; + } + if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { + return None; + } + let depth = depth?; + if depth == 0 { + return None; + } + let alt_fraction = f64::from(alt_count.unwrap_or(0)) / f64::from(depth); + if alt_fraction >= 0.8 { + Some("DD".to_owned()) + } else if alt_fraction <= 0.2 { + Some("II".to_owned()) + } else { + Some("DI".to_owned()) + } +} + fn variant_primary_source(path: &Path) -> Result { let value = load_yaml_value(path)?; let mut links = BTreeMap::::new(); @@ -236,12 +221,22 @@ fn normalize_app_genotype( display: &str, ref_allele: &str, alt_allele: &str, + kind: Option, chrom: &str, inferred_sex: Option<&SexInference>, ) -> (String, String) { if display.is_empty() { return ("./.".to_owned(), "unknown".to_owned()); } + if matches!(kind, Some(bioscript_core::VariantKind::Deletion)) + && ref_allele.len() != 1 + && display + .chars() + .filter(char::is_ascii_alphabetic) + .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) + { + return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); + } let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); if ref_allele.len() != 1 || alt_allele.len() != 1 { return (display.to_owned(), "unknown".to_owned()); @@ -415,21 +410,44 @@ fn classify_non_reportable_alleles( } } +fn is_weak_delimited_indel_match( + row: &BTreeMap, + manifest: &bioscript_schema::VariantManifest, + genotype_display: &str, +) -> bool { + if !matches!(manifest.spec.kind, Some(bioscript_core::VariantKind::Deletion)) { + return false; + } + if !matches!(row.get("backend").map(String::as_str), Some("text" | "zip")) { + return false; + } + if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { + return false; + } + genotype_display + .chars() + .filter(char::is_ascii_alphabetic) + .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) +} + fn observation_facets( non_reportable_status: Option<&str>, observed_alts: &[String], ) -> serde_json::Value { - let Some(status) = non_reportable_status else { - return serde_json::Value::Null; - }; - let mut facets = vec![status.to_owned()]; - if status == "observed_alt" && !observed_alts.is_empty() { - facets.push(format!("known_observed_alts={}", observed_alts.join(","))); + let mut facets = Vec::new(); + if let Some(status) = non_reportable_status { + facets.push(status.to_owned()); + if status == "observed_alt" && !observed_alts.is_empty() { + facets.push(format!("known_observed_alts={}", observed_alts.join(","))); + } + } + if facets.is_empty() { + serde_json::Value::Null + } else { + serde_json::Value::String(facets.join(";")) } - serde_json::Value::String(facets.join(";")) } fn parse_optional_u32(value: Option<&String>) -> Option { value.and_then(|value| value.parse::().ok()) } - diff --git a/rust/bioscript-cli/src/report_output.rs b/rust/bioscript-cli/src/report_output.rs index 7f2eeb4..2b74dc2 100644 --- a/rust/bioscript-cli/src/report_output.rs +++ b/rust/bioscript-cli/src/report_output.rs @@ -301,6 +301,7 @@ fn write_app_html( ); let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); + let has_pgx_findings = !label_findings.is_empty() || !summary_findings.is_empty(); let analysis_outputs = collect_report_analyses(reports); let participants = collect_report_participants(reports); render_report_manifest_header(&mut out, reports); @@ -313,7 +314,11 @@ fn write_app_html( summary_findings.len() ); render_participant_filter(&mut out, &participants); - out.push_str(""); + out.push_str(""); out.push_str("

Input

"); render_input_debug(&mut out, reports, participants.len() > 1); out.push_str("
"); @@ -321,11 +326,18 @@ fn write_app_html( render_observation_table(&mut out, observations, participants.len() > 1); out.push_str(""); out.push_str("

Analysis

"); - render_analysis_tables(&mut out, &analysis_outputs, participants.len() > 1); - out.push_str("
"); - out.push_str("

PGx

"); - render_pgx_table(&mut out, &label_findings, &summary_findings); + render_analysis_tables( + &mut out, + &analysis_outputs, + observations, + participants.len() > 1, + ); out.push_str("
"); + if has_pgx_findings { + out.push_str("

PGx

"); + render_pgx_table(&mut out, &label_findings, &summary_findings); + out.push_str("
"); + } out.push_str("

Provenance

"); render_provenance_links(&mut out, reports); out.push_str("
"); diff --git a/rust/bioscript-core/src/observation.rs b/rust/bioscript-core/src/observation.rs index eb9d658..40e7da9 100644 --- a/rust/bioscript-core/src/observation.rs +++ b/rust/bioscript-core/src/observation.rs @@ -89,10 +89,12 @@ pub struct ObservationRecord { pub outcome: ObservationOutcome, pub evidence_type: Option, pub evidence_raw: Option, + pub match_quality: Option, + pub match_notes: Option, pub facets: Option, } -pub const OBSERVATION_TSV_HEADERS: [&str; 27] = [ +pub const OBSERVATION_TSV_HEADERS: [&str; 29] = [ "participant_id", "assay_id", "assay_version", @@ -119,5 +121,7 @@ pub const OBSERVATION_TSV_HEADERS: [&str; 27] = [ "outcome", "evidence_type", "evidence_raw", + "match_quality", + "match_notes", "facets", ]; diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 0fb784e..a1979a5 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -1234,13 +1234,6 @@ mod tests { kind: Some(VariantKind::Snp), ..VariantSpec::default() }, - VariantSpec { - grch38: Some(locus("1", 10, 10)), - reference: Some("A".to_owned()), - alternate: Some("G".to_owned()), - kind: Some(VariantKind::Deletion), - ..VariantSpec::default() - }, ] { assert!( lookup_indexed_vcf_variants(&indexed, &[variant]) diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index bf001ee..3ff9ad9 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -17,9 +17,11 @@ use super::{ describe_query, genotype_from_vcf_gt, is_bgzf_path, types::VcfBackend, variant_sort_key, }; +mod indexed; mod matching; mod reader; +use indexed::observe_indexed_vcf_variant; pub(crate) use matching::{ choose_variant_locus_for_assembly, normalize_chromosome_name, vcf_row_matches_variant, }; @@ -206,16 +208,13 @@ pub(crate) fn lookup_indexed_vcf_variants( let Some(locus) = choose_variant_locus_for_assembly(variant, detected_assembly) else { return Ok(None); }; - let Some(reference) = first_single_base_allele(variant.reference.as_deref()) else { - return Ok(None); - }; - let Some(alternate) = first_single_base_allele(variant.alternate.as_deref()) else { - return Ok(None); - }; - if !matches!(variant.kind, None | Some(VariantKind::Snp)) { + if matches!(variant.kind, None | Some(VariantKind::Snp)) + && (first_single_base_allele(variant.reference.as_deref()).is_none() + || first_single_base_allele(variant.alternate.as_deref()).is_none()) + { return Ok(None); } - indexed_variants.push((idx, variant, locus, reference, alternate)); + indexed_variants.push((idx, variant, locus)); } let tabix_index = alignment::parse_tbi_bytes(&std::fs::read(input_index).map_err(|err| { @@ -235,16 +234,25 @@ pub(crate) fn lookup_indexed_vcf_variants( ); let mut results = vec![VariantObservation::default(); variants.len()]; - for (idx, variant, locus, reference, alternate) in indexed_variants { - let observation = observe_vcf_snp_with_reader( - &mut indexed, - &backend.path.display().to_string(), - &locus, - reference, - alternate, - variant.rsids.first().cloned(), - detected_assembly, - )?; + let label = backend.path.display().to_string(); + for (idx, variant, locus) in indexed_variants { + let observation = if let (Some(reference), Some(alternate), true) = ( + first_single_base_allele(variant.reference.as_deref()), + first_single_base_allele(variant.alternate.as_deref()), + matches!(variant.kind, None | Some(VariantKind::Snp)), + ) { + observe_vcf_snp_with_reader( + &mut indexed, + &label, + &locus, + reference, + alternate, + variant.rsids.first().cloned(), + detected_assembly, + )? + } else { + observe_indexed_vcf_variant(&mut indexed, &label, variant, &locus, detected_assembly)? + }; results[idx] = if backend.options.impute_vcf_missing_as_reference && observation.genotype.is_none() && observation diff --git a/rust/bioscript-formats/src/genotype/vcf/indexed.rs b/rust/bioscript-formats/src/genotype/vcf/indexed.rs new file mode 100644 index 0000000..deaa6e2 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/vcf/indexed.rs @@ -0,0 +1,120 @@ +use std::fs::File; + +use noodles::bgzf; +use noodles::core::{Position, Region}; +use noodles::csi::{self, BinningIndex}; +use noodles::tabix; + +use bioscript_core::{Assembly, RuntimeError, VariantKind, VariantObservation, VariantSpec}; + +use super::{describe_query, parse_vcf_record, vcf_row_matches_variant}; + +pub(super) fn observe_indexed_vcf_variant( + indexed: &mut csi::io::IndexedReader, tabix::Index>, + label: &str, + variant: &VariantSpec, + locus: &bioscript_core::GenomicLocus, + assembly: Option, +) -> Result { + let query_pos = if matches!( + variant.kind, + Some(VariantKind::Deletion | VariantKind::Insertion | VariantKind::Indel) + ) { + locus.start.saturating_sub(1) + } else { + locus.start + }; + let locus_label = format!("{}:{query_pos}", locus.chrom); + let Some(seq_name) = resolve_vcf_chrom_name(indexed.index(), &locus.chrom) else { + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly, + evidence: vec![format!( + "{label}: tabix index has no contig matching {} (tried chr-prefixed and bare forms)", + locus.chrom + )], + ..VariantObservation::default() + }); + }; + let pos_usize = usize::try_from(query_pos).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF position {query_pos} for {locus_label}: {err}" + )) + })?; + let position = Position::try_from(pos_usize).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF position {query_pos} for {locus_label}: {err}" + )) + })?; + let region = Region::new(seq_name.as_str(), position..=position); + let query = indexed.query(®ion).map_err(|err| { + RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")) + })?; + + let mut saw_any = false; + for record_result in query { + let record = record_result + .map_err(|err| RuntimeError::Io(format!("{label}: tabix record iter: {err}")))?; + let line: &str = record.as_ref(); + let Some(row) = parse_vcf_record(line)? else { + continue; + }; + saw_any = true; + if vcf_row_matches_variant(&row, variant, assembly) { + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: variant.rsids.first().cloned().or_else(|| row.rsid.clone()), + assembly, + genotype: Some(row.genotype.clone()), + evidence: vec![format!("{label}: resolved by indexed locus {locus_label}")], + ..VariantObservation::default() + }); + } + } + + let evidence = if saw_any { + vec![format!( + "{label}: {locus_label} present but no record matched {}", + describe_query(variant) + )] + } else { + vec![format!("{label}: no VCF record at {locus_label}")] + }; + Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly, + evidence, + ..VariantObservation::default() + }) +} + +fn resolve_vcf_chrom_name(index: &tabix::Index, user_chrom: &str) -> Option { + let header = index.header()?; + let names = header.reference_sequence_names(); + + let trimmed = user_chrom.trim(); + let stripped = trimmed.strip_prefix("chr").unwrap_or(trimmed); + + let candidates = [ + trimmed.to_owned(), + stripped.to_owned(), + format!("chr{stripped}"), + ]; + for cand in &candidates { + if names.contains(cand.as_bytes()) { + return Some(cand.clone()); + } + } + + let target = stripped.to_ascii_lowercase(); + for name in names { + let as_str = std::str::from_utf8(name.as_ref()).ok()?; + let as_stripped = as_str.strip_prefix("chr").unwrap_or(as_str); + if as_stripped.eq_ignore_ascii_case(&target) { + return Some(as_str.to_owned()); + } + } + None +} diff --git a/rust/bioscript-formats/src/genotype/vcf/matching.rs b/rust/bioscript-formats/src/genotype/vcf/matching.rs index 719ad9b..2d5bc1d 100644 --- a/rust/bioscript-formats/src/genotype/vcf/matching.rs +++ b/rust/bioscript-formats/src/genotype/vcf/matching.rs @@ -31,12 +31,22 @@ pub(super) fn imputed_reference_observation( inferred_sex: Option, missing_evidence: &str, ) -> Option { - if !matches!(variant.kind, None | Some(VariantKind::Snp)) { - return None; - } - let reference = first_single_base_allele(variant.reference.as_deref())?; - first_single_base_allele(variant.alternate.as_deref())?; - let genotype = reference_genotype_for_locus(reference, locus, inferred_sex); + let genotype = match variant.kind { + None | Some(VariantKind::Snp) => { + let reference = first_single_base_allele(variant.reference.as_deref())?; + first_single_base_allele(variant.alternate.as_deref())?; + reference_genotype_for_locus(reference, locus, inferred_sex) + } + Some(VariantKind::Deletion) => "II".to_owned(), + Some(VariantKind::Insertion | VariantKind::Indel) => { + let reference = variant.reference.as_deref()?; + let alternate = variant.alternate.as_deref()?; + let alternates = [alternate]; + let token = super::super::vcf_tokens::vcf_reference_token(reference, &alternates); + format!("{token}{token}") + } + _ => return None, + }; let evidence_prefix = if missing_evidence.contains(label) { missing_evidence.to_owned() } else { diff --git a/rust/bioscript-formats/tests/file_formats/vcf.rs b/rust/bioscript-formats/tests/file_formats/vcf.rs index cd81acf..be0d526 100644 --- a/rust/bioscript-formats/tests/file_formats/vcf.rs +++ b/rust/bioscript-formats/tests/file_formats/vcf.rs @@ -152,16 +152,51 @@ fn vcf_missing_locus_defaults_to_imputed_reference_with_sex_aware_ploidy() { kind: Some(VariantKind::Snp), ..VariantSpec::default() }, + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 40, + end: 45, + }), + reference: Some("TTATAA".to_owned()), + alternate: Some("".to_owned()), + kind: Some(VariantKind::Deletion), + deletion_length: Some(6), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 50, + end: 50, + }), + reference: Some("A".to_owned()), + alternate: Some("AT".to_owned()), + kind: Some(VariantKind::Insertion), + ..VariantSpec::default() + }, ]) .unwrap(); assert_eq!(observations[0].genotype.as_deref(), Some("CC")); assert_eq!(observations[1].genotype.as_deref(), Some("G")); + assert_eq!(observations[2].genotype.as_deref(), Some("II")); + assert_eq!(observations[3].genotype.as_deref(), Some("DD")); assert!( observations[0].evidence[0].contains("imputed reference genotype"), "{:?}", observations[0].evidence ); + assert!( + observations[2].evidence[0].contains("imputed reference genotype"), + "{:?}", + observations[2].evidence + ); + assert!( + observations[3].evidence[0].contains("imputed reference genotype"), + "{:?}", + observations[3].evidence + ); } #[test]