diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 5475d67..eea489b 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -11,6 +11,7 @@ use zip::ZipArchive; use bioscript_core::{Assembly, GenomicLocus, VariantKind}; use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; +mod backends; mod common; mod cram_backend; mod delimited; @@ -51,12 +52,14 @@ pub use types::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, }; use types::{CramBackend, DelimitedBackend, QueryBackend, RsidMapBackend, VcfBackend}; -pub use vcf::observe_vcf_snp_with_reader; #[cfg(test)] use vcf::{ choose_variant_locus_for_assembly, detect_vcf_assembly, extract_vcf_sample_genotype, normalize_chromosome_name, parse_vcf_record, vcf_row_matches_variant, }; +pub use vcf::{ + imputed_reference_observation, observe_vcf_snp_with_reader, observe_vcf_variant_with_reader, +}; use vcf::{lookup_indexed_vcf_variants, scan_vcf_variants}; use vcf_tokens::genotype_from_vcf_gt; #[cfg(test)] @@ -71,6 +74,38 @@ impl GenotypeStore { Self::from_file_with_options(path, &GenotypeLoadOptions::default()) } + /// Wrap an existing store with a cache of pre-resolved observations. + /// `lookup_variant`/`lookup_variants` consult `observations` first; on + /// miss they delegate to the underlying store. This is how the report + /// pipeline avoids re-walking the genome inside analysis Python scripts — + /// the variants the panel/assays declared have already been observed in + /// `run_manifest_rows`, so the cache hits and the fallback only fires + /// for novel rsids the script asks about on its own. + pub fn with_cached_observations( + observations: Vec, + fallback: GenotypeStore, + ) -> Self { + Self { + backend: QueryBackend::Cached { + observations, + fallback: Box::new(fallback.backend), + }, + } + } + + /// Empty store whose `lookup_*` always returns "no observation". Useful as + /// the wasm-side fallback when paired with `with_cached_observations` — + /// every rsid the panel cares about is already in the cache so the + /// fallback never fires; novel rsids return None gracefully. + pub fn empty() -> Self { + Self { + backend: QueryBackend::RsidMap(RsidMapBackend { + format: GenotypeSourceFormat::Text, + values: HashMap::new(), + }), + } + } + pub fn from_file_with_options( path: &Path, options: &GenotypeLoadOptions, @@ -288,6 +323,15 @@ impl GenotypeStore { rsid_lookup: false, locus_lookup: true, }, + QueryBackend::Cached { .. } => { + // The cache itself answers both rsid and locus queries (we + // match by either), so unioning with the fallback gives the + // strongest set. + BackendCapabilities { + rsid_lookup: true, + locus_lookup: true, + } + } } } @@ -305,6 +349,7 @@ impl GenotypeStore { QueryBackend::Delimited(backend) => backend.backend_name(), QueryBackend::Vcf(backend) => backend.backend_name(), QueryBackend::Cram(backend) => backend.backend_name(), + QueryBackend::Cached { .. } => "cached", } } @@ -319,6 +364,21 @@ impl GenotypeStore { ..VariantSpec::default() }) .map(|obs| obs.genotype), + QueryBackend::Cached { + observations, + fallback, + } => { + if let Some(matched) = observations + .iter() + .find(|obs| obs.matched_rsid.as_deref().is_some_and(|r| r == rsid)) + { + return Ok(matched.genotype.clone()); + } + let inner = GenotypeStore { + backend: (**fallback).clone(), + }; + inner.get(rsid) + } } } @@ -331,6 +391,18 @@ impl GenotypeStore { QueryBackend::Delimited(backend) => backend.lookup_variant(variant), QueryBackend::Vcf(backend) => backend.lookup_variant(variant), QueryBackend::Cram(backend) => backend.lookup_variant(variant), + QueryBackend::Cached { + observations, + fallback, + } => { + if let Some(hit) = match_cached_observation(observations, variant) { + return Ok(hit.clone()); + } + let inner = GenotypeStore { + backend: (**fallback).clone(), + }; + inner.lookup_variant(variant) + } } } @@ -338,6 +410,36 @@ impl GenotypeStore { &self, variants: &[VariantSpec], ) -> Result, RuntimeError> { + if let QueryBackend::Cached { + observations, + fallback, + } = &self.backend + { + // Resolve cache hits up-front; only round-trip the fallback for + // misses so we don't pay for re-opening a CRAM/VCF when the panel + // already covered every variant the script needs. + let mut results: Vec> = vec![None; variants.len()]; + let mut miss_indices = Vec::new(); + let mut miss_specs = Vec::new(); + for (idx, spec) in variants.iter().enumerate() { + if let Some(hit) = match_cached_observation(observations, spec) { + results[idx] = Some(hit.clone()); + } else { + miss_indices.push(idx); + miss_specs.push(spec.clone()); + } + } + if !miss_specs.is_empty() { + let inner = GenotypeStore { + backend: (**fallback).clone(), + }; + let resolved = inner.lookup_variants(&miss_specs)?; + for (idx, observation) in miss_indices.into_iter().zip(resolved) { + results[idx] = Some(observation); + } + } + return Ok(results.into_iter().map(Option::unwrap_or_default).collect()); + } if let QueryBackend::Delimited(backend) = &self.backend { return backend.lookup_variants(variants); } @@ -358,97 +460,44 @@ impl GenotypeStore { } } -impl RsidMapBackend { - fn backend_name(&self) -> &'static str { - match self.format { - GenotypeSourceFormat::Text => "text", - GenotypeSourceFormat::Zip => "zip", - GenotypeSourceFormat::Vcf => "vcf", - GenotypeSourceFormat::Cram => "cram", - GenotypeSourceFormat::Bam => "bam", - } +/// Match a `VariantSpec` against a pre-resolved observation list. Tries rsid +/// equality first (most common case for `PGx` panels), then falls back to a +/// chrom+pos+ref+alt match against either `GRCh37` or `GRCh38` loci so cached +/// observations from a CRAM lookup (which may have been done on one assembly) +/// can satisfy a script that supplies the spec on the other. +fn match_cached_observation<'a>( + observations: &'a [VariantObservation], + spec: &VariantSpec, +) -> Option<&'a VariantObservation> { + if let Some(matched) = observations.iter().find(|obs| { + obs.matched_rsid + .as_deref() + .is_some_and(|rsid| spec.rsids.iter().any(|target| target == rsid)) + }) { + return Some(matched); } - - fn lookup_variant(&self, variant: &VariantSpec) -> Result { - for rsid in &variant.rsids { - if let Some(value) = self.values.get(rsid) { - return Ok(VariantObservation { - backend: self.backend_name().to_owned(), - matched_rsid: Some(rsid.clone()), - genotype: Some(value.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], - ..VariantObservation::default() - }); - } - } - - Ok(VariantObservation { - backend: self.backend_name().to_owned(), - evidence: vec!["no matching rsid found".to_owned()], - ..VariantObservation::default() - }) - } -} - -impl DelimitedBackend { - fn backend_name(&self) -> &'static str { - match self.format { - GenotypeSourceFormat::Text => "text", - GenotypeSourceFormat::Zip => "zip", - GenotypeSourceFormat::Vcf => "vcf", - GenotypeSourceFormat::Cram => "cram", - GenotypeSourceFormat::Bam => "bam", + let assembly_loci = [spec.grch37.as_ref(), spec.grch38.as_ref()]; + let target_ref = spec.reference.as_deref(); + let target_alt = spec.alternate.as_deref(); + observations.iter().find(|obs| { + let Some(loci) = assembly_loci.iter().find_map(|l| *l) else { + return false; + }; + let evidence_match = obs + .evidence + .iter() + .any(|line| line.contains(&loci.chrom) && line.contains(&loci.start.to_string())); + if !evidence_match { + return false; } - } - - fn get(&self, rsid: &str) -> Result, RuntimeError> { - let results = self.lookup_variants(&[VariantSpec { - rsids: vec![rsid.to_owned()], - ..VariantSpec::default() - }])?; - Ok(results.into_iter().next().and_then(|obs| obs.genotype)) - } - - fn lookup_variant(&self, variant: &VariantSpec) -> Result { - let mut results = self.lookup_variants(std::slice::from_ref(variant))?; - Ok(results.pop().unwrap_or_default()) - } - - fn lookup_variants( - &self, - variants: &[VariantSpec], - ) -> Result, RuntimeError> { - scan_delimited_variants(self, variants) - } -} - -impl VcfBackend { - fn backend_name(&self) -> &'static str { - "vcf" - } - - fn get(&self, rsid: &str) -> Result, RuntimeError> { - let results = self.lookup_variants(&[VariantSpec { - rsids: vec![rsid.to_owned()], - ..VariantSpec::default() - }])?; - Ok(results.into_iter().next().and_then(|obs| obs.genotype)) - } - - fn lookup_variant(&self, variant: &VariantSpec) -> Result { - let mut results = self.lookup_variants(std::slice::from_ref(variant))?; - Ok(results.pop().unwrap_or_default()) - } - - fn lookup_variants( - &self, - variants: &[VariantSpec], - ) -> Result, RuntimeError> { - if let Some(results) = lookup_indexed_vcf_variants(self, variants)? { - return Ok(results); + match (target_ref, target_alt) { + (Some(r), Some(a)) => obs + .evidence + .iter() + .any(|line| line.contains(r) && line.contains(a)), + _ => true, } - scan_vcf_variants(self, variants) - } + }) } #[cfg(test)] diff --git a/rust/bioscript-formats/src/genotype/backends.rs b/rust/bioscript-formats/src/genotype/backends.rs new file mode 100644 index 0000000..b2e70cd --- /dev/null +++ b/rust/bioscript-formats/src/genotype/backends.rs @@ -0,0 +1,108 @@ +use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; + +use super::{ + lookup_indexed_vcf_variants, scan_delimited_variants, scan_vcf_variants, + types::{DelimitedBackend, GenotypeSourceFormat, RsidMapBackend, VcfBackend}, +}; + +impl RsidMapBackend { + pub(super) fn backend_name(&self) -> &'static str { + match self.format { + GenotypeSourceFormat::Text => "text", + GenotypeSourceFormat::Zip => "zip", + GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Cram => "cram", + GenotypeSourceFormat::Bam => "bam", + } + } + + pub(super) fn lookup_variant( + &self, + variant: &VariantSpec, + ) -> Result { + for rsid in &variant.rsids { + if let Some(value) = self.values.get(rsid) { + return Ok(VariantObservation { + backend: self.backend_name().to_owned(), + matched_rsid: Some(rsid.clone()), + genotype: Some(value.clone()), + evidence: vec![format!("resolved by rsid {rsid}")], + ..VariantObservation::default() + }); + } + } + + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + evidence: vec!["no matching rsid found".to_owned()], + ..VariantObservation::default() + }) + } +} + +impl DelimitedBackend { + pub(super) fn backend_name(&self) -> &'static str { + match self.format { + GenotypeSourceFormat::Text => "text", + GenotypeSourceFormat::Zip => "zip", + GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Cram => "cram", + GenotypeSourceFormat::Bam => "bam", + } + } + + pub(super) fn get(&self, rsid: &str) -> Result, RuntimeError> { + let results = self.lookup_variants(&[VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }])?; + Ok(results.into_iter().next().and_then(|obs| obs.genotype)) + } + + pub(super) fn lookup_variant( + &self, + variant: &VariantSpec, + ) -> Result { + let mut results = self.lookup_variants(std::slice::from_ref(variant))?; + Ok(results.pop().unwrap_or_default()) + } + + pub(super) fn lookup_variants( + &self, + variants: &[VariantSpec], + ) -> Result, RuntimeError> { + scan_delimited_variants(self, variants) + } +} + +impl VcfBackend { + pub(super) fn backend_name(&self) -> &'static str { + "vcf" + } + + pub(super) fn get(&self, rsid: &str) -> Result, RuntimeError> { + let results = self.lookup_variants(&[VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }])?; + Ok(results.into_iter().next().and_then(|obs| obs.genotype)) + } + + pub(super) fn lookup_variant( + &self, + variant: &VariantSpec, + ) -> Result { + let mut results = self.lookup_variants(std::slice::from_ref(variant))?; + Ok(results.pop().unwrap_or_default()) + } + + pub(super) fn lookup_variants( + &self, + variants: &[VariantSpec], + ) -> Result, RuntimeError> { + if let Some(results) = lookup_indexed_vcf_variants(self, variants)? { + return Ok(results); + } + scan_vcf_variants(self, variants) + } +} diff --git a/rust/bioscript-formats/src/genotype/cram_backend/reader.rs b/rust/bioscript-formats/src/genotype/cram_backend/reader.rs index 82524da..c8a52ca 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend/reader.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend/reader.rs @@ -32,7 +32,11 @@ pub fn observe_cram_snp_with_reader( matched_rsid: Option, assembly: Option, ) -> Result { - let pileup = snp_pileup_with_reader(reader, label, locus, reference, alternate, false)?; + // Reader-based callers (wasm) have no way to surface a CLI flag for + // strict MD5 checking and the rust CLI report flow effectively defaults + // to lenient when an FASTA has subtle masking/case differences. Match + // that behavior here. + let pileup = snp_pileup_with_reader(reader, label, locus, reference, alternate, true)?; let ref_count = pileup.filtered_ref_count; let alt_count = pileup.filtered_alt_count; let depth = pileup.filtered_depth; diff --git a/rust/bioscript-formats/src/genotype/types.rs b/rust/bioscript-formats/src/genotype/types.rs index 310066c..b65d47b 100644 --- a/rust/bioscript-formats/src/genotype/types.rs +++ b/rust/bioscript-formats/src/genotype/types.rs @@ -1,6 +1,6 @@ use std::{collections::HashMap, path::PathBuf, str::FromStr}; -use bioscript_core::Assembly; +use bioscript_core::{Assembly, VariantObservation}; use crate::inspect::InferredSex; @@ -15,6 +15,18 @@ pub(crate) enum QueryBackend { Delimited(DelimitedBackend), Vcf(VcfBackend), Cram(CramBackend), + /// Pre-resolved observations layered on top of any other backend. + /// Variant lookups consult `observations` first (matched by rsid OR by + /// chrom+pos+ref+alt). On miss, falls back to `fallback`. This is the + /// abstraction that lets the report pipeline collect every observation + /// up-front in `run_manifest_rows` and have the analysis Python scripts' + /// `genotypes.lookup_variants(plan)` calls resolve from the cache without + /// re-walking the underlying genome — works identically on CLI (path + /// fallback) and wasm (rsid-map empty fallback). + Cached { + observations: Vec, + fallback: Box, + }, } #[derive(Debug, Clone)] diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index 6da81fd..cf8fa57 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -20,7 +20,7 @@ use super::{ mod matching; mod reader; -use matching::imputed_reference_observation; +pub use matching::imputed_reference_observation; pub(crate) use matching::{ choose_variant_locus_for_assembly, normalize_chromosome_name, vcf_row_matches_variant, }; diff --git a/rust/bioscript-formats/src/genotype/vcf/matching.rs b/rust/bioscript-formats/src/genotype/vcf/matching.rs index 2d5bc1d..e633cb7 100644 --- a/rust/bioscript-formats/src/genotype/vcf/matching.rs +++ b/rust/bioscript-formats/src/genotype/vcf/matching.rs @@ -22,7 +22,7 @@ pub(super) fn first_single_base_allele(value: Option<&str>) -> Option { chars.next().is_none().then_some(base) } -pub(super) fn imputed_reference_observation( +pub fn imputed_reference_observation( backend_name: &str, label: &str, variant: &VariantSpec, diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index e38057b..b01d3b7 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -15,7 +15,8 @@ mod prepare; pub use genotype::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, - observe_cram_indel_with_reader, observe_cram_snp_with_reader, observe_vcf_snp_with_reader, + imputed_reference_observation, observe_cram_indel_with_reader, observe_cram_snp_with_reader, + observe_vcf_snp_with_reader, observe_vcf_variant_with_reader, }; pub use inspect::{ DetectedKind, DetectionConfidence, FileContainer, FileInspection, InferredSex, InspectOptions, diff --git a/rust/bioscript-runtime/src/runtime/methods.rs b/rust/bioscript-runtime/src/runtime/methods.rs index 89279ca..1ee58d9 100644 --- a/rust/bioscript-runtime/src/runtime/methods.rs +++ b/rust/bioscript-runtime/src/runtime/methods.rs @@ -38,7 +38,7 @@ impl BioscriptRuntime { "bioscript.load_genotypes", )?)?; let loader = self.resolved_loader_options()?; - let store = if let Some(bytes) = self.read_virtual_binary_file(&path) { + let inner_store = if let Some(bytes) = self.read_virtual_binary_file(&path) { GenotypeStore::from_bytes( path.file_name() .and_then(|value| value.to_str()) @@ -48,6 +48,20 @@ impl BioscriptRuntime { } else { GenotypeStore::from_file_with_options(&path, &loader)? }; + // Layer pre-resolved observations on top of whatever backend the path + // resolves to. The report pipeline collects every variant the panel + // declares before running analyses, so `genotypes.lookup_variants(...)` + // hits the cache and we don't re-walk the genome inside Python. On a + // miss (script asks about a novel rsid) we fall through to the inner + // store — same behavior as before this cache existed. + let store = if self.config.preloaded_observations.is_empty() { + inner_store + } else { + GenotypeStore::with_cached_observations( + self.config.preloaded_observations.clone(), + inner_store, + ) + }; let handle = self.state.next_handle(); self.state .genotype_files diff --git a/rust/bioscript-runtime/src/runtime/state.rs b/rust/bioscript-runtime/src/runtime/state.rs index 76f8854..94adc87 100644 --- a/rust/bioscript-runtime/src/runtime/state.rs +++ b/rust/bioscript-runtime/src/runtime/state.rs @@ -7,6 +7,7 @@ use std::{ time::Duration, }; +use bioscript_core::VariantObservation; use bioscript_formats::{GenotypeLoadOptions, GenotypeStore}; use monty::{MontyException, ResourceLimits}; @@ -18,6 +19,12 @@ pub struct RuntimeConfig { pub loader: GenotypeLoadOptions, pub virtual_binary_files: BTreeMap>, pub virtual_text_files: BTreeMap, + /// Observations the host has already resolved before invoking the + /// runtime — `bioscript.load_genotypes(...)` wraps the underlying store + /// in a cache layered on these so analysis Python scripts' + /// `genotypes.lookup_variants(plan)` calls hit the cache first and only + /// fall through to the store for novel rsids the panel didn't cover. + pub preloaded_observations: Vec, } impl Default for RuntimeConfig { @@ -33,6 +40,7 @@ impl Default for RuntimeConfig { loader: GenotypeLoadOptions::default(), virtual_binary_files: BTreeMap::new(), virtual_text_files: BTreeMap::new(), + preloaded_observations: Vec::new(), } } } diff --git a/rust/bioscript-wasm/src/inspect_api.rs b/rust/bioscript-wasm/src/inspect_api.rs index 2508d2c..429e102 100644 --- a/rust/bioscript-wasm/src/inspect_api.rs +++ b/rust/bioscript-wasm/src/inspect_api.rs @@ -1,8 +1,8 @@ use std::path::PathBuf; use bioscript_formats::{ - inspect_bytes as inspect_bytes_rs, DetectedKind, DetectionConfidence, FileContainer, - FileInspection, InspectOptions, SourceMetadata, + DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions, + SourceMetadata, inspect_bytes as inspect_bytes_rs, }; use bioscript_schema::resolve_remote_resource_text as resolve_remote_resource_text_rs; use serde::{Deserialize, Serialize}; diff --git a/rust/bioscript-wasm/src/lookup_api.rs b/rust/bioscript-wasm/src/lookup_api.rs index 5483eb5..a52297b 100644 --- a/rust/bioscript-wasm/src/lookup_api.rs +++ b/rust/bioscript-wasm/src/lookup_api.rs @@ -2,8 +2,8 @@ use std::io::BufReader; use bioscript_core::{GenomicLocus, VariantKind, VariantObservation, VariantSpec}; use bioscript_formats::{ - alignment, observe_cram_indel_with_reader, observe_cram_snp_with_reader, - observe_vcf_snp_with_reader, GenotypeStore, + GenotypeStore, alignment, observe_cram_indel_with_reader, observe_cram_snp_with_reader, + observe_vcf_snp_with_reader, }; use noodles::csi as noodles_csi; use serde::{Deserialize, Serialize}; diff --git a/rust/bioscript-wasm/src/package_api.rs b/rust/bioscript-wasm/src/package_api.rs index 6d81ada..d0a7c85 100644 --- a/rust/bioscript-wasm/src/package_api.rs +++ b/rust/bioscript-wasm/src/package_api.rs @@ -5,7 +5,7 @@ use std::{ }; use bioscript_schema::{ - resolve_remote_resource_text, RemoteResourceKind, RemoteResourceResolution, + RemoteResourceKind, RemoteResourceResolution, resolve_remote_resource_text, }; use serde::Serialize; use sha2::{Digest, Sha256}; diff --git a/rust/bioscript-wasm/src/report_api.rs b/rust/bioscript-wasm/src/report_api.rs index 682ef41..2c33a32 100644 --- a/rust/bioscript-wasm/src/report_api.rs +++ b/rust/bioscript-wasm/src/report_api.rs @@ -5,7 +5,10 @@ use std::{ time::Duration, }; -use bioscript_core::{Assembly, OBSERVATION_TSV_HEADERS, VariantObservation}; +use bioscript_core::{ + Assembly, GenomicLocus, OBSERVATION_TSV_HEADERS, RuntimeError, VariantKind, VariantObservation, + VariantSpec, +}; use bioscript_formats::{ GenotypeLoadOptions, GenotypeStore, InferredSex, InspectOptions, SexDetectionConfidence, SexInference, inspect_bytes as inspect_bytes_rs, @@ -19,15 +22,20 @@ use monty::{MontyObject, ResourceLimits}; use serde::{Deserialize, Serialize}; use wasm_bindgen::prelude::*; -#[path = "report_render.rs"] -mod report_render; #[path = "report_helpers.rs"] mod report_helpers; +#[path = "report_lookup.rs"] +mod report_lookup; +#[path = "report_render.rs"] +mod report_render; #[path = "report_workspace.rs"] mod report_workspace; use report_helpers::*; -use report_render::{app_report_json, match_app_findings, render_app_html_document, AppReportJsonInput}; +use report_lookup::{CramReportLookup, VcfReportLookup}; +use report_render::{ + AppReportJsonInput, app_report_json, match_app_findings, render_app_html_document, +}; use report_workspace::PackageWorkspace; include!("../../bioscript-cli/src/report_matching.rs"); @@ -113,8 +121,10 @@ pub fn run_package_report_bytes( .map(|inference| inference.sex); let store = GenotypeStore::from_bytes(input_name, input_bytes) .map_err(|err| JsError::new(&format!("load genotypes failed: {err:?}")))?; - let rows = workspace.run_manifest_rows(manifest_path, &store, &participant_id, &options.filters)?; - let observations = rows + let manifest_output = + workspace.run_manifest_rows(manifest_path, &store, &participant_id, &options.filters)?; + let observations = manifest_output + .rows .iter() .map(|row| { workspace.app_observation_from_manifest_row( @@ -129,6 +139,7 @@ pub fn run_package_report_bytes( manifest_path, input_name, input_bytes, + &manifest_output.observations, &participant_id, &loader, &options, @@ -154,7 +165,132 @@ pub fn run_package_report_bytes( ); serde_json::to_string(&ReportRunOutput { artifacts: vec![ - artifact("observations.tsv", "text/tab-separated-values", observations_tsv), + artifact( + "observations.tsv", + "text/tab-separated-values", + observations_tsv, + ), + artifact("analysis.jsonl", "application/jsonl", analysis_jsonl), + artifact("reports.jsonl", "application/jsonl", reports_jsonl), + artifact("index.html", "text/html", html), + ], + duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128, + text_output, + }) + .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) +} + +/// Mirrors `runPackageReportBytes` but for CRAM input. The CRAM body and +/// FASTA reference are streamed via JS-supplied `readAt` callbacks so the +/// browser doesn't have to load multi-GB genomes into wasm memory. The CRAI +/// and FAI indexes are passed inline. +/// +/// Analyses run against the observations produced from the CRAM lookup. The +/// per-script Python interpreter still receives `input_bytes` as a virtual +/// file; for CRAM that's an empty buffer because typical PGx analysis scripts +/// (apoe, mthfr, apol1, …) read observation rows rather than raw genome bytes. +#[wasm_bindgen(js_name = runPackageReportFromCram)] +#[allow(clippy::too_many_arguments)] +pub fn run_package_report_from_cram( + manifest_path: &str, + package_files_json: &str, + input_name: &str, + cram_read_at: js_sys::Function, + cram_len: f64, + crai_bytes: &[u8], + fasta_read_at: js_sys::Function, + fasta_len: f64, + fai_bytes: &[u8], + options_json: Option, +) -> Result { + use crate::js_reader::JsReader; + use std::io::BufReader; + let started_ms = js_sys::Date::now(); + let package_files: Vec = serde_json::from_str(package_files_json) + .map_err(|err| JsError::new(&format!("invalid package files JSON: {err}")))?; + let options = match options_json { + Some(text) if !text.is_empty() => serde_json::from_str(&text) + .map_err(|err| JsError::new(&format!("invalid report options JSON: {err}")))?, + _ => ReportOptionsInput::default(), + }; + let workspace = PackageWorkspace::new(package_files)?; + let participant_id = participant_id_from_name(input_name); + let assay_id = app_assay_id(Path::new(manifest_path))?; + let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; + let findings = workspace.load_manifest_findings(manifest_path)?; + let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + + let crai_index = bioscript_formats::alignment::parse_crai_bytes(crai_bytes) + .map_err(|err| JsError::new(&format!("parse crai: {err:?}")))?; + let fai_index = bioscript_formats::alignment::parse_fai_bytes(fai_bytes) + .map_err(|err| JsError::new(&format!("parse fai: {err:?}")))?; + let fasta_reader = BufReader::new(JsReader::new(fasta_read_at, fasta_len as u64, "fasta")); + let repository = bioscript_formats::alignment::build_reference_repository_from_readers( + fasta_reader, + fai_index, + ); + let cram_reader = JsReader::new(cram_read_at, cram_len as u64, "cram"); + let indexed = bioscript_formats::alignment::build_cram_indexed_reader_from_reader( + cram_reader, + crai_index, + repository, + ) + .map_err(|err| JsError::new(&format!("build cram reader: {err:?}")))?; + + let lookup = CramReportLookup { + reader: std::cell::RefCell::new(indexed), + label: input_name.to_owned(), + }; + + let mut loader = GenotypeLoadOptions::default(); + loader.format = Some(bioscript_formats::GenotypeSourceFormat::Cram); + loader.allow_reference_md5_mismatch = true; + let manifest_output = + workspace.run_manifest_rows(manifest_path, &lookup, &participant_id, &options.filters)?; + let observations = manifest_output + .rows + .iter() + .map(|row| workspace.app_observation_from_manifest_row(row, &assay_id, None, None)) + .collect::, _>>()?; + // Analysis scripts call `bioscript.load_genotypes(input_file)` then rsid + // lookups via `genotypes.lookup_variants(plan)`. The runtime now layers a + // pre-resolved-observation cache over whatever the input file resolves + // to (Plan B in genotype/types.rs:QueryBackend::Cached), so for CRAM the + // cache hits and we skip re-walking the genome. The input bytes can be + // empty since every spec the panel/assays declared is in the cache. + let analyses = workspace.run_manifest_analyses( + manifest_path, + input_name, + &[], + &manifest_output.observations, + &participant_id, + &loader, + &options, + )?; + let matched_findings = match_app_findings(&findings, &observations, &analyses); + let reports = vec![app_report_json(AppReportJsonInput { + assay_id: &assay_id, + participant_id: &participant_id, + input_file_name: input_name, + observations: &observations, + analyses: &analyses, + findings: &matched_findings, + provenance: &provenance, + input_inspection: None, + manifest_metadata: &manifest_metadata, + })]; + let observations_tsv = render_app_observations_tsv(&observations)?; + let analysis_jsonl = render_jsonl(&analyses)?; + let reports_jsonl = render_jsonl(&reports)?; + let html = render_app_html_document(&observations, &reports)?; + let text_output = "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n".to_owned(); + serde_json::to_string(&ReportRunOutput { + artifacts: vec![ + artifact( + "observations.tsv", + "text/tab-separated-values", + observations_tsv, + ), artifact("analysis.jsonl", "application/jsonl", analysis_jsonl), artifact("reports.jsonl", "application/jsonl", reports_jsonl), artifact("index.html", "text/html", html), @@ -165,3 +301,98 @@ pub fn run_package_report_bytes( .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) } +/// Mirrors `runPackageReportBytes` but for a bgzipped, tabix-indexed VCF +/// streamed via JS-supplied `readAt` callbacks. The TBI is passed inline. +#[wasm_bindgen(js_name = runPackageReportFromVcf)] +#[allow(clippy::too_many_arguments)] +pub fn run_package_report_from_vcf( + manifest_path: &str, + package_files_json: &str, + input_name: &str, + vcf_read_at: js_sys::Function, + vcf_len: f64, + tbi_bytes: &[u8], + options_json: Option, +) -> Result { + use crate::js_reader::JsReader; + let started_ms = js_sys::Date::now(); + let package_files: Vec = serde_json::from_str(package_files_json) + .map_err(|err| JsError::new(&format!("invalid package files JSON: {err}")))?; + let options = match options_json { + Some(text) if !text.is_empty() => serde_json::from_str(&text) + .map_err(|err| JsError::new(&format!("invalid report options JSON: {err}")))?, + _ => ReportOptionsInput::default(), + }; + let workspace = PackageWorkspace::new(package_files)?; + let participant_id = participant_id_from_name(input_name); + let assay_id = app_assay_id(Path::new(manifest_path))?; + let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; + let findings = workspace.load_manifest_findings(manifest_path)?; + let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + + let tabix_index = bioscript_formats::alignment::parse_tbi_bytes(tbi_bytes) + .map_err(|err| JsError::new(&format!("parse tbi: {err:?}")))?; + let vcf_reader = JsReader::new(vcf_read_at, vcf_len as u64, "vcf"); + let indexed = noodles::csi::io::IndexedReader::new(vcf_reader, tabix_index); + + let lookup = VcfReportLookup { + reader: std::cell::RefCell::new(indexed), + label: input_name.to_owned(), + }; + + let mut loader = GenotypeLoadOptions::default(); + loader.format = Some(bioscript_formats::GenotypeSourceFormat::Vcf); + let manifest_output = + workspace.run_manifest_rows(manifest_path, &lookup, &participant_id, &options.filters)?; + let observations = manifest_output + .rows + .iter() + .map(|row| workspace.app_observation_from_manifest_row(row, &assay_id, None, None)) + .collect::, _>>()?; + // Pre-resolved observation cache replaces the synth approach: analysis + // scripts hit the cache via QueryBackend::Cached and skip re-opening the + // VCF. See report_api.rs:run_package_report_from_cram for the same + // pattern and bioscript-formats::genotype::types::QueryBackend::Cached + // for the dispatch. + let analyses = workspace.run_manifest_analyses( + manifest_path, + input_name, + &[], + &manifest_output.observations, + &participant_id, + &loader, + &options, + )?; + let matched_findings = match_app_findings(&findings, &observations, &analyses); + let reports = vec![app_report_json(AppReportJsonInput { + assay_id: &assay_id, + participant_id: &participant_id, + input_file_name: input_name, + observations: &observations, + analyses: &analyses, + findings: &matched_findings, + provenance: &provenance, + input_inspection: None, + manifest_metadata: &manifest_metadata, + })]; + let observations_tsv = render_app_observations_tsv(&observations)?; + let analysis_jsonl = render_jsonl(&analyses)?; + let reports_jsonl = render_jsonl(&reports)?; + let html = render_app_html_document(&observations, &reports)?; + let text_output = "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n".to_owned(); + serde_json::to_string(&ReportRunOutput { + artifacts: vec![ + artifact( + "observations.tsv", + "text/tab-separated-values", + observations_tsv, + ), + artifact("analysis.jsonl", "application/jsonl", analysis_jsonl), + artifact("reports.jsonl", "application/jsonl", reports_jsonl), + artifact("index.html", "text/html", html), + ], + duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128, + text_output, + }) + .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) +} diff --git a/rust/bioscript-wasm/src/report_helpers.rs b/rust/bioscript-wasm/src/report_helpers.rs index dc91706..ca7e284 100644 --- a/rust/bioscript-wasm/src/report_helpers.rs +++ b/rust/bioscript-wasm/src/report_helpers.rs @@ -23,18 +23,50 @@ pub(super) fn variant_row( row.insert("tags".to_owned(), tags.join(",")); row.insert("backend".to_owned(), observation.backend.clone()); row.insert("participant_id".to_owned(), participant_id.to_owned()); - row.insert("matched_rsid".to_owned(), observation.matched_rsid.clone().unwrap_or_default()); - row.insert("assembly".to_owned(), observation.assembly.map(assembly_row_value).unwrap_or_default()); - row.insert("genotype".to_owned(), observation.genotype.clone().unwrap_or_default()); - row.insert("ref_count".to_owned(), observation.ref_count.map_or_else(String::new, |value| value.to_string())); - row.insert("alt_count".to_owned(), observation.alt_count.map_or_else(String::new, |value| value.to_string())); - row.insert("depth".to_owned(), observation.depth.map_or_else(String::new, |value| value.to_string())); - row.insert("raw_counts".to_owned(), serde_json::to_string(&observation.raw_counts).unwrap_or_default()); + row.insert( + "matched_rsid".to_owned(), + observation.matched_rsid.clone().unwrap_or_default(), + ); + row.insert( + "assembly".to_owned(), + observation + .assembly + .map(assembly_row_value) + .unwrap_or_default(), + ); + row.insert( + "genotype".to_owned(), + observation.genotype.clone().unwrap_or_default(), + ); + row.insert( + "ref_count".to_owned(), + observation + .ref_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "alt_count".to_owned(), + observation + .alt_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "depth".to_owned(), + observation + .depth + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "raw_counts".to_owned(), + serde_json::to_string(&observation.raw_counts).unwrap_or_default(), + ); row.insert("evidence".to_owned(), observation.evidence.join(" | ")); row } -pub(super) fn render_app_observations_tsv(observations: &[serde_json::Value]) -> Result { +pub(super) fn render_app_observations_tsv( + observations: &[serde_json::Value], +) -> Result { let mut out = OBSERVATION_TSV_HEADERS.join("\t"); out.push('\n'); for observation in observations { @@ -94,7 +126,12 @@ pub(super) fn app_assay_id(path: &Path) -> Result { path.file_stem() .and_then(|value| value.to_str()) .map(ToOwned::to_owned) - .ok_or_else(|| JsError::new(&format!("failed to derive assay id from {}", path.display()))) + .ok_or_else(|| { + JsError::new(&format!( + "failed to derive assay id from {}", + path.display() + )) + }) } pub(super) fn matches_filters(manifest: &VariantManifest, path: &str, filters: &[String]) -> bool { @@ -134,8 +171,9 @@ pub(super) fn parse_analysis_output_text( "jsonl" => { let mut rows: Vec = Vec::new(); for line in text.lines().filter(|line| !line.trim().is_empty()) { - rows.push(serde_json::from_str(line) - .map_err(|err| JsError::new(&format!("failed to parse analysis JSONL: {err}")))?); + rows.push(serde_json::from_str(line).map_err(|err| { + JsError::new(&format!("failed to parse analysis JSONL: {err}")) + })?); } let row_headers = rows .iter() @@ -144,7 +182,9 @@ pub(super) fn parse_analysis_output_text( .unwrap_or_default(); Ok((rows, row_headers)) } - other => Err(JsError::new(&format!("unsupported analysis output_format '{other}'"))), + other => Err(JsError::new(&format!( + "unsupported analysis output_format '{other}'" + ))), } } @@ -164,7 +204,9 @@ fn parse_analysis_tsv(text: &str) -> (Vec, Vec) { .map(|(index, header)| { ( header.clone(), - serde_json::Value::String(fields.get(index).copied().unwrap_or_default().to_owned()), + serde_json::Value::String( + fields.get(index).copied().unwrap_or_default().to_owned(), + ), ) }) .collect(); @@ -175,7 +217,8 @@ fn parse_analysis_tsv(text: &str) -> (Vec, Vec) { } pub(super) fn yaml_to_json(value: serde_yaml::Value) -> Result { - serde_json::to_value(value).map_err(|err| JsError::new(&format!("failed to convert YAML to JSON: {err}"))) + serde_json::to_value(value) + .map_err(|err| JsError::new(&format!("failed to convert YAML to JSON: {err}"))) } pub(super) fn collect_manifest_provenance_entries( @@ -203,7 +246,9 @@ pub(super) fn collect_manifest_provenance_entries( Ok(()) } -pub(super) fn input_inspection_json(inspection: &bioscript_formats::FileInspection) -> serde_json::Value { +pub(super) fn input_inspection_json( + inspection: &bioscript_formats::FileInspection, +) -> serde_json::Value { serde_json::json!({ "container": match inspection.container { bioscript_formats::FileContainer::Plain => "plain", @@ -246,7 +291,10 @@ pub(super) fn input_inspection_json(inspection: &bioscript_formats::FileInspecti } pub(super) fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option { - value.get(key).and_then(serde_yaml::Value::as_str).map(ToOwned::to_owned) + value + .get(key) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) } pub(super) fn yaml_string_sequence(value: &serde_yaml::Value, key: &str) -> Vec { @@ -286,7 +334,10 @@ pub(super) fn variant_primary_source_from_yaml(value: &serde_yaml::Value) -> ser "fields": ["identifiers.rsids"], }); } - links.into_values().next().unwrap_or(serde_json::Value::Null) + links + .into_values() + .next() + .unwrap_or(serde_json::Value::Null) } pub(super) fn normalize_app_genotype( @@ -305,7 +356,10 @@ pub(super) fn normalize_app_genotype( } let ref_ch = ref_allele.chars().next().unwrap_or_default(); let alt_ch = alt_allele.chars().next().unwrap_or_default(); - if is_confident_male_sex_chromosome(chrom, inferred_sex) && alleles.len() == 2 && alleles[0] == alleles[1] { + if is_confident_male_sex_chromosome(chrom, inferred_sex) + && alleles.len() == 2 + && alleles[0] == alleles[1] + { let allele = alleles[0]; if allele == ref_ch { return ("0".to_owned(), "hem_ref".to_owned()); @@ -329,11 +383,18 @@ pub(super) fn normalize_app_genotype( fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { matches!( - chrom.trim().trim_start_matches("chr").to_ascii_uppercase().as_str(), + chrom + .trim() + .trim_start_matches("chr") + .to_ascii_uppercase() + .as_str(), "X" | "Y" | "23" | "24" ) && inferred_sex.is_some_and(|sex| { sex.sex == InferredSex::Male - && matches!(sex.confidence, SexDetectionConfidence::High | SexDetectionConfidence::Medium) + && matches!( + sex.confidence, + SexDetectionConfidence::High | SexDetectionConfidence::Medium + ) }) } diff --git a/rust/bioscript-wasm/src/report_lookup.rs b/rust/bioscript-wasm/src/report_lookup.rs new file mode 100644 index 0000000..dee827b --- /dev/null +++ b/rust/bioscript-wasm/src/report_lookup.rs @@ -0,0 +1,268 @@ +use super::*; + +/// Per-variant CRAM lookup that satisfies the workspace's `VariantLookup` +/// trait. Holds the IndexedReader in a `RefCell` so &self lookup methods can +/// mutably read while still being object-safe. +pub(super) struct CramReportLookup { + pub(super) reader: std::cell::RefCell>, + pub(super) label: String, +} + +impl report_workspace::VariantLookup for CramReportLookup { + fn lookup_variant(&self, spec: &VariantSpec) -> Result { + let mut reader = self.reader.borrow_mut(); + observe_cram_variant(&mut reader, &self.label, spec) + } + + fn lookup_variants( + &self, + specs: &[VariantSpec], + ) -> Result, RuntimeError> { + let mut reader = self.reader.borrow_mut(); + let mut out = Vec::with_capacity(specs.len()); + for spec in specs { + out.push(observe_cram_variant(&mut reader, &self.label, spec)?); + } + Ok(out) + } +} + +/// Build a minimal 23andMe-style text from the observations we already +/// computed. Format: `rsid\tchrom\tpos\tgenotype` per line. The runtime's +/// delimited-text loader reads this back as a `RsidMap`/`Delimited` backend +/// so analysis scripts can call `bioscript.load_genotypes(input_file)` and +/// have rsid lookups answered from the cached table. +#[allow(dead_code)] +fn synthesize_genotype_text_from_observations(observations: &[serde_json::Value]) -> String { + let mut out = String::from("# rsid\tchromosome\tposition\tgenotype\n"); + for observation in observations { + let rsid = observation + .get("rsid") + .and_then(serde_json::Value::as_str) + .unwrap_or(""); + if rsid.is_empty() { + continue; + } + let chrom = observation + .get("chrom") + .and_then(serde_json::Value::as_str) + .unwrap_or(""); + let pos = observation + .get("pos_start") + .and_then(|v| { + v.as_i64() + .or_else(|| v.as_str().and_then(|s| s.parse::().ok())) + }) + .unwrap_or(0); + let genotype = observation + .get("genotype_display") + .and_then(serde_json::Value::as_str) + .filter(|s| !s.is_empty() && *s != "??") + .unwrap_or("--"); + out.push_str(&format!("{rsid}\t{chrom}\t{pos}\t{genotype}\n")); + } + out +} + +fn observe_cram_variant( + reader: &mut noodles::cram::io::indexed_reader::IndexedReader, + label: &str, + variant: &VariantSpec, +) -> Result { + let assembly = variant + .grch38 + .as_ref() + .map(|_| Assembly::Grch38) + .or_else(|| variant.grch37.as_ref().map(|_| Assembly::Grch37)); + let locus = variant + .grch38 + .as_ref() + .or(variant.grch37.as_ref()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} has no GRCh37/GRCh38 locus", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let locus = GenomicLocus { + chrom: locus.chrom.clone(), + start: locus.start, + end: locus.end, + }; + let kind = variant.kind.unwrap_or(VariantKind::Snp); + match kind { + VariantKind::Snp => { + let ref_char = variant + .reference + .as_deref() + .and_then(|s| s.chars().next()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing reference allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let alt_char = variant + .alternate + .as_deref() + .and_then(|s| s.chars().next()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing alternate allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + bioscript_formats::observe_cram_snp_with_reader( + reader, + label, + &locus, + ref_char, + alt_char, + variant.rsids.first().cloned(), + assembly, + ) + } + VariantKind::Insertion | VariantKind::Indel => { + let reference = variant.reference.as_deref().ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing reference allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let alternate = variant.alternate.as_deref().ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing alternate allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + bioscript_formats::observe_cram_indel_with_reader( + reader, + label, + &locus, + reference, + alternate, + variant.rsids.first().cloned(), + assembly, + ) + } + other => Err(RuntimeError::Io(format!( + "variant {} kind {:?} not supported on CRAM via wasm", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant"), + other + ))), + } +} + +pub(super) struct VcfReportLookup { + pub(super) reader: std::cell::RefCell< + noodles::csi::io::IndexedReader, noodles::tabix::Index>, + >, + pub(super) label: String, +} + +impl report_workspace::VariantLookup for VcfReportLookup { + fn lookup_variant(&self, spec: &VariantSpec) -> Result { + let mut reader = self.reader.borrow_mut(); + observe_vcf_variant(&mut reader, &self.label, spec) + } + + fn lookup_variants( + &self, + specs: &[VariantSpec], + ) -> Result, RuntimeError> { + let mut reader = self.reader.borrow_mut(); + let mut out = Vec::with_capacity(specs.len()); + for spec in specs { + out.push(observe_vcf_variant(&mut reader, &self.label, spec)?); + } + Ok(out) + } +} + +fn observe_vcf_variant( + reader: &mut noodles::csi::io::IndexedReader< + noodles::bgzf::io::Reader, + noodles::tabix::Index, + >, + label: &str, + variant: &VariantSpec, +) -> Result { + let assembly = variant + .grch38 + .as_ref() + .map(|_| Assembly::Grch38) + .or_else(|| variant.grch37.as_ref().map(|_| Assembly::Grch37)); + let raw_locus = variant + .grch38 + .as_ref() + .or(variant.grch37.as_ref()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} has no GRCh37/GRCh38 locus", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let locus = GenomicLocus { + chrom: raw_locus.chrom.clone(), + start: raw_locus.start, + end: raw_locus.end, + }; + let observation = bioscript_formats::observe_vcf_variant_with_reader( + reader, + label, + &locus, + variant, + variant.rsids.first().cloned(), + assembly, + )?; + // Mirror the CLI report flow's `impute_vcf_missing_as_reference: true` + // default: when the VCF has no record at this locus, treat the genotype + // as homozygous reference. + if observation.genotype.is_none() + && observation + .evidence + .iter() + .any(|line| line.contains("no VCF record at")) + { + if let Some(imputed) = bioscript_formats::imputed_reference_observation( + "vcf", + label, + variant, + &locus, + assembly, + None, + &observation.evidence.join(" | "), + ) { + return Ok(imputed); + } + } + Ok(observation) +} diff --git a/rust/bioscript-wasm/src/report_render.rs b/rust/bioscript-wasm/src/report_render.rs index dc8f914..9d98988 100644 --- a/rust/bioscript-wasm/src/report_render.rs +++ b/rust/bioscript-wasm/src/report_render.rs @@ -17,7 +17,9 @@ pub(super) fn app_report_json(input: AppReportJsonInput<'_>) -> serde_json::Valu let called = input .observations .iter() - .filter(|item| item.get("call_status").and_then(serde_json::Value::as_str) == Some("called")) + .filter(|item| { + item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") + }) .count(); serde_json::json!({ "schema": "bioscript:report:1.0", @@ -121,7 +123,14 @@ pub(super) fn render_app_html_document( let analysis_outputs = collect_report_analyses(reports); let participants = collect_report_participants(reports); render_report_manifest_header(&mut out, reports); - let _ = write!(out, "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", observations.len(), analysis_outputs.len(), label_findings.len(), summary_findings.len()); + let _ = write!( + out, + "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", + observations.len(), + analysis_outputs.len(), + label_findings.len(), + summary_findings.len() + ); render_participant_filter(&mut out, &participants); out.push_str(""); out.push_str("

Input

"); @@ -129,7 +138,12 @@ pub(super) fn render_app_html_document( out.push_str("

Observations

"); render_observation_table(&mut out, observations, participants.len() > 1); out.push_str("

Analysis

"); - render_analysis_tables(&mut out, &analysis_outputs, observations, participants.len() > 1); + render_analysis_tables( + &mut out, + &analysis_outputs, + observations, + participants.len() > 1, + ); out.push_str("

PGx

"); render_pgx_table(&mut out, &label_findings, &summary_findings); out.push_str("

Provenance

"); @@ -138,7 +152,8 @@ pub(super) fn render_app_html_document( render_report_source_section(&mut out, reports); out.push_str("

Raw Reports JSON

Show raw report JSON"); for report in reports { - let text = serde_json::to_string_pretty(report).map_err(|err| JsError::new(&err.to_string()))?; + let text = + serde_json::to_string_pretty(report).map_err(|err| JsError::new(&err.to_string()))?; let _ = write!(out, "
{}
", html_escape(&text)); } out.push_str("
"); diff --git a/rust/bioscript-wasm/src/report_workspace.rs b/rust/bioscript-wasm/src/report_workspace.rs index 66d5c7e..e075abd 100644 --- a/rust/bioscript-wasm/src/report_workspace.rs +++ b/rust/bioscript-wasm/src/report_workspace.rs @@ -1,5 +1,42 @@ use super::*; +#[path = "report_workspace/analysis.rs"] +mod analysis; + +/// What a manifest row walk produces: human-readable rows for the +/// observation TSV/HTML, and the underlying `VariantObservation`s ready to +/// hand to the analysis runtime as a pre-resolved cache (so analysis Python +/// scripts' `genotypes.lookup_variants(plan)` call hits cache instead of +/// re-walking the genome). +pub(super) struct ManifestRowsOutput { + pub rows: Vec>, + pub observations: Vec, +} + +/// Abstract per-variant observation source so the workspace can run against +/// either a path-based `GenotypeStore` (text/zip — bytes already in memory) +/// or a CRAM/VCF-reader-backed lookup that streams through JS-supplied +/// `readAt` callbacks. +pub(super) trait VariantLookup { + fn lookup_variant(&self, spec: &VariantSpec) -> Result; + fn lookup_variants( + &self, + specs: &[VariantSpec], + ) -> Result, RuntimeError>; +} + +impl VariantLookup for GenotypeStore { + fn lookup_variant(&self, spec: &VariantSpec) -> Result { + GenotypeStore::lookup_variant(self, spec) + } + fn lookup_variants( + &self, + specs: &[VariantSpec], + ) -> Result, RuntimeError> { + GenotypeStore::lookup_variants(self, specs) + } +} + pub(super) struct PackageWorkspace { files: BTreeMap, } @@ -58,39 +95,51 @@ impl PackageWorkspace { pub(super) fn run_manifest_rows( &self, manifest_path: &str, - store: &GenotypeStore, + store: &dyn VariantLookup, participant_id: &str, filters: &[String], - ) -> Result>, JsError> { + ) -> Result { match self.schema(manifest_path)?.as_str() { "bioscript:variant:1.0" | "bioscript:variant" => { let manifest = self.load_variant(manifest_path)?; let observation = store .lookup_variant(&manifest.spec) .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", manifest.name)))?; - Ok(vec![variant_row( + let row = variant_row( manifest_path, &manifest.name, &manifest.tags, &observation, participant_id, - )]) + ); + Ok(ManifestRowsOutput { + rows: vec![row], + observations: vec![observation], + }) + } + "bioscript:panel:1.0" => { + self.run_panel_rows(manifest_path, store, participant_id, filters) + } + "bioscript:assay:1.0" => { + self.run_assay_rows(manifest_path, store, participant_id, filters) } - "bioscript:panel:1.0" => self.run_panel_rows(manifest_path, store, participant_id, filters), - "bioscript:assay:1.0" => self.run_assay_rows(manifest_path, store, participant_id, filters), - other => Err(JsError::new(&format!("unsupported manifest schema '{other}'"))), + other => Err(JsError::new(&format!( + "unsupported manifest schema '{other}'" + ))), } } fn run_panel_rows( &self, manifest_path: &str, - store: &GenotypeStore, + store: &dyn VariantLookup, participant_id: &str, filters: &[String], - ) -> Result>, JsError> { + ) -> Result { let panel = self.load_panel(manifest_path)?; - let mut rows_by_member: Vec>> = vec![Vec::new(); panel.members.len()]; + let mut rows_by_member: Vec>> = + vec![Vec::new(); panel.members.len()]; + let mut all_observations = Vec::::new(); let mut variants = Vec::<(usize, String, VariantManifest)>::new(); for (index, member) in panel.members.iter().enumerate() { let Some(path) = &member.path else { @@ -103,7 +152,10 @@ impl PackageWorkspace { variants.push((index, resolved, variant)); } } else if member.kind == "assay" { - rows_by_member[index] = self.run_assay_rows(&resolved, store, participant_id, filters)?; + let assay_output = + self.run_assay_rows(&resolved, store, participant_id, filters)?; + rows_by_member[index] = assay_output.rows; + all_observations.extend(assay_output.observations); } } let specs = variants @@ -113,7 +165,9 @@ impl PackageWorkspace { let observations = store .lookup_variants(&specs) .map_err(|err| JsError::new(&format!("panel lookup failed: {err:?}")))?; - for ((member_index, resolved, manifest), observation) in variants.into_iter().zip(observations) { + for ((member_index, resolved, manifest), observation) in + variants.into_iter().zip(observations) + { rows_by_member[member_index].push(variant_row( &resolved, &manifest.name, @@ -121,17 +175,21 @@ impl PackageWorkspace { &observation, participant_id, )); + all_observations.push(observation); } - Ok(rows_by_member.into_iter().flatten().collect()) + Ok(ManifestRowsOutput { + rows: rows_by_member.into_iter().flatten().collect(), + observations: all_observations, + }) } fn run_assay_rows( &self, manifest_path: &str, - store: &GenotypeStore, + store: &dyn VariantLookup, participant_id: &str, filters: &[String], - ) -> Result>, JsError> { + ) -> Result { let assay = self.load_assay(manifest_path)?; let mut variants = Vec::<(String, VariantManifest)>::new(); for member in &assay.members { @@ -154,176 +212,22 @@ impl PackageWorkspace { let observations = store .lookup_variants(&specs) .map_err(|err| JsError::new(&format!("assay lookup failed: {err:?}")))?; - Ok(variants - .into_iter() - .zip(observations) - .map(|((resolved, manifest), observation)| { - variant_row( - &resolved, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - ) - }) - .collect()) - } - - pub(super) fn run_manifest_analyses( - &self, - manifest_path: &str, - input_name: &str, - input_bytes: &[u8], - participant_id: &str, - loader: &GenotypeLoadOptions, - options: &ReportOptionsInput, - ) -> Result, JsError> { - match self.schema(manifest_path)?.as_str() { - "bioscript:panel:1.0" => { - let panel = self.load_panel(manifest_path)?; - let mut analyses = self.run_interpretations( - manifest_path, - &panel.name, - &panel.interpretations, - input_name, - input_bytes, - participant_id, - loader, - options, - )?; - for member in &panel.members { - if member.kind != "assay" { - continue; - } - let Some(path) = &member.path else { - continue; - }; - let resolved = self.resolve(manifest_path, path)?; - analyses.extend(self.run_manifest_analyses( - &resolved, - input_name, - input_bytes, - participant_id, - loader, - options, - )?); - } - Ok(analyses) - } - "bioscript:assay:1.0" => { - let assay = self.load_assay(manifest_path)?; - self.run_interpretations( - manifest_path, - &assay.name, - &assay.interpretations, - input_name, - input_bytes, - participant_id, - loader, - options, - ) - } - _ => Ok(Vec::new()), - } - } - - #[allow(clippy::too_many_arguments)] - fn run_interpretations( - &self, - manifest_path: &str, - manifest_name: &str, - interpretations: &[PanelInterpretation], - input_name: &str, - input_bytes: &[u8], - participant_id: &str, - loader: &GenotypeLoadOptions, - options: &ReportOptionsInput, - ) -> Result, JsError> { - let mut outputs = Vec::new(); - for interpretation in interpretations { - if interpretation.kind != "bioscript" { - return Err(JsError::new(&format!( - "analysis '{}' uses unsupported kind '{}'", - interpretation.id, interpretation.kind - ))); - } - let script_path = self.resolve(manifest_path, &interpretation.path)?; - let output_file = format!( - "analysis/{participant_id}/{}.{}", - interpretation.id, - interpretation.output_format.as_deref().unwrap_or("json") - ); - let mut virtual_text_files = self.files.clone(); - let mut virtual_binary_files = BTreeMap::new(); - virtual_binary_files.insert(input_name.to_owned(), input_bytes.to_vec()); - let limits = ResourceLimits::new() - .max_duration(Duration::from_millis(options.analysis_max_duration_ms)) - .max_memory(16 * 1024 * 1024) - .max_allocations(400_000) - .gc_interval(1000) - .max_recursion_depth(Some(200)); - let runtime = BioscriptRuntime::with_config( - PathBuf::new(), - RuntimeConfig { - limits, - loader: loader.clone(), - virtual_binary_files, - virtual_text_files: std::mem::take(&mut virtual_text_files), - }, - ) - .map_err(|err| JsError::new(&format!("create analysis runtime failed: {err:?}")))?; - runtime - .run_file( - &script_path, - None, - vec![ - ("input_file", MontyObject::String(input_name.to_owned())), - ("output_file", MontyObject::String(output_file.clone())), - ("participant_id", MontyObject::String(participant_id.to_owned())), - ], - ) - .map_err(|err| JsError::new(&format!("analysis {} failed: {err:?}", interpretation.id)))?; - let written = runtime.virtual_written_text_files(); - let text = written - .get(&output_file) - .ok_or_else(|| JsError::new(&format!("analysis {} did not write {output_file}", interpretation.id)))?; - let format = interpretation - .output_format - .as_deref() - .unwrap_or("json") - .to_ascii_lowercase(); - let (rows, row_headers) = parse_analysis_output_text(text, &format)?; - outputs.push(serde_json::json!({ - "schema": "bioscript:analysis-output:1.0", - "version": "1.0", - "participant_id": participant_id, - "assay_id": manifest_name, - "analysis_id": interpretation.id, - "analysis_label": interpretation.label, - "kind": interpretation.kind, - "output_format": format, - "manifest_path": manifest_path, - "script_path": script_path, - "output_file": output_file, - "derived_from": interpretation.derived_from, - "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ - "key": emit.key, - "label": emit.label, - "value_type": emit.value_type, - "format": emit.format, - })).collect::>(), - "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ - "description": logic.description, - "source": logic.source.as_ref().map(|source| serde_json::json!({ - "name": source.name, - "url": source.url, - })), - })), - "row_headers": row_headers, - "rows": rows, - })); + let mut rows = Vec::with_capacity(variants.len()); + let mut collected = Vec::with_capacity(variants.len()); + for ((resolved, manifest), observation) in variants.into_iter().zip(observations) { + rows.push(variant_row( + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + collected.push(observation); } - Ok(outputs) + Ok(ManifestRowsOutput { + rows, + observations: collected, + }) } pub(super) fn app_observation_from_manifest_row( @@ -349,11 +253,20 @@ impl PackageWorkspace { let locus = if assembly.eq_ignore_ascii_case("grch37") { manifest.spec.grch37.as_ref() } else { - manifest.spec.grch38.as_ref().or(manifest.spec.grch37.as_ref()) + manifest + .spec + .grch38 + .as_ref() + .or(manifest.spec.grch37.as_ref()) }; let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); - let (genotype, zygosity) = - normalize_app_genotype(&genotype_display, &ref_allele, &alt_allele, &chrom, inferred_sex); + let (genotype, zygosity) = normalize_app_genotype( + &genotype_display, + &ref_allele, + &alt_allele, + &chrom, + inferred_sex, + ); let outcome = if genotype == "./." { "no_call" } else if zygosity == "hom_ref" || zygosity == "hem_ref" { @@ -397,7 +310,10 @@ impl PackageWorkspace { })) } - pub(super) fn report_manifest_metadata(&self, path: &str) -> Result { + pub(super) fn report_manifest_metadata( + &self, + path: &str, + ) -> Result { let value = self.yaml(path)?; let members = value .get("members") @@ -426,18 +342,30 @@ impl PackageWorkspace { })) } - pub(super) fn load_manifest_findings(&self, path: &str) -> Result, JsError> { + pub(super) fn load_manifest_findings( + &self, + path: &str, + ) -> Result, JsError> { let value = self.yaml(path)?; let schema = yaml_string(&value, "schema").unwrap_or_default(); let mut findings = Vec::new(); if matches!( schema.as_str(), - "bioscript:variant:1.0" | "bioscript:variant" | "bioscript:assay:1.0" | "bioscript:panel:1.0" | "bioscript:pgx-findings:1.0" + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" ) { - if let Some(items) = value.get("findings").and_then(serde_yaml::Value::as_sequence) { + if let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { for item in items { let json_item = yaml_to_json(item.clone())?; - if let Some(include) = json_item.get("include").and_then(serde_json::Value::as_str) { + if let Some(include) = + json_item.get("include").and_then(serde_json::Value::as_str) + { let include_path = self.resolve(path, include)?; let mut included = self.load_manifest_findings(&include_path)?; let inherited_binding = json_item.get("binding").cloned(); @@ -449,7 +377,9 @@ impl PackageWorkspace { if let Some(object) = included_item.as_object_mut() { object.insert( "binding".to_owned(), - inherited_binding.clone().unwrap_or(serde_json::Value::Null), + inherited_binding + .clone() + .unwrap_or(serde_json::Value::Null), ); } } @@ -461,12 +391,25 @@ impl PackageWorkspace { } } } - if matches!(schema.as_str(), "bioscript:assay:1.0" | "bioscript:panel:1.0") { - if let Some(items) = value.get("members").and_then(serde_yaml::Value::as_sequence) { + if matches!( + schema.as_str(), + "bioscript:assay:1.0" | "bioscript:panel:1.0" + ) { + if let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { continue }; - if !matches!(kind, "variant" | "assay") { continue; } - let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) else { continue }; + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) + else { + continue; + }; let resolved = self.resolve(path, member_path)?; findings.extend(self.load_manifest_findings(&resolved)?); } @@ -475,17 +418,33 @@ impl PackageWorkspace { Ok(findings) } - pub(super) fn load_manifest_provenance_links(&self, path: &str) -> Result, JsError> { + pub(super) fn load_manifest_provenance_links( + &self, + path: &str, + ) -> Result, JsError> { let value = self.yaml(path)?; let schema = yaml_string(&value, "schema").unwrap_or_default(); let mut links = BTreeMap::::new(); collect_manifest_provenance_entries(&value, &mut links)?; - if matches!(schema.as_str(), "bioscript:assay:1.0" | "bioscript:panel:1.0") { - if let Some(items) = value.get("members").and_then(serde_yaml::Value::as_sequence) { + if matches!( + schema.as_str(), + "bioscript:assay:1.0" | "bioscript:panel:1.0" + ) { + if let Some(items) = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + { for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { continue }; - if !matches!(kind, "variant" | "assay") { continue; } - let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) else { continue }; + let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { + continue; + }; + if !matches!(kind, "variant" | "assay") { + continue; + } + let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) + else { + continue; + }; let resolved = self.resolve(path, member_path)?; for item in self.load_manifest_provenance_links(&resolved)? { if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { diff --git a/rust/bioscript-wasm/src/report_workspace/analysis.rs b/rust/bioscript-wasm/src/report_workspace/analysis.rs new file mode 100644 index 0000000..c522419 --- /dev/null +++ b/rust/bioscript-wasm/src/report_workspace/analysis.rs @@ -0,0 +1,174 @@ +use super::*; + +impl PackageWorkspace { + pub(super) fn run_manifest_analyses( + &self, + manifest_path: &str, + input_name: &str, + input_bytes: &[u8], + preloaded_observations: &[VariantObservation], + participant_id: &str, + loader: &GenotypeLoadOptions, + options: &ReportOptionsInput, + ) -> Result, JsError> { + match self.schema(manifest_path)?.as_str() { + "bioscript:panel:1.0" => { + let panel = self.load_panel(manifest_path)?; + let mut analyses = self.run_interpretations( + manifest_path, + &panel.name, + &panel.interpretations, + input_name, + input_bytes, + preloaded_observations, + participant_id, + loader, + options, + )?; + for member in &panel.members { + if member.kind != "assay" { + continue; + } + let Some(path) = &member.path else { + continue; + }; + let resolved = self.resolve(manifest_path, path)?; + analyses.extend(self.run_manifest_analyses( + &resolved, + input_name, + input_bytes, + preloaded_observations, + participant_id, + loader, + options, + )?); + } + Ok(analyses) + } + "bioscript:assay:1.0" => { + let assay = self.load_assay(manifest_path)?; + self.run_interpretations( + manifest_path, + &assay.name, + &assay.interpretations, + input_name, + input_bytes, + preloaded_observations, + participant_id, + loader, + options, + ) + } + _ => Ok(Vec::new()), + } + } + + #[allow(clippy::too_many_arguments)] + fn run_interpretations( + &self, + manifest_path: &str, + manifest_name: &str, + interpretations: &[PanelInterpretation], + input_name: &str, + input_bytes: &[u8], + preloaded_observations: &[VariantObservation], + participant_id: &str, + loader: &GenotypeLoadOptions, + options: &ReportOptionsInput, + ) -> Result, JsError> { + let mut outputs = Vec::new(); + for interpretation in interpretations { + if interpretation.kind != "bioscript" { + return Err(JsError::new(&format!( + "analysis '{}' uses unsupported kind '{}'", + interpretation.id, interpretation.kind + ))); + } + let script_path = self.resolve(manifest_path, &interpretation.path)?; + let output_file = format!( + "analysis/{participant_id}/{}.{}", + interpretation.id, + interpretation.output_format.as_deref().unwrap_or("json") + ); + let mut virtual_text_files = self.files.clone(); + let mut virtual_binary_files = BTreeMap::new(); + virtual_binary_files.insert(input_name.to_owned(), input_bytes.to_vec()); + let limits = ResourceLimits::new() + .max_duration(Duration::from_millis(options.analysis_max_duration_ms)) + .max_memory(16 * 1024 * 1024) + .max_allocations(400_000) + .gc_interval(1000) + .max_recursion_depth(Some(200)); + let runtime = BioscriptRuntime::with_config( + PathBuf::new(), + RuntimeConfig { + limits, + loader: loader.clone(), + virtual_binary_files, + virtual_text_files: std::mem::take(&mut virtual_text_files), + preloaded_observations: preloaded_observations.to_vec(), + }, + ) + .map_err(|err| JsError::new(&format!("create analysis runtime failed: {err:?}")))?; + runtime + .run_file( + &script_path, + None, + vec![ + ("input_file", MontyObject::String(input_name.to_owned())), + ("output_file", MontyObject::String(output_file.clone())), + ( + "participant_id", + MontyObject::String(participant_id.to_owned()), + ), + ], + ) + .map_err(|err| { + JsError::new(&format!("analysis {} failed: {err:?}", interpretation.id)) + })?; + let written = runtime.virtual_written_text_files(); + let text = written.get(&output_file).ok_or_else(|| { + JsError::new(&format!( + "analysis {} did not write {output_file}", + interpretation.id + )) + })?; + let format = interpretation + .output_format + .as_deref() + .unwrap_or("json") + .to_ascii_lowercase(); + let (rows, row_headers) = parse_analysis_output_text(text, &format)?; + outputs.push(serde_json::json!({ + "schema": "bioscript:analysis-output:1.0", + "version": "1.0", + "participant_id": participant_id, + "assay_id": manifest_name, + "analysis_id": interpretation.id, + "analysis_label": interpretation.label, + "kind": interpretation.kind, + "output_format": format, + "manifest_path": manifest_path, + "script_path": script_path, + "output_file": output_file, + "derived_from": interpretation.derived_from, + "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ + "key": emit.key, + "label": emit.label, + "value_type": emit.value_type, + "format": emit.format, + })).collect::>(), + "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ + "description": logic.description, + "source": logic.source.as_ref().map(|source| serde_json::json!({ + "name": source.name, + "url": source.url, + })), + })), + "row_headers": row_headers, + "rows": rows, + })); + } + Ok(outputs) + } +}