diff --git a/vortex-bench/src/statpopgen/builder.rs b/vortex-bench/src/statpopgen/builder.rs index 4e11b4e43e4..8d772d5cf9b 100644 --- a/vortex-bench/src/statpopgen/builder.rs +++ b/vortex-bench/src/statpopgen/builder.rs @@ -11,6 +11,7 @@ use arrow_array::builder::Float32Builder; use arrow_array::builder::Int32Builder; use arrow_array::builder::ListBuilder; use arrow_array::builder::StringBuilder; +use arrow_array::builder::UInt8Builder; use arrow_array::builder::UInt64Builder; use arrow_schema::ArrowError; use arrow_schema::SchemaRef; @@ -125,7 +126,7 @@ pub struct GnomADBuilder<'a> { /// `1|1`. We do not support these in the GT field. /// /// Every list is the same length; however, individual positions may be missing. - pub GT_builder: ListBuilder, + pub GT_builder: ListBuilder, /// The genotype quality. /// /// A small non-negative integer indicating our confidence in this genotype. It is usually the @@ -444,3 +445,33 @@ impl<'a> GnomADBuilder<'a> { ) } } + +#[cfg(test)] +mod tests { + use arrow_schema::DataType; + use noodles_vcf::Header; + + use super::GnomADBuilder; + + /// Regression test: the GnomAD schema declares `GT` as `list(u8)`, so the builder must + /// emit a `List` array. Previously the builder used `UInt64`, so `finish` failed + /// inside `RecordBatch::try_new` with a schema/array type mismatch. + #[test] + fn gt_builder_matches_u8_schema() -> anyhow::Result<()> { + let header = Header::default(); + let schema = super::super::schema::schema_from_vcf_header(&header); + + // `finish` validates every column against the schema via `RecordBatch::try_new`, + // so this only succeeds when the GT builder's element type matches `list(u8)`. + let batch = GnomADBuilder::new(&header, schema).finish()?; + + let gt = batch + .column_by_name("GT") + .expect("GT column must be present"); + let DataType::List(item) = gt.data_type() else { + panic!("GT must be a List, got {:?}", gt.data_type()); + }; + assert_eq!(item.data_type(), &DataType::UInt8, "GT items must be u8"); + Ok(()) + } +} diff --git a/vortex-bench/src/statpopgen/schema.rs b/vortex-bench/src/statpopgen/schema.rs index e19afaa09a1..ec00e5a138c 100644 --- a/vortex-bench/src/statpopgen/schema.rs +++ b/vortex-bench/src/statpopgen/schema.rs @@ -34,7 +34,8 @@ pub fn schema_from_vcf_header(header: &Header) -> SchemaRef { .into_iter() .chain(info_fields) .chain([ - Arc::new(Field::new("GT", list(UInt64), true)), + // GT is NULL, 0, 1, or 2 + Arc::new(Field::new("GT", list(UInt8), true)), Arc::new(Field::new("GQ", list(Int32), true)), Arc::new(Field::new("DP", list(Int32), true)), Arc::new(Field::new("AD", list(list(Int32)), true)), diff --git a/vortex-bench/src/statpopgen/vcf_conversion.rs b/vortex-bench/src/statpopgen/vcf_conversion.rs index 357e81a93e3..f9a3ab1f844 100644 --- a/vortex-bench/src/statpopgen/vcf_conversion.rs +++ b/vortex-bench/src/statpopgen/vcf_conversion.rs @@ -128,7 +128,7 @@ pub fn value_list_string<'a>( }) } -pub fn parse_genotype(gt: Option) -> VortexResult> { +pub fn parse_genotype(gt: Option) -> VortexResult> { let Some(gt) = gt else { return Ok(None); }; @@ -140,7 +140,11 @@ pub fn parse_genotype(gt: Option) -> VortexResult> { .process_results(|iter| iter.map(|x| x.0).collect::>())?[..] { [None, None] => Ok(None), - [Some(l), Some(r)] => Ok(Some(l as u64 + r as u64)), + // GT dosage is the number of alternate alleles: 0, 1, or 2. + [Some(l), Some(r)] => match u8::try_from(l + r) { + Ok(dosage) => Ok(Some(dosage)), + Err(_) => vortex_bail!("genotype allele sum {} does not fit in u8", l + r), + }, _ => vortex_bail!("wtf {:?}", gt), } }