From 340e38754fdc6e1276dd45152b2f1ee4da4d8fd5 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Thu, 21 May 2026 17:50:16 +0100 Subject: [PATCH 1/3] Use Uint8 for GT field in statpopgen instead of UInt64 Signed-off-by: Mikhail Kot --- vortex-bench/src/statpopgen/schema.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vortex-bench/src/statpopgen/schema.rs b/vortex-bench/src/statpopgen/schema.rs index e19afaa09a1..ec00e5a138c 100644 --- a/vortex-bench/src/statpopgen/schema.rs +++ b/vortex-bench/src/statpopgen/schema.rs @@ -34,7 +34,8 @@ pub fn schema_from_vcf_header(header: &Header) -> SchemaRef { .into_iter() .chain(info_fields) .chain([ - Arc::new(Field::new("GT", list(UInt64), true)), + // GT is NULL, 0, 1, or 2 + Arc::new(Field::new("GT", list(UInt8), true)), Arc::new(Field::new("GQ", list(Int32), true)), Arc::new(Field::new("DP", list(Int32), true)), Arc::new(Field::new("AD", list(list(Int32)), true)), From 6eecce7da48ffa5beb1d0f34e80c2dd55a9f50a6 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 2 Jun 2026 21:49:43 +0000 Subject: [PATCH 2/3] Fix statpopgen GnomADBuilder to emit UInt8 GT to match schema The "Use Uint8 for GT field" change set the GT field schema to list(UInt8), but the Arrow builder still produced list(UInt64), causing data generation to fail at write time: column types must match schema types, expected List(UInt8) but found List(UInt64) Switch GT_builder to ListBuilder and have parse_genotype return Option (genotype dosage is only ever NULL/0/1/2), so the produced arrays match the declared schema. Signed-off-by: Joe Isaacs Signed-off-by: Claude --- vortex-bench/src/statpopgen/builder.rs | 3 ++- vortex-bench/src/statpopgen/vcf_conversion.rs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vortex-bench/src/statpopgen/builder.rs b/vortex-bench/src/statpopgen/builder.rs index 4e11b4e43e4..86921a81007 100644 --- a/vortex-bench/src/statpopgen/builder.rs +++ b/vortex-bench/src/statpopgen/builder.rs @@ -11,6 +11,7 @@ use arrow_array::builder::Float32Builder; use arrow_array::builder::Int32Builder; use arrow_array::builder::ListBuilder; use arrow_array::builder::StringBuilder; +use arrow_array::builder::UInt8Builder; use arrow_array::builder::UInt64Builder; use arrow_schema::ArrowError; use arrow_schema::SchemaRef; @@ -125,7 +126,7 @@ pub struct GnomADBuilder<'a> { /// `1|1`. We do not support these in the GT field. /// /// Every list is the same length; however, individual positions may be missing. - pub GT_builder: ListBuilder, + pub GT_builder: ListBuilder, /// The genotype quality. /// /// A small non-negative integer indicating our confidence in this genotype. It is usually the diff --git a/vortex-bench/src/statpopgen/vcf_conversion.rs b/vortex-bench/src/statpopgen/vcf_conversion.rs index 357e81a93e3..b92285c4ce1 100644 --- a/vortex-bench/src/statpopgen/vcf_conversion.rs +++ b/vortex-bench/src/statpopgen/vcf_conversion.rs @@ -128,7 +128,7 @@ pub fn value_list_string<'a>( }) } -pub fn parse_genotype(gt: Option) -> VortexResult> { +pub fn parse_genotype(gt: Option) -> VortexResult> { let Some(gt) = gt else { return Ok(None); }; @@ -140,7 +140,7 @@ pub fn parse_genotype(gt: Option) -> VortexResult> { .process_results(|iter| iter.map(|x| x.0).collect::>())?[..] { [None, None] => Ok(None), - [Some(l), Some(r)] => Ok(Some(l as u64 + r as u64)), + [Some(l), Some(r)] => Ok(Some(l as u8 + r as u8)), _ => vortex_bail!("wtf {:?}", gt), } } From fd8c9b3f630ad10ecfdb1f8fc7b6e0d618c2ef50 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 08:03:23 +0000 Subject: [PATCH 3/3] Add regression test for statpopgen GT u8 builder Add a zero-row GnomADBuilder test that exercises RecordBatch::try_new and fails unless GT is emitted as list(u8), locking in the schema/ builder type match. Also convert genotype dosage with a checked u8::try_from to satisfy clippy's cast_possible_truncation lint. Signed-off-by: Joe Isaacs Signed-off-by: Claude --- vortex-bench/src/statpopgen/builder.rs | 30 +++++++++++++++++++ vortex-bench/src/statpopgen/vcf_conversion.rs | 6 +++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/vortex-bench/src/statpopgen/builder.rs b/vortex-bench/src/statpopgen/builder.rs index 86921a81007..8d772d5cf9b 100644 --- a/vortex-bench/src/statpopgen/builder.rs +++ b/vortex-bench/src/statpopgen/builder.rs @@ -445,3 +445,33 @@ impl<'a> GnomADBuilder<'a> { ) } } + +#[cfg(test)] +mod tests { + use arrow_schema::DataType; + use noodles_vcf::Header; + + use super::GnomADBuilder; + + /// Regression test: the GnomAD schema declares `GT` as `list(u8)`, so the builder must + /// emit a `List` array. Previously the builder used `UInt64`, so `finish` failed + /// inside `RecordBatch::try_new` with a schema/array type mismatch. + #[test] + fn gt_builder_matches_u8_schema() -> anyhow::Result<()> { + let header = Header::default(); + let schema = super::super::schema::schema_from_vcf_header(&header); + + // `finish` validates every column against the schema via `RecordBatch::try_new`, + // so this only succeeds when the GT builder's element type matches `list(u8)`. + let batch = GnomADBuilder::new(&header, schema).finish()?; + + let gt = batch + .column_by_name("GT") + .expect("GT column must be present"); + let DataType::List(item) = gt.data_type() else { + panic!("GT must be a List, got {:?}", gt.data_type()); + }; + assert_eq!(item.data_type(), &DataType::UInt8, "GT items must be u8"); + Ok(()) + } +} diff --git a/vortex-bench/src/statpopgen/vcf_conversion.rs b/vortex-bench/src/statpopgen/vcf_conversion.rs index b92285c4ce1..f9a3ab1f844 100644 --- a/vortex-bench/src/statpopgen/vcf_conversion.rs +++ b/vortex-bench/src/statpopgen/vcf_conversion.rs @@ -140,7 +140,11 @@ pub fn parse_genotype(gt: Option) -> VortexResult> { .process_results(|iter| iter.map(|x| x.0).collect::>())?[..] { [None, None] => Ok(None), - [Some(l), Some(r)] => Ok(Some(l as u8 + r as u8)), + // GT dosage is the number of alternate alleles: 0, 1, or 2. + [Some(l), Some(r)] => match u8::try_from(l + r) { + Ok(dosage) => Ok(Some(dosage)), + Err(_) => vortex_bail!("genotype allele sum {} does not fit in u8", l + r), + }, _ => vortex_bail!("wtf {:?}", gt), } }