Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion vortex-bench/src/statpopgen/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use arrow_array::builder::Float32Builder;
use arrow_array::builder::Int32Builder;
use arrow_array::builder::ListBuilder;
use arrow_array::builder::StringBuilder;
use arrow_array::builder::UInt8Builder;
use arrow_array::builder::UInt64Builder;
use arrow_schema::ArrowError;
use arrow_schema::SchemaRef;
Expand Down Expand Up @@ -125,7 +126,7 @@ pub struct GnomADBuilder<'a> {
/// `1|1`. We do not support these in the GT field.
///
/// Every list is the same length; however, individual positions may be missing.
pub GT_builder: ListBuilder<UInt64Builder>,
pub GT_builder: ListBuilder<UInt8Builder>,
/// The genotype quality.
///
/// A small non-negative integer indicating our confidence in this genotype. It is usually the
Expand Down Expand Up @@ -444,3 +445,33 @@ impl<'a> GnomADBuilder<'a> {
)
}
}

#[cfg(test)]
mod tests {
    use arrow_schema::DataType;
    use noodles_vcf::Header;

    use super::GnomADBuilder;

    /// Regression guard for the element type of the `GT` column.
    ///
    /// The GnomAD schema declares `GT` as `list(u8)`, so the builder has to produce a
    /// `List<UInt8>` array. When the builder still used `UInt64`, `finish` failed inside
    /// `RecordBatch::try_new` because the array type did not match the schema.
    #[test]
    fn gt_builder_matches_u8_schema() -> anyhow::Result<()> {
        let vcf_header = Header::default();
        let gnomad_schema = super::super::schema::schema_from_vcf_header(&vcf_header);

        // `finish` checks each column against the schema through `RecordBatch::try_new`;
        // it therefore only returns `Ok` when the GT element type agrees with `list(u8)`.
        let record_batch = GnomADBuilder::new(&vcf_header, gnomad_schema).finish()?;

        let gt_column = record_batch
            .column_by_name("GT")
            .expect("GT column must be present");

        // The column must be a list whose items are `u8`; any other outer type is a failure.
        match gt_column.data_type() {
            DataType::List(element) => {
                assert_eq!(element.data_type(), &DataType::UInt8, "GT items must be u8");
            }
            _ => panic!("GT must be a List, got {:?}", gt_column.data_type()),
        }

        Ok(())
    }
}
3 changes: 2 additions & 1 deletion vortex-bench/src/statpopgen/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ pub fn schema_from_vcf_header(header: &Header) -> SchemaRef {
.into_iter()
.chain(info_fields)
.chain([
Arc::new(Field::new("GT", list(UInt64), true)),
// GT is NULL, 0, 1, or 2
Arc::new(Field::new("GT", list(UInt8), true)),
Arc::new(Field::new("GQ", list(Int32), true)),
Arc::new(Field::new("DP", list(Int32), true)),
Arc::new(Field::new("AD", list(list(Int32)), true)),
Expand Down
8 changes: 6 additions & 2 deletions vortex-bench/src/statpopgen/vcf_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ pub fn value_list_string<'a>(
})
}

pub fn parse_genotype(gt: Option<EntryValue>) -> VortexResult<Option<u64>> {
pub fn parse_genotype(gt: Option<EntryValue>) -> VortexResult<Option<u8>> {
let Some(gt) = gt else {
return Ok(None);
};
Expand All @@ -140,7 +140,11 @@ pub fn parse_genotype(gt: Option<EntryValue>) -> VortexResult<Option<u64>> {
.process_results(|iter| iter.map(|x| x.0).collect::<Vec<_>>())?[..]
{
[None, None] => Ok(None),
[Some(l), Some(r)] => Ok(Some(l as u64 + r as u64)),
// GT dosage is the number of alternate alleles: 0, 1, or 2.
[Some(l), Some(r)] => match u8::try_from(l + r) {
Ok(dosage) => Ok(Some(dosage)),
Err(_) => vortex_bail!("genotype allele sum {} does not fit in u8", l + r),
},
_ => vortex_bail!("wtf {:?}", gt),
}
}
Expand Down
Loading