From 944755bbeb7093d300423164f24c1374b17fb7b5 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 1 Jun 2026 00:26:08 +0100 Subject: [PATCH] Aggregating bounded min/max values has to not ignore null values Signed-off-by: Robert Kruszewski --- vortex-file/src/tests.rs | 97 ++++++++++++++++++++++ vortex-layout/src/layouts/zoned/builder.rs | 22 ++++- 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 2ba10d96684..6df3b97bd6a 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -1953,3 +1953,100 @@ async fn test_segment_ordering_zonemaps_after_data() -> VortexResult<()> { Ok(()) } + +// Regression test for https://github.com/vortex-data/vortex/issues/8166. +// +// A binary column whose per-zone max value has an all-0xff prefix (longer than the +// statistics-truncation length) cannot be upper-bounded, so the max stat is written as NULL. +// Filtering with `col > literal` must still return the row, but a pruning bug drops it. +#[tokio::test] +#[cfg_attr(miri, ignore)] +async fn repro_8166_binary_gt_all_ff_max() -> VortexResult<()> { + use vortex_buffer::ByteBuffer; + let mut ctx = SESSION.create_execution_ctx(); + + let empty: Vec = vec![]; + let chunk0: Vec> = vec![ + vec![0x1d, 0x00], + empty.clone(), + vec![0x1d, 0x10, 0x9d, 0x08], + empty.clone(), + empty.clone(), + empty.clone(), + empty.clone(), + empty.clone(), + empty.clone(), + ]; + let chunk1: Vec> = vec![ + empty.clone(), + empty.clone(), + vec![0x40], + empty.clone(), + empty.clone(), + empty.clone(), + empty.clone(), + empty.clone(), + vec![0x24], + vec![0x43, 0xff], + ]; + // Row 27 (local index 8 in chunk2): 112-byte value, all 0xff except a 0x03 at byte 89. + let mut big = vec![0xffu8; 112]; + big[89] = 0x03; + let mut chunk2: Vec> = vec![empty.clone(); 10]; + chunk2[8] = big; + + let bin = DType::Binary(Nullability::NonNullable); + // Chunk at the struct level: the zoned writer maps one input chunk to one zone, so this + // produces 3 zones. Zones 0/1 have small, valid maxes (< literal, pruned); zone 2's max is + // the all-0xff value, whose upper bound overflows and is written as a NULL max stat. + let mk_struct = |vals: Vec>| -> VortexResult { + let yyw = VarBinArray::from_vec(vals, bin.clone()).into_array(); + Ok(StructArray::from_fields(&[("yyw", yyw)])?.into_array()) + }; + let array = + ChunkedArray::from_iter([mk_struct(chunk0)?, mk_struct(chunk1)?, mk_struct(chunk2)?]) + .into_array(); + + let mut buf = ByteBufferMut::empty(); + SESSION + .write_options() + .write(&mut buf, array.to_array_stream()) + .await + .unwrap(); + + // literal = 0x6f x5 + 0xff x57 + 0x98 (63 bytes). Only row 27 is > literal. + let mut literal = vec![0x6fu8; 5]; + literal.extend(iter::repeat_n(0xffu8, 57)); + literal.push(0x98); + assert_eq!(literal.len(), 63); + + let filter = gt( + get_item("yyw", root()), + lit(Scalar::binary( + ByteBuffer::from(literal), + Nullability::NonNullable, + )), + ); + + let result = SESSION + .open_options() + .open_buffer(buf) + .unwrap() + .scan() + .unwrap() + .with_filter(filter) + .into_array_stream() + .unwrap() + .read_all() + .await + .unwrap() + .execute::(&mut ctx) + .unwrap(); + + assert_eq!( + result.len(), + 1, + "expected exactly row 27 to match the filter" + ); + Ok(()) +} diff --git a/vortex-layout/src/layouts/zoned/builder.rs b/vortex-layout/src/layouts/zoned/builder.rs index 7d19a8bb666..6e142150f86 100644 --- a/vortex-layout/src/layouts/zoned/builder.rs +++ b/vortex-layout/src/layouts/zoned/builder.rs @@ -147,8 +147,26 @@ impl StatsAccumulator { // Different stats need different aggregations match stat { - // For stats that are associative, we can just compute them over the stat column - Stat::Min | Stat::Max | Stat::Sum => { + // Min/Max are associative, so we aggregate them over the per-chunk stat column. + // For variable-width (utf8/binary) types these are *bounded* aggregates: a chunk's + // bound is null when its value could not be represented within + // `max_variable_length_statistics_size` (an all-0xFF prefix for Max), i.e. the + // bound is unknown. A plain Min/Max aggregate would silently skip that null and + // emit a bound derived from the remaining chunks, under-estimating Max / + // over-estimating Min and pruning rows that actually match + // (https://github.com/vortex-data/vortex/issues/8166). Treat a null bound as + // "no aggregate stat" for these types instead. + Stat::Min | Stat::Max => { + let bound_unknown = matches!(array.dtype(), DType::Utf8(_) | DType::Binary(_)) + && !array.all_valid(ctx)?; + if !bound_unknown + && let Some(s) = array.statistics().compute_stat(stat, ctx)? + && let Some(v) = s.into_value() + { + stats_set.set(stat, Precision::exact(v)) + } + } + Stat::Sum => { if let Some(s) = array.statistics().compute_stat(stat, ctx)? && let Some(v) = s.into_value() {