Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions vortex-file/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1953,3 +1953,100 @@ async fn test_segment_ordering_zonemaps_after_data() -> VortexResult<()> {

Ok(())
}

// Regression test for https://github.com/vortex-data/vortex/issues/8166.
//
// A binary column whose per-zone max value has an all-0xff prefix (longer than the
// statistics-truncation length) cannot be upper-bounded, so the max stat is written as NULL.
// Filtering with `col > literal` must still return the row, but a pruning bug drops it.
#[tokio::test]
#[cfg_attr(miri, ignore)]
async fn repro_8166_binary_gt_all_ff_max() -> VortexResult<()> {
use vortex_buffer::ByteBuffer;
let mut ctx = SESSION.create_execution_ctx();

let empty: Vec<u8> = vec![];
let chunk0: Vec<Vec<u8>> = vec![
vec![0x1d, 0x00],
empty.clone(),
vec![0x1d, 0x10, 0x9d, 0x08],
empty.clone(),
empty.clone(),
empty.clone(),
empty.clone(),
empty.clone(),
empty.clone(),
];
let chunk1: Vec<Vec<u8>> = vec![
empty.clone(),
empty.clone(),
vec![0x40],
empty.clone(),
empty.clone(),
empty.clone(),
empty.clone(),
empty.clone(),
vec![0x24],
vec![0x43, 0xff],
];
// Row 27 (local index 8 in chunk2): 112-byte value, all 0xff except a 0x03 at byte 89.
let mut big = vec![0xffu8; 112];
big[89] = 0x03;
let mut chunk2: Vec<Vec<u8>> = vec![empty.clone(); 10];
chunk2[8] = big;

let bin = DType::Binary(Nullability::NonNullable);
// Chunk at the struct level: the zoned writer maps one input chunk to one zone, so this
// produces 3 zones. Zones 0/1 have small, valid maxes (< literal, pruned); zone 2's max is
// the all-0xff value, whose upper bound overflows and is written as a NULL max stat.
let mk_struct = |vals: Vec<Vec<u8>>| -> VortexResult<ArrayRef> {
let yyw = VarBinArray::from_vec(vals, bin.clone()).into_array();
Ok(StructArray::from_fields(&[("yyw", yyw)])?.into_array())
};
let array =
ChunkedArray::from_iter([mk_struct(chunk0)?, mk_struct(chunk1)?, mk_struct(chunk2)?])
.into_array();

let mut buf = ByteBufferMut::empty();
SESSION
.write_options()
.write(&mut buf, array.to_array_stream())
.await
.unwrap();

// literal = 0x6f x5 + 0xff x57 + 0x98 (63 bytes). Only row 27 is > literal.
let mut literal = vec![0x6fu8; 5];
literal.extend(iter::repeat_n(0xffu8, 57));
literal.push(0x98);
assert_eq!(literal.len(), 63);

let filter = gt(
get_item("yyw", root()),
lit(Scalar::binary(
ByteBuffer::from(literal),
Nullability::NonNullable,
)),
);

let result = SESSION
.open_options()
.open_buffer(buf)
.unwrap()
.scan()
.unwrap()
.with_filter(filter)
.into_array_stream()
.unwrap()
.read_all()
.await
.unwrap()
.execute::<StructArray>(&mut ctx)
.unwrap();

assert_eq!(
result.len(),
1,
"expected exactly row 27 to match the filter"
);
Ok(())
}
22 changes: 20 additions & 2 deletions vortex-layout/src/layouts/zoned/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,26 @@ impl StatsAccumulator {

// Different stats need different aggregations
match stat {
// For stats that are associative, we can just compute them over the stat column
Stat::Min | Stat::Max | Stat::Sum => {
// Min/Max are associative, so we aggregate them over the per-chunk stat column.
// For variable-width (utf8/binary) types these are *bounded* aggregates: a chunk's
// bound is null when its value could not be represented within
// `max_variable_length_statistics_size` (an all-0xFF prefix for Max), i.e. the
// bound is unknown. A plain Min/Max aggregate would silently skip that null and
// emit a bound derived from the remaining chunks, under-estimating Max /
// over-estimating Min and pruning rows that actually match
// (https://github.com/vortex-data/vortex/issues/8166). Treat a null bound as
// "no aggregate stat" for these types instead.
Stat::Min | Stat::Max => {
let bound_unknown = matches!(array.dtype(), DType::Utf8(_) | DType::Binary(_))
&& !array.all_valid(ctx)?;
if !bound_unknown
&& let Some(s) = array.statistics().compute_stat(stat, ctx)?
&& let Some(v) = s.into_value()
{
stats_set.set(stat, Precision::exact(v))
}
}
Stat::Sum => {
if let Some(s) = array.statistics().compute_stat(stat, ctx)?
&& let Some(v) = s.into_value()
{
Expand Down
Loading