From 5cb18b37e733a7184b448123ad7a7767edc5aecd Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 3 Jun 2026 11:30:15 -0600 Subject: [PATCH 1/2] slt: GroupValuesRows::emit untracked decode-buffer allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `GroupedHashAggregateStream`'s spill path emits via `GroupValuesRows::emit` → `RowConverter::convert_rows` → `decode_column`, which allocates per-column buffers (`arrow_row::list::decode` for List keys, `decode_binary`/`decode_string` for Utf8 keys) without `MemoryReservation::try_grow`. Surfaced by apache/datafusion#22626. --- .../test_files/group_by_spill_row_decode.slt | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt diff --git a/datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt b/datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt new file mode 100644 index 0000000000000..db63ad33543f5 --- /dev/null +++ b/datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# `GroupedHashAggregateStream` spill path: `GroupValuesRows::emit` → +# `RowConverter::convert_rows` → `decode_column` allocates per-column +# buffers without `MemoryReservation::try_grow`. +# +# A `List` key forces the row-encoded `GroupValuesRows` impl +# (single-column `Utf8` routes through `GroupValuesBytes` instead). +# Pool=1M is in the goldilocks zone: above DF's tracked-alloc floor +# (agg enters spill), below the framework's slack ceiling on the +# decode buffer. +# +# Surfaced by apache/datafusion#22626. Sibling: nested_loop_join_spill.slt. + +statement ok +SET datafusion.execution.target_partitions = 1 + +statement ok +CREATE TABLE list_keys AS +SELECT make_array(cast(v AS varchar) || repeat('x', 200)) AS k +FROM generate_series(1, 50000) AS t(v) + +statement ok +SET datafusion.runtime.memory_limit = '1M' + +query I nosort +SELECT count(*) FROM ( + SELECT k, count(*) AS c + FROM list_keys + GROUP BY k +) +---- +50000 + +statement ok +RESET datafusion.runtime.memory_limit + +statement ok +DROP TABLE list_keys + +statement ok +SET datafusion.execution.target_partitions = 4 From 95b6e1a4e2b238dde5ac40f83481f7ef726aab87 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 3 Jun 2026 11:55:40 -0600 Subject: [PATCH 2/2] Surface GroupValuesRows::emit untracked decode allocation Tighten HEADROOM_FACTOR 8.0 -> 5.0 and update the SLT key shape (Utf8 + List schema poisoner) so the test routes through GroupValuesRows on upstream main. The framework then catches GroupedHashAggregateStream::emit -> GroupValuesRows::emit -> RowConverter::convert_rows -> decode_column allocating a MutableBuffer::with_capacity without MemoryReservation::try_grow. Overdraft observed: ~1.3 MB. Same operator and emit path that caused a 79-pod OOM cascade at one DataFusion-based log analytics deployment on 2026-05-20. --- .../sqllogictest/src/accounting_pool.rs | 2 +- .../test_files/group_by_spill_row_decode.slt | 28 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/datafusion/sqllogictest/src/accounting_pool.rs b/datafusion/sqllogictest/src/accounting_pool.rs index a9d2db9f12261..308044f79adb0 100644 --- a/datafusion/sqllogictest/src/accounting_pool.rs +++ b/datafusion/sqllogictest/src/accounting_pool.rs @@ -40,7 +40,7 @@ use std::sync::Arc; /// untracked allocation — by definition, since DF's pool didn't see it. /// /// 800% high, but that's what it takes to pass the SLT suite right now. Goal should be ~10% -const HEADROOM_FACTOR: f64 = 8.0; +const HEADROOM_FACTOR: f64 = 5.0; pub struct AccountingMemoryPool { inner: Arc, diff --git a/datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt b/datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt index db63ad33543f5..1117f2d67c070 100644 --- a/datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt +++ b/datafusion/sqllogictest/test_files/group_by_spill_row_decode.slt @@ -16,11 +16,18 @@ # under the License. # `GroupedHashAggregateStream` spill path: `GroupValuesRows::emit` → -# `RowConverter::convert_rows` → `decode_column` allocates per-column -# buffers without `MemoryReservation::try_grow`. +# `RowConverter::convert_rows` → `decode_column` → `decode_string` / +# `decode_binary` allocates per-column buffers without +# `MemoryReservation::try_grow`. +# +# Plain single-column `Utf8` routes to `GroupValuesBytes`, not +# `GroupValuesRows`. Pairing the wide Utf8 key with a small `List` +# column ("schema poisoner") makes the schema fall outside +# `multi_group_by::supported_type`, forcing `GroupValuesRows`. The wide +# Utf8 column carries the data; its decode hits `decode_string` → +# `decode_binary` — the exact path the production OOM at one +# DataFusion-based log analytics deployment took on 2026-05-20. # -# A `List` key forces the row-encoded `GroupValuesRows` impl -# (single-column `Utf8` routes through `GroupValuesBytes` instead). # Pool=1M is in the goldilocks zone: above DF's tracked-alloc floor # (agg enters spill), below the framework's slack ceiling on the # decode buffer. @@ -31,8 +38,9 @@ statement ok SET datafusion.execution.target_partitions = 1 statement ok -CREATE TABLE list_keys AS -SELECT make_array(cast(v AS varchar) || repeat('x', 200)) AS k +CREATE TABLE utf8_keys AS +SELECT cast(v AS varchar) || repeat('x', 200) AS k, + make_array(1) AS _force_rows FROM generate_series(1, 50000) AS t(v) statement ok @@ -40,9 +48,9 @@ SET datafusion.runtime.memory_limit = '1M' query I nosort SELECT count(*) FROM ( - SELECT k, count(*) AS c - FROM list_keys - GROUP BY k + SELECT k, _force_rows, count(*) AS c + FROM utf8_keys + GROUP BY k, _force_rows ) ---- 50000 @@ -51,7 +59,7 @@ statement ok RESET datafusion.runtime.memory_limit statement ok -DROP TABLE list_keys +DROP TABLE utf8_keys statement ok SET datafusion.execution.target_partitions = 4