From e1e1d09aba64873d6f9d65c7c960e73b33bb618b Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Thu, 9 Apr 2026 16:47:49 -0400 Subject: [PATCH 01/25] Bring over apache/arrow-rs/9683, integrate into sorts, add heuristic to choose between sort implementations. --- .../physical-plan/src/aggregates/row_hash.rs | 1 + datafusion/physical-plan/src/sorts/mod.rs | 1 + datafusion/physical-plan/src/sorts/radix.rs | 118 ++++++++ datafusion/physical-plan/src/sorts/sort.rs | 267 +++++++++++++++++- datafusion/physical-plan/src/sorts/stream.rs | 12 +- 5 files changed, 390 insertions(+), 9 deletions(-) create mode 100644 datafusion/physical-plan/src/sorts/radix.rs diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 056a7f171a516..a762e2ddd0b4b 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -1162,6 +1162,7 @@ impl GroupedHashAggregateStream { emit, self.spill_state.spill_expr.clone(), self.batch_size, + false, ); let spillfile = self .spill_state diff --git a/datafusion/physical-plan/src/sorts/mod.rs b/datafusion/physical-plan/src/sorts/mod.rs index a73872a175b9b..a83b44e6a2544 100644 --- a/datafusion/physical-plan/src/sorts/mod.rs +++ b/datafusion/physical-plan/src/sorts/mod.rs @@ -22,6 +22,7 @@ mod cursor; mod merge; mod multi_level_merge; pub mod partial_sort; +mod radix; pub mod sort; pub mod sort_preserving_merge; mod stream; diff --git a/datafusion/physical-plan/src/sorts/radix.rs b/datafusion/physical-plan/src/sorts/radix.rs new file mode 100644 index 0000000000000..377bad33a9d20 --- /dev/null +++ b/datafusion/physical-plan/src/sorts/radix.rs @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// TODO: replace with arrow_row::radix::radix_sort_to_indices once +// available in arrow-rs (see https://github.com/apache/arrow-rs/pull/9683) + +//! MSD radix sort on row-encoded keys. + +use arrow::array::UInt32Array; +use arrow::row::{RowConverter, Rows, SortField}; +use arrow_ord::sort::SortColumn; +use std::sync::Arc; + +/// 256-bucket histogram + scatter costs more than comparison sort at small n. +const FALLBACK_THRESHOLD: usize = 64; + +/// 8 bytes covers the discriminating prefix of most key layouts; deeper +/// recursion hits diminishing returns as buckets become sparse. +const MAX_DEPTH: usize = 8; + +/// Sort row indices using MSD radix sort on row-encoded keys. +/// +/// Returns a `UInt32Array` of row indices in sorted order. +pub(crate) fn radix_sort_to_indices( + sort_columns: &[SortColumn], +) -> arrow::error::Result { + let sort_fields: Vec = sort_columns + .iter() + .map(|col| { + SortField::new_with_options( + col.values.data_type().clone(), + col.options.unwrap_or_default(), + ) + }) + .collect(); + + let arrays: Vec<_> = sort_columns + .iter() + .map(|col| Arc::clone(&col.values)) + .collect(); + + let converter = RowConverter::new(sort_fields)?; + let rows = converter.convert_columns(&arrays)?; + + let n = rows.num_rows(); + let mut indices: Vec = (0..n as u32).collect(); + let mut temp = vec![0u32; n]; + msd_radix_sort(&mut indices, &mut temp, &rows, 0); + Ok(UInt32Array::from(indices)) +} + +fn msd_radix_sort(indices: &mut [u32], temp: &mut [u32], rows: &Rows, byte_pos: usize) { + let n = indices.len(); + + if n <= FALLBACK_THRESHOLD || byte_pos >= MAX_DEPTH { + indices.sort_unstable_by(|&a, &b| { + let ra = unsafe { rows.row_unchecked(a as usize) }; + let rb = unsafe { rows.row_unchecked(b as usize) }; + ra.cmp(&rb) + }); + return; + } + + let mut counts = [0u32; 256]; + for &idx in &*indices { + let row = unsafe { rows.row_unchecked(idx as usize) }; + let byte = row.data().get(byte_pos).copied().unwrap_or(0); + counts[byte as usize] += 1; + } + + // Skip scatter when all rows share the same byte + if counts.iter().filter(|&&c| c > 0).count() == 1 { + msd_radix_sort(indices, temp, rows, byte_pos + 1); + return; + } + + let mut offsets = [0u32; 257]; + for i in 0..256 { + offsets[i + 1] = offsets[i] + counts[i]; + } + + let temp = &mut temp[..n]; + let mut write_pos = offsets; + for &idx in &*indices { + let row = unsafe { rows.row_unchecked(idx as usize) }; + let byte = row.data().get(byte_pos).copied().unwrap_or(0) as usize; + temp[write_pos[byte] as usize] = idx; + write_pos[byte] += 1; + } + indices.copy_from_slice(temp); + + for bucket in 0..256 { + let start = offsets[bucket] as usize; + let end = offsets[bucket + 1] as usize; + if end - start > 1 { + msd_radix_sort( + &mut indices[start..end], + &mut temp[start..end], + rows, + byte_pos + 1, + ); + } + } +} diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 583bfa29b04ad..ddd97c330e330 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -55,7 +55,7 @@ use crate::{ use arrow::array::{Array, RecordBatch, RecordBatchOptions, StringViewArray}; use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays}; -use arrow::datatypes::SchemaRef; +use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::config::SpillCompression; use datafusion_common::tree_node::TreeNodeRecursion; use datafusion_common::{ @@ -220,6 +220,8 @@ struct ExternalSorter { /// the data will be concatenated and sorted in place rather than /// sort/merged. sort_in_place_threshold_bytes: usize, + /// Whether to use radix sort (decided once from expression types). + use_radix: bool, // ======================================================================== // STATE BUFFERS: @@ -294,6 +296,12 @@ impl ExternalSorter { ) .with_compression_type(spill_compression); + let sort_data_types: Vec = expr + .iter() + .map(|e| e.expr.data_type(&schema)) + .collect::>()?; + let use_radix = use_radix_sort(&sort_data_types.iter().collect::>()); + Ok(Self { schema, in_mem_batches: vec![], @@ -308,6 +316,7 @@ impl ExternalSorter { batch_size, sort_spill_reservation_bytes, sort_in_place_threshold_bytes, + use_radix, }) } @@ -735,13 +744,15 @@ impl ExternalSorter { let schema = batch.schema(); let expressions = self.expr.clone(); let batch_size = self.batch_size; + let use_radix = self.use_radix; let output_row_metrics = metrics.output_rows().clone(); let stream = futures::stream::once(async move { let schema = batch.schema(); // Sort the batch immediately and get all output batches - let sorted_batches = sort_batch_chunked(&batch, &expressions, batch_size)?; + let sorted_batches = + sort_batch_chunked(&batch, &expressions, batch_size, use_radix)?; // Resize the reservation to match the actual sorted output size. // Using try_resize avoids a release-then-reacquire cycle, which @@ -886,6 +897,24 @@ pub fn sort_batch( .map(|expr| expr.evaluate_to_sort_column(batch)) .collect::>>()?; + if fetch.is_none() + && use_radix_sort( + &sort_columns + .iter() + .map(|c| c.values.data_type()) + .collect::>(), + ) + { + let indices = super::radix::radix_sort_to_indices(&sort_columns)?; + let columns = take_arrays(batch.columns(), &indices, None)?; + let options = RecordBatchOptions::new().with_row_count(Some(indices.len())); + return Ok(RecordBatch::try_new_with_options( + batch.schema(), + columns, + &options, + )?); + } + let indices = lexsort_to_indices(&sort_columns, fetch)?; let columns = take_arrays(batch.columns(), &indices, None)?; @@ -897,6 +926,36 @@ pub fn sort_batch( )?) } +/// Returns true if radix sort should be used for the given sort column types. +/// +/// Radix sort is faster for most multi-column sorts but falls back to +/// lexsort when: +/// - All sort columns are dictionary-typed (long shared row prefixes +/// waste radix passes before falling back to comparison sort) +/// - Any sort column is a nested type (encoding cost is high and lexsort +/// short-circuits comparison on leading columns) +pub(super) fn use_radix_sort(data_types: &[&DataType]) -> bool { + if data_types.is_empty() { + return false; + } + + let mut all_dict = true; + for dt in data_types { + match dt { + DataType::Dictionary(_, _) => {} + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::Struct(_) + | DataType::Map(_, _) + | DataType::Union(_, _) => return false, + _ => all_dict = false, + } + } + + !all_dict +} + /// Sort a batch and return the result as multiple batches of size `batch_size`. /// This is useful when you want to avoid creating one large sorted batch in memory, /// and instead want to process the sorted data in smaller chunks. @@ -904,8 +963,15 @@ pub fn sort_batch_chunked( batch: &RecordBatch, expressions: &LexOrdering, batch_size: usize, + use_radix: bool, ) -> Result> { - IncrementalSortIterator::new(batch.clone(), expressions.clone(), batch_size).collect() + IncrementalSortIterator::new( + batch.clone(), + expressions.clone(), + batch_size, + use_radix, + ) + .collect() } /// Sort execution plan. @@ -2594,7 +2660,7 @@ mod tests { [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(); // Sort with batch_size = 250 - let result_batches = sort_batch_chunked(&batch, &expressions, 250)?; + let result_batches = sort_batch_chunked(&batch, &expressions, 250, false)?; // Verify 4 batches are returned assert_eq!(result_batches.len(), 4); @@ -2647,7 +2713,7 @@ mod tests { [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(); // Sort with batch_size = 100 - let result_batches = sort_batch_chunked(&batch, &expressions, 100)?; + let result_batches = sort_batch_chunked(&batch, &expressions, 100, false)?; // Should return exactly 1 batch assert_eq!(result_batches.len(), 1); @@ -2679,7 +2745,7 @@ mod tests { [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(); // Sort with batch_size = 100 - let result_batches = sort_batch_chunked(&batch, &expressions, 100)?; + let result_batches = sort_batch_chunked(&batch, &expressions, 100, false)?; // Should return exactly 10 batches of 100 rows each assert_eq!(result_batches.len(), 10); @@ -2706,7 +2772,7 @@ mod tests { let expressions: LexOrdering = [PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0)))].into(); - let result_batches = sort_batch_chunked(&batch, &expressions, 100)?; + let result_batches = sort_batch_chunked(&batch, &expressions, 100, false)?; // Empty input produces no output batches (0 chunks) assert_eq!(result_batches.len(), 0); @@ -2929,4 +2995,191 @@ mod tests { assert_eq!(desc.self_filters()[0].len(), 1); Ok(()) } + + #[test] + fn test_sort_batch_radix_multi_column() { + let a1: ArrayRef = Arc::new(Int32Array::from(vec![2, 1, 2, 1])); + let a2: ArrayRef = Arc::new(Int32Array::from(vec![4, 3, 2, 1])); + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new(schema, vec![a1, a2]).unwrap(); + + let expressions = LexOrdering::new(vec![ + PhysicalSortExpr::new_default(Arc::new(Column::new("a", 0))), + PhysicalSortExpr::new_default(Arc::new(Column::new("b", 1))), + ]) + .unwrap(); + + // No fetch -> should take the radix path + let sorted = sort_batch(&batch, &expressions, None).unwrap(); + let col_a = sorted + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let col_b = sorted + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col_a.values(), &[1, 1, 2, 2]); + assert_eq!(col_b.values(), &[1, 3, 2, 4]); + } + + #[test] + fn test_sort_batch_lexsort_with_fetch() { + let a: ArrayRef = Arc::new(Int32Array::from(vec![5, 3, 1, 4, 2])); + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let batch = RecordBatch::try_new(schema, vec![a]).unwrap(); + + let expressions = LexOrdering::new(vec![PhysicalSortExpr::new_default( + Arc::new(Column::new("a", 0)), + )]) + .unwrap(); + + // With fetch -> should use lexsort path + let sorted = sort_batch(&batch, &expressions, Some(2)).unwrap(); + assert_eq!(sorted.num_rows(), 2); + let col = sorted + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.values(), &[1, 2]); + } + + #[test] + fn test_use_radix_sort_heuristic() { + // Primitive columns -> radix + assert!(use_radix_sort(&[&DataType::Int32])); + + // All dictionary -> lexsort + let dict_type = + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + assert!(!use_radix_sort(&[&dict_type])); + + // List column -> lexsort + let list_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); + assert!(!use_radix_sort(&[&list_type])); + + // Mixed dict + primitive -> radix + assert!(use_radix_sort(&[&dict_type, &DataType::Int32])); + + // Empty -> no radix + assert!(!use_radix_sort(&[])); + } + + #[test] + fn test_sort_batch_radix_with_nulls_and_options() { + let a: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(3), + None, + Some(1), + None, + Some(2), + ])); + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + let batch = RecordBatch::try_new(schema, vec![a]).unwrap(); + + // Descending, nulls first + let expressions = LexOrdering::new(vec![PhysicalSortExpr::new( + Arc::new(Column::new("a", 0)), + SortOptions { + descending: true, + nulls_first: true, + }, + )]) + .unwrap(); + + let sorted = sort_batch(&batch, &expressions, None).unwrap(); + let col = sorted + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + // nulls first, then descending: NULL, NULL, 3, 2, 1 + assert!(col.is_null(0)); + assert!(col.is_null(1)); + assert_eq!(col.value(2), 3); + assert_eq!(col.value(3), 2); + assert_eq!(col.value(4), 1); + } + + #[test] + fn test_sort_batch_radix_matches_lexsort() { + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + + let mut rng = StdRng::seed_from_u64(0xCAFE); + + for _ in 0..50 { + let len = rng.random_range(10..500); + let a1: ArrayRef = Arc::new(Int32Array::from( + (0..len) + .map(|_| { + if rng.random_bool(0.1) { + None + } else { + Some(rng.random_range(-100..100)) + } + }) + .collect::>(), + )); + let a2: ArrayRef = Arc::new(StringArray::from( + (0..len) + .map(|_| { + if rng.random_bool(0.1) { + None + } else { + Some( + ["alpha", "beta", "gamma", "delta", "epsilon"] + [rng.random_range(0..5)], + ) + } + }) + .collect::>(), + )); + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new(schema, vec![a1, a2]).unwrap(); + + let desc = rng.random_bool(0.5); + let nf = rng.random_bool(0.5); + let opts = SortOptions { + descending: desc, + nulls_first: nf, + }; + let expressions = LexOrdering::new(vec![ + PhysicalSortExpr::new(Arc::new(Column::new("a", 0)), opts), + PhysicalSortExpr::new(Arc::new(Column::new("b", 1)), opts), + ]) + .unwrap(); + + // fetch=Some(len) forces the lexsort path while returning all rows + let lexsort_result = + sort_batch(&batch, &expressions, Some(len as usize)).unwrap(); + // fetch=None takes the radix path for these column types + let radix_result = sort_batch(&batch, &expressions, None).unwrap(); + + assert_eq!( + radix_result.num_rows(), + lexsort_result.num_rows(), + "row count mismatch" + ); + + for col_idx in 0..batch.num_columns() { + assert_eq!( + radix_result.column(col_idx).as_ref(), + lexsort_result.column(col_idx).as_ref(), + "column {col_idx} mismatch" + ); + } + } + } } diff --git a/datafusion/physical-plan/src/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs index ff7f259dd1347..9d297c4ea679f 100644 --- a/datafusion/physical-plan/src/sorts/stream.rs +++ b/datafusion/physical-plan/src/sorts/stream.rs @@ -299,6 +299,7 @@ pub(crate) struct IncrementalSortIterator { batch: RecordBatch, expressions: LexOrdering, batch_size: usize, + use_radix: bool, indices: Option, cursor: usize, } @@ -308,11 +309,13 @@ impl IncrementalSortIterator { batch: RecordBatch, expressions: LexOrdering, batch_size: usize, + use_radix: bool, ) -> Self { Self { batch, expressions, batch_size, + use_radix, cursor: 0, indices: None, } @@ -339,7 +342,12 @@ impl Iterator for IncrementalSortIterator { Err(e) => return Some(Err(e)), }; - let indices = match lexsort_to_indices(&sort_columns, None) { + let indices = if self.use_radix { + super::radix::radix_sort_to_indices(&sort_columns) + } else { + lexsort_to_indices(&sort_columns, None) + }; + let indices = match indices { Ok(indices) => indices, Err(e) => return Some(Err(e.into())), }; @@ -414,7 +422,7 @@ mod tests { .unwrap(); let mut total_rows = 0; - IncrementalSortIterator::new(batch.clone(), expressions, batch_size).try_for_each( + IncrementalSortIterator::new(batch.clone(), expressions, batch_size, false).try_for_each( |result| { let chunk = result?; total_rows += chunk.num_rows(); From fc35b08d57a6e7b7cc1942426980184e88d18c68 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 14:36:55 -0400 Subject: [PATCH 02/25] Stash with implementation, need to fix accounting for one test. --- datafusion/common/src/config.rs | 17 +- .../core/tests/fuzz_cases/sort_query_fuzz.rs | 18 +- datafusion/core/tests/memory_limit/mod.rs | 5 +- datafusion/execution/src/config.rs | 24 + datafusion/physical-plan/src/sorts/sort.rs | 431 +++++++++--------- .../test_files/information_schema.slt | 2 + 6 files changed, 257 insertions(+), 240 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 0eec3f948034a..21d0a52cbb16d 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -555,7 +555,22 @@ config_namespace! { /// When sorting, below what size should data be concatenated /// and sorted in a single RecordBatch rather than sorted in /// batches and merged. - pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024 + /// + /// Deprecated: this option is no longer used. The sort pipeline + /// now always coalesces batches before sorting. Use + /// `sort_coalesce_target_rows` instead. + pub sort_in_place_threshold_bytes: usize, warn = "`sort_in_place_threshold_bytes` is deprecated and ignored. Use `sort_coalesce_target_rows` instead.", default = 1024 * 1024 + + /// Target number of rows to coalesce before sorting in ExternalSorter. + /// + /// Larger values give radix sort (used for primitives and strings) + /// enough rows to amortize RowConverter encoding overhead. Under + /// memory pressure the actual chunk size may be smaller. + /// + /// For schemas that are not eligible for radix sort (all-dictionary + /// or nested types), the coalesce target falls back to `batch_size` + /// regardless of this setting. + pub sort_coalesce_target_rows: usize, default = 32768 /// Maximum buffer capacity (in bytes) per partition for BufferExec /// inserted during sort pushdown optimization. diff --git a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs index 376306f3e0659..578076955842b 100644 --- a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs @@ -497,17 +497,6 @@ impl SortFuzzerTestGenerator { ..=(per_partition_mem_limit as f64 * 0.3) as usize, ); - // 1 to 3 times of the approx batch size. Setting this to a very large nvalue - // will cause external sort to fail. - let sort_in_place_threshold_bytes = if with_memory_limit { - // For memory-limited query, setting `sort_in_place_threshold_bytes` too - // large will cause failure. - 0 - } else { - let dataset_size = self.dataset_state.as_ref().unwrap().dataset_size; - rng.random_range(0..=dataset_size * 2_usize) - }; - // Set up strings for printing let memory_limit_str = if with_memory_limit { human_readable_size(memory_limit) @@ -530,16 +519,11 @@ impl SortFuzzerTestGenerator { " Sort spill reservation bytes: {}", human_readable_size(sort_spill_reservation_bytes) ); - println!( - " Sort in place threshold bytes: {}", - human_readable_size(sort_in_place_threshold_bytes) - ); let config = SessionConfig::new() .with_target_partitions(num_partitions) .with_batch_size(init_state.approx_batch_num_rows / 2) - .with_sort_spill_reservation_bytes(sort_spill_reservation_bytes) - .with_sort_in_place_threshold_bytes(sort_in_place_threshold_bytes); + .with_sort_spill_reservation_bytes(sort_spill_reservation_bytes); let memory_pool: Arc = if with_memory_limit { Arc::new(FairSpillPool::new(memory_limit)) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 90459960c5561..912d92332fe9a 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -273,9 +273,7 @@ async fn sort_spill_reservation() { let scenario = Scenario::new_dictionary_strings(1); let partition_size = scenario.partition_size(); - let base_config = SessionConfig::new() - // do not allow the sort to use the 'concat in place' path - .with_sort_in_place_threshold_bytes(10); + let base_config = SessionConfig::new(); // This test case shows how sort_spill_reservation works by // purposely sorting data that requires non trivial memory to @@ -583,7 +581,6 @@ async fn setup_context( let config = SessionConfig::new() .with_sort_spill_reservation_bytes(64 * 1024) // 256KB - .with_sort_in_place_threshold_bytes(0) .with_spill_compression(spill_compression) .with_batch_size(64) // To reduce test memory usage .with_target_partitions(1); diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 854d239236766..5e3f187b19fcb 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -455,7 +455,15 @@ impl SessionConfig { /// Set the size of [`sort_in_place_threshold_bytes`] to control /// how sort does things. /// + /// Deprecated: this option is no longer used. Use + /// [`with_sort_coalesce_target_rows`] instead. + /// /// [`sort_in_place_threshold_bytes`]: datafusion_common::config::ExecutionOptions::sort_in_place_threshold_bytes + /// [`with_sort_coalesce_target_rows`]: Self::with_sort_coalesce_target_rows + #[deprecated( + since = "46.0.0", + note = "No longer used. Sort pipeline now coalesces batches before sorting. Use with_sort_coalesce_target_rows instead." + )] pub fn with_sort_in_place_threshold_bytes( mut self, sort_in_place_threshold_bytes: usize, @@ -465,6 +473,22 @@ impl SessionConfig { self } + /// Set the target number of rows to coalesce before sorting. + /// + /// Larger values give radix sort enough rows to amortize encoding + /// overhead (2-3x faster at 32K+ rows). Under memory pressure the + /// actual chunk size may be smaller. + /// + /// [`sort_coalesce_target_rows`]: datafusion_common::config::ExecutionOptions::sort_coalesce_target_rows + pub fn with_sort_coalesce_target_rows( + mut self, + sort_coalesce_target_rows: usize, + ) -> Self { + self.options_mut().execution.sort_coalesce_target_rows = + sort_coalesce_target_rows; + self + } + /// Enables or disables the enforcement of batch size in joins pub fn with_enforce_batch_size_in_joins( mut self, diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index ddd97c330e330..d6f8848194b50 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -54,7 +54,7 @@ use crate::{ }; use arrow::array::{Array, RecordBatch, RecordBatchOptions, StringViewArray}; -use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays}; +use arrow::compute::{BatchCoalescer, lexsort_to_indices, take_arrays}; use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::config::SpillCompression; use datafusion_common::tree_node::TreeNodeRecursion; @@ -216,10 +216,6 @@ struct ExternalSorter { expr: LexOrdering, /// The target number of rows for output batches batch_size: usize, - /// If the in size of buffered memory batches is below this size, - /// the data will be concatenated and sorted in place rather than - /// sort/merged. - sort_in_place_threshold_bytes: usize, /// Whether to use radix sort (decided once from expression types). use_radix: bool, @@ -227,8 +223,14 @@ struct ExternalSorter { // STATE BUFFERS: // Fields that hold intermediate data during sorting // ======================================================================== - /// Unsorted input batches stored in the memory buffer - in_mem_batches: Vec, + /// Accumulates incoming batches until `coalesce_target_rows` is reached, + /// at which point the coalesced batch is sorted and stored as a run. + /// Set to `None` after `sort()` consumes it. + coalescer: Option, + + /// Pre-sorted runs of `batch_size`-chunked `RecordBatch`es. Each inner + /// `Vec` is a single sorted run produced by sorting one coalesced batch. + sorted_runs: Vec>, /// During external sorting, in-memory intermediate data will be appended to /// this file incrementally. Once finished, this file will be moved to [`Self::finished_spill_files`]. @@ -251,11 +253,11 @@ struct ExternalSorter { metrics: ExternalSorterMetrics, /// A handle to the runtime to get spill files runtime: Arc, - /// Reservation for in_mem_batches + /// Reservation for sorted_runs (and coalescer contents) reservation: MemoryReservation, spill_manager: SpillManager, - /// Reservation for the merging of in-memory batches. If the sort + /// Reservation for the merging of sorted runs. If the sort /// might spill, `sort_spill_reservation_bytes` will be /// pre-reserved to ensure there is some space for this sort/merge. merge_reservation: MemoryReservation, @@ -274,7 +276,7 @@ impl ExternalSorter { expr: LexOrdering, batch_size: usize, sort_spill_reservation_bytes: usize, - sort_in_place_threshold_bytes: usize, + sort_coalesce_target_rows: usize, // Configured via `datafusion.execution.spill_compression`. spill_compression: SpillCompression, metrics: &ExecutionPlanMetricsSet, @@ -302,9 +304,18 @@ impl ExternalSorter { .collect::>()?; let use_radix = use_radix_sort(&sort_data_types.iter().collect::>()); + let coalesce_target_rows = if use_radix { + sort_coalesce_target_rows + } else { + batch_size + }; + + let coalescer = BatchCoalescer::new(Arc::clone(&schema), coalesce_target_rows); + Ok(Self { schema, - in_mem_batches: vec![], + coalescer: Some(coalescer), + sorted_runs: vec![], in_progress_spill_file: None, finished_spill_files: vec![], expr, @@ -315,14 +326,15 @@ impl ExternalSorter { runtime, batch_size, sort_spill_reservation_bytes, - sort_in_place_threshold_bytes, use_radix, }) } - /// Appends an unsorted [`RecordBatch`] to `in_mem_batches` + /// Appends an unsorted [`RecordBatch`] to the coalescer. /// - /// Updates memory usage metrics, and possibly triggers spilling to disk + /// The coalescer accumulates batches until `coalesce_target_rows` is + /// reached, then sorts the coalesced batch and stores it as a sorted run. + /// Updates memory usage metrics, and possibly triggers spilling to disk. async fn insert_batch(&mut self, input: RecordBatch) -> Result<()> { if input.num_rows() == 0 { return Ok(()); @@ -332,7 +344,85 @@ impl ExternalSorter { self.reserve_memory_for_batch_and_maybe_spill(&input) .await?; - self.in_mem_batches.push(input); + let coalescer = self + .coalescer + .as_mut() + .expect("coalescer must exist during insert phase"); + coalescer + .push_batch(input) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + // Sort any completed coalesced batches and store as runs + self.drain_completed_batches()?; + + Ok(()) + } + + /// Drains completed (full) batches from the coalescer, sorts each, + /// and appends the sorted chunks to `sorted_runs`. + fn drain_completed_batches(&mut self) -> Result<()> { + // Collect completed batches first to avoid borrow conflict + let mut completed = vec![]; + if let Some(coalescer) = self.coalescer.as_mut() { + while let Some(batch) = coalescer.next_completed_batch() { + completed.push(batch); + } + } + for batch in &completed { + self.sort_and_store_run(batch)?; + } + Ok(()) + } + + /// Sorts a single coalesced batch and stores the result as a new run. + /// + /// Uses radix sort when the batch is large enough to amortize encoding + /// overhead (more than `batch_size` rows). Otherwise falls back to lexsort. + fn sort_and_store_run(&mut self, batch: &RecordBatch) -> Result<()> { + let use_radix_for_this_batch = + self.use_radix && batch.num_rows() > self.batch_size; + + let sorted_chunks = if use_radix_for_this_batch { + // Radix sort on large coalesced batch, chunk output to batch_size + sort_batch_chunked( + batch, + &self.expr, + self.batch_size, + true, // use_radix + )? + } else { + // Lexsort (or small batch from degraded radix path) + vec![sort_batch(batch, &self.expr, None)?] + }; + + self.sorted_runs.push(sorted_chunks); + + // Resize reservation to match actual total of all sorted runs. + // After coalescing + sorting, the sorted runs may differ in size + // from the 2x reservations made for original input batches + // (e.g., StringView GC compacts buffers, slicing changes layout). + let total_sorted_size: usize = self + .sorted_runs + .iter() + .flat_map(|run| run.iter()) + .map(get_record_batch_memory_size) + .sum(); + self.reservation + .try_resize(total_sorted_size) + .map_err(Self::err_with_oom_context)?; + + Ok(()) + } + + /// Flushes any partially accumulated rows from the coalescer, sorts them, + /// and stores as a run. Called before spilling and at sort() time. + fn flush_coalescer(&mut self) -> Result<()> { + if let Some(coalescer) = self.coalescer.as_mut() { + coalescer + .finish_buffered_batch() + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + self.drain_completed_batches()?; + } Ok(()) } @@ -340,22 +430,31 @@ impl ExternalSorter { !self.finished_spill_files.is_empty() } + /// Returns true if there are sorted runs in memory. + fn has_sorted_runs(&self) -> bool { + !self.sorted_runs.is_empty() + } + /// Returns the final sorted output of all batches inserted via /// [`Self::insert_batch`] as a stream of [`RecordBatch`]es. /// /// This process could either be: /// - /// 1. An in-memory sort/merge (if the input fit in memory) + /// 1. An in-memory merge of sorted runs (if the input fit in memory) /// - /// 2. A combined streaming merge incorporating both in-memory - /// batches and data from spill files on disk. + /// 2. A combined streaming merge incorporating sorted runs + /// and data from spill files on disk. async fn sort(&mut self) -> Result { + // Flush remaining data from the coalescer + self.flush_coalescer()?; + self.coalescer = None; + if self.spilled_before() { - // Sort `in_mem_batches` and spill it first. If there are many - // `in_mem_batches` and the memory limit is almost reached, merging + // Merge and spill remaining sorted runs first. If there are many + // sorted runs and the memory limit is almost reached, merging // them with the spilled files at the same time might cause OOM. - if !self.in_mem_batches.is_empty() { - self.sort_and_spill_in_mem_batches().await?; + if self.has_sorted_runs() { + self.spill_sorted_runs().await?; } // Transfer the pre-reserved merge memory to the streaming merge @@ -377,11 +476,11 @@ impl ExternalSorter { .build() } else { // Release the memory reserved for merge back to the pool so - // there is some left when `in_mem_sort_stream` requests an + // there is some left when `merge_sorted_runs` requests an // allocation. Only needed for the non-spill path; the spill // path transfers the reservation to the merge stream instead. self.merge_reservation.free(); - self.in_mem_sort_stream(self.metrics.baseline.clone()) + self.merge_sorted_runs(self.metrics.baseline.clone()) } } @@ -537,27 +636,26 @@ impl ExternalSorter { Ok(()) } - /// Sorts the in-memory batches and merges them into a single sorted run, then writes - /// the result to spill files. - async fn sort_and_spill_in_mem_batches(&mut self) -> Result<()> { + /// Merges the pre-sorted runs and writes the result to spill files. + async fn spill_sorted_runs(&mut self) -> Result<()> { assert_or_internal_err!( - !self.in_mem_batches.is_empty(), - "in_mem_batches must not be empty when attempting to sort and spill" + self.has_sorted_runs(), + "sorted_runs must not be empty when attempting to spill" ); // Release the memory reserved for merge back to the pool so - // there is some left when `in_mem_sort_stream` requests an + // there is some left when `merge_sorted_runs` requests an // allocation. At the end of this function, memory will be // reserved again for the next spill. self.merge_reservation.free(); let mut sorted_stream = - self.in_mem_sort_stream(self.metrics.baseline.intermediate())?; - // After `in_mem_sort_stream()` is constructed, all `in_mem_batches` is taken + self.merge_sorted_runs(self.metrics.baseline.intermediate())?; + // After `merge_sorted_runs()` is constructed, all `sorted_runs` are taken // to construct a globally sorted stream. assert_or_internal_err!( - self.in_mem_batches.is_empty(), - "in_mem_batches should be empty after constructing sorted stream" + self.sorted_runs.is_empty(), + "sorted_runs should be empty after constructing sorted stream" ); // 'global' here refers to all buffered batches when the memory limit is // reached. This variable will buffer the sorted batches after @@ -589,10 +687,10 @@ impl ExternalSorter { // Sanity check after spilling let buffers_cleared_property = - self.in_mem_batches.is_empty() && globally_sorted_batches.is_empty(); + self.sorted_runs.is_empty() && globally_sorted_batches.is_empty(); assert_or_internal_err!( buffers_cleared_property, - "in_mem_batches and globally_sorted_batches should be cleared before" + "sorted_runs and globally_sorted_batches should be cleared before" ); // Reserve headroom for next sort/merge @@ -601,112 +699,85 @@ impl ExternalSorter { Ok(()) } - /// Consumes in_mem_batches returning a sorted stream of - /// batches. This proceeds in one of two ways: - /// - /// # Small Datasets - /// - /// For "smaller" datasets, the data is first concatenated into a - /// single batch and then sorted. This is often faster than - /// sorting and then merging. - /// - /// ```text - /// ┌─────┐ - /// │ 2 │ - /// │ 3 │ - /// │ 1 │─ ─ ─ ─ ┐ ┌─────┐ - /// │ 4 │ │ 2 │ - /// │ 2 │ │ │ 3 │ - /// └─────┘ │ 1 │ sorted output - /// ┌─────┐ ▼ │ 4 │ stream - /// │ 1 │ │ 2 │ - /// │ 4 │─ ─▶ concat ─ ─ ─ ─ ▶│ 1 │─ ─ ▶ sort ─ ─ ─ ─ ─▶ - /// │ 1 │ │ 4 │ - /// └─────┘ ▲ │ 1 │ - /// ... │ │ ... │ - /// │ 4 │ - /// ┌─────┐ │ │ 3 │ - /// │ 4 │ └─────┘ - /// │ 3 │─ ─ ─ ─ ┘ - /// └─────┘ - /// in_mem_batches - /// ``` - /// - /// # Larger datasets - /// - /// For larger datasets, the batches are first sorted individually - /// and then merged together. + /// Merges the pre-sorted runs stored in `sorted_runs` into a single + /// sorted output stream. Each run is already sorted internally; this + /// method k-way merges them using the loser tree. /// /// ```text - /// ┌─────┐ ┌─────┐ - /// │ 2 │ │ 1 │ - /// │ 3 │ │ 2 │ - /// │ 1 │─ ─▶ sort ─ ─▶│ 2 │─ ─ ─ ─ ─ ┐ - /// │ 4 │ │ 3 │ - /// │ 2 │ │ 4 │ │ - /// └─────┘ └─────┘ sorted output - /// ┌─────┐ ┌─────┐ ▼ stream - /// │ 1 │ │ 1 │ - /// │ 4 │─ ▶ sort ─ ─ ▶│ 1 ├ ─ ─ ▶ merge ─ ─ ─ ─▶ - /// │ 1 │ │ 4 │ - /// └─────┘ └─────┘ ▲ - /// ... ... ... │ - /// - /// ┌─────┐ ┌─────┐ │ - /// │ 4 │ │ 3 │ - /// │ 3 │─ ▶ sort ─ ─ ▶│ 4 │─ ─ ─ ─ ─ ┘ - /// └─────┘ └─────┘ - /// - /// in_mem_batches + /// sorted_runs[0] sorted_runs[1] + /// ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ + /// │ 1,2 │ │ 3,4 │ │ 1,3 │ │ 5,7 │ + /// └──┬──┘ └──┬──┘ └──┬──┘ └──┬──┘ + /// └───┬───┘ └───┬───┘ + /// ▼ ▼ + /// stream 0 ─ ─ ─ ─ ─ ─ ─▶ merge ◀─ ─ ─ stream 1 + /// │ + /// ▼ + /// sorted output stream /// ``` - fn in_mem_sort_stream( + fn merge_sorted_runs( &mut self, metrics: BaselineMetrics, ) -> Result { - if self.in_mem_batches.is_empty() { + let all_runs = std::mem::take(&mut self.sorted_runs); + + if all_runs.is_empty() { return Ok(Box::pin(EmptyRecordBatchStream::new(Arc::clone( &self.schema, )))); } - // The elapsed compute timer is updated when the value is dropped. - // There is no need for an explicit call to drop. let elapsed_compute = metrics.elapsed_compute().clone(); let _timer = elapsed_compute.timer(); - // Please pay attention that any operation inside of `in_mem_sort_stream` will - // not perform any memory reservation. This is for avoiding the need of handling - // reservation failure and spilling in the middle of the sort/merge. The memory - // space for batches produced by the resulting stream will be reserved by the - // consumer of the stream. - - if self.in_mem_batches.len() == 1 { - let batch = self.in_mem_batches.swap_remove(0); + // Single run: stream the chunks directly, no merge needed + if all_runs.len() == 1 { + let run = all_runs.into_iter().next().unwrap(); let reservation = self.reservation.take(); - return self.sort_batch_stream(batch, &metrics, reservation); - } - - // If less than sort_in_place_threshold_bytes, concatenate and sort in place - if self.reservation.size() < self.sort_in_place_threshold_bytes { - // Concatenate memory batches together and sort - let batch = concat_batches(&self.schema, &self.in_mem_batches)?; - self.in_mem_batches.clear(); - self.reservation - .try_resize(get_reserved_bytes_for_record_batch(&batch)?) - .map_err(Self::err_with_oom_context)?; - let reservation = self.reservation.take(); - return self.sort_batch_stream(batch, &metrics, reservation); + let schema = Arc::clone(&self.schema); + let output_rows = metrics.output_rows().clone(); + let stream = + futures::stream::iter(run.into_iter().map(Ok)).map(move |batch| { + match batch { + Ok(batch) => { + output_rows.add(batch.num_rows()); + Ok(batch) + } + Err(e) => Err(e), + } + }); + return Ok(Box::pin(ReservationStream::new( + Arc::clone(&schema), + Box::pin(RecordBatchStreamAdapter::new(schema, stream)), + reservation, + ))); } - let streams = std::mem::take(&mut self.in_mem_batches) + // Multiple runs: create one stream per run and merge + let streams = all_runs .into_iter() - .map(|batch| { - let metrics = self.metrics.baseline.intermediate(); - let reservation = self - .reservation - .split(get_reserved_bytes_for_record_batch(&batch)?); - let input = self.sort_batch_stream(batch, &metrics, reservation)?; - Ok(spawn_buffered(input, 1)) + .map(|run| { + let run_size: usize = run.iter().map(get_record_batch_memory_size).sum(); + let reservation = self.reservation.split(run_size); + let schema = Arc::clone(&self.schema); + let intermediate_metrics = self.metrics.baseline.intermediate(); + let output_rows = intermediate_metrics.output_rows().clone(); + let stream = + futures::stream::iter(run.into_iter().map(Ok)).map(move |batch| { + match batch { + Ok(batch) => { + output_rows.add(batch.num_rows()); + Ok(batch) + } + Err(e) => Err(e), + } + }); + let boxed: SendableRecordBatchStream = Box::pin(ReservationStream::new( + Arc::clone(&schema), + Box::pin(RecordBatchStreamAdapter::new(schema, stream)), + reservation, + )); + Ok(spawn_buffered(boxed, 1)) }) .collect::>()?; @@ -721,73 +792,6 @@ impl ExternalSorter { .build() } - /// Sorts a single `RecordBatch` into a single stream. - /// - /// This may output multiple batches depending on the size of the - /// sorted data and the target batch size. - /// For single-batch output cases, `reservation` will be freed immediately after sorting, - /// as the batch will be output and is expected to be reserved by the consumer of the stream. - /// For multi-batch output cases, `reservation` will be grown to match the actual - /// size of sorted output, and as each batch is output, its memory will be freed from the reservation. - /// (This leads to the same behaviour, as futures are only evaluated when polled by the consumer.) - fn sort_batch_stream( - &self, - batch: RecordBatch, - metrics: &BaselineMetrics, - reservation: MemoryReservation, - ) -> Result { - assert_eq!( - get_reserved_bytes_for_record_batch(&batch)?, - reservation.size() - ); - - let schema = batch.schema(); - let expressions = self.expr.clone(); - let batch_size = self.batch_size; - let use_radix = self.use_radix; - let output_row_metrics = metrics.output_rows().clone(); - - let stream = futures::stream::once(async move { - let schema = batch.schema(); - - // Sort the batch immediately and get all output batches - let sorted_batches = - sort_batch_chunked(&batch, &expressions, batch_size, use_radix)?; - - // Resize the reservation to match the actual sorted output size. - // Using try_resize avoids a release-then-reacquire cycle, which - // matters for MemoryPool implementations where grow/shrink have - // non-trivial cost (e.g. JNI calls in Comet). - let total_sorted_size: usize = sorted_batches - .iter() - .map(get_record_batch_memory_size) - .sum(); - reservation - .try_resize(total_sorted_size) - .map_err(Self::err_with_oom_context)?; - - // Wrap in ReservationStream to hold the reservation - Result::<_, DataFusionError>::Ok(Box::pin(ReservationStream::new( - Arc::clone(&schema), - Box::pin(RecordBatchStreamAdapter::new( - Arc::clone(&schema), - futures::stream::iter(sorted_batches.into_iter().map(Ok)), - )), - reservation, - )) as SendableRecordBatchStream) - }) - .try_flatten() - .map(move |batch| match batch { - Ok(batch) => { - output_row_metrics.add(batch.num_rows()); - Ok(batch) - } - Err(e) => Err(e), - }); - - Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) - } - /// If this sort may spill, pre-allocates /// `sort_spill_reservation_bytes` of memory to guarantee memory /// left for the in memory sort/merge. @@ -806,7 +810,8 @@ impl ExternalSorter { } /// Reserves memory to be able to accommodate the given batch. - /// If memory is scarce, tries to spill current in-memory batches to disk first. + /// If memory is scarce, flushes the coalescer, spills sorted runs to disk, + /// and retries. async fn reserve_memory_for_batch_and_maybe_spill( &mut self, input: &RecordBatch, @@ -816,12 +821,15 @@ impl ExternalSorter { match self.reservation.try_grow(size) { Ok(_) => Ok(()), Err(e) => { - if self.in_mem_batches.is_empty() { + // Flush coalescer: sort whatever's accumulated (even if partial) + self.flush_coalescer()?; + + if !self.has_sorted_runs() { return Err(Self::err_with_oom_context(e)); } - // Spill and try again. - self.sort_and_spill_in_mem_batches().await?; + // Spill sorted runs and try again. + self.spill_sorted_runs().await?; self.reservation .try_grow(size) .map_err(Self::err_with_oom_context) @@ -1384,7 +1392,7 @@ impl ExecutionPlan for SortExec { self.expr.clone(), context.session_config().batch_size(), execution_options.sort_spill_reservation_bytes, - execution_options.sort_in_place_threshold_bytes, + execution_options.sort_coalesce_target_rows, context.session_config().spill_compression(), &self.metrics_set, context.runtime_env(), @@ -1511,7 +1519,7 @@ mod tests { use crate::test::{assert_is_pending, make_partition}; use arrow::array::*; - use arrow::compute::SortOptions; + use arrow::compute::{SortOptions, concat_batches}; use arrow::datatypes::*; use datafusion_common::ScalarValue; use datafusion_common::cast::as_primitive_array; @@ -1828,7 +1836,6 @@ mod tests { async fn test_sort_spill_utf8_strings() -> Result<()> { let session_config = SessionConfig::new() .with_batch_size(100) - .with_sort_in_place_threshold_bytes(20 * 1024) .with_sort_spill_reservation_bytes(100 * 1024); let runtime = RuntimeEnvBuilder::new() .with_memory_limit(500 * 1024, 1.0) @@ -1974,13 +1981,9 @@ mod tests { let batch_size = 50; // Small batch size to force multiple output batches let num_rows = 1000; // Create enough data for multiple batches - let task_ctx = Arc::new( - TaskContext::default().with_session_config( - SessionConfig::new() - .with_batch_size(batch_size) - .with_sort_in_place_threshold_bytes(usize::MAX), // Ensure we don't concat batches - ), - ); + let task_ctx = Arc::new(TaskContext::default().with_session_config( + SessionConfig::new().with_batch_size(batch_size), // Ensure we don't concat batches + )); let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); @@ -2381,11 +2384,8 @@ mod tests { let batch_size = 100; let create_task_ctx = |_: &[RecordBatch]| { - TaskContext::default().with_session_config( - SessionConfig::new() - .with_batch_size(batch_size) - .with_sort_in_place_threshold_bytes(usize::MAX), - ) + TaskContext::default() + .with_session_config(SessionConfig::new().with_batch_size(batch_size)) }; // Smaller than batch size and require more than a single batch to get the requested batch size @@ -2406,11 +2406,8 @@ mod tests { let batch_size = 100; let create_task_ctx = |_: &[RecordBatch]| { - TaskContext::default().with_session_config( - SessionConfig::new() - .with_batch_size(batch_size) - .with_sort_in_place_threshold_bytes(usize::MAX - 1), - ) + TaskContext::default() + .with_session_config(SessionConfig::new().with_batch_size(batch_size)) }; // Smaller than batch size and require more than a single batch to get the requested batch size @@ -2531,8 +2528,6 @@ mod tests { .with_session_config( SessionConfig::new() .with_batch_size(batch_size) - // To make sure there is no in place sorting - .with_sort_in_place_threshold_bytes(1) .with_sort_spill_reservation_bytes(1), ) .with_runtime( @@ -2842,7 +2837,7 @@ mod tests { [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(), 128, // batch_size sort_spill_reservation_bytes, - usize::MAX, // sort_in_place_threshold_bytes (high to avoid concat path) + 32768, // sort_coalesce_target_rows SpillCompression::Uncompressed, &metrics_set, Arc::clone(&runtime), diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index ab8a4a293234e..5f42bbe7ff3b0 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -269,6 +269,7 @@ datafusion.execution.skip_partial_aggregation_probe_ratio_threshold 0.8 datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 datafusion.execution.skip_physical_aggregate_schema_check false datafusion.execution.soft_max_rows_per_output_file 50000000 +datafusion.execution.sort_coalesce_target_rows 32768 datafusion.execution.sort_in_place_threshold_bytes 1048576 datafusion.execution.sort_pushdown_buffer_capacity 1073741824 datafusion.execution.sort_spill_reservation_bytes 10485760 @@ -414,6 +415,7 @@ datafusion.execution.skip_partial_aggregation_probe_ratio_threshold 0.8 Aggregat datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode datafusion.execution.skip_physical_aggregate_schema_check false When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max +datafusion.execution.sort_coalesce_target_rows 32768 Target number of rows to coalesce before sorting in ExternalSorter. Larger values give radix sort (used for primitives and strings) enough rows to amortize RowConverter encoding overhead. Under memory pressure the actual chunk size may be smaller. For schemas that are not eligible for radix sort (all-dictionary or nested types), the coalesce target falls back to `batch_size` regardless of this setting. datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. datafusion.execution.sort_pushdown_buffer_capacity 1073741824 Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). From d822a502ee1b371224d14ccc1ba3b47f89e00f20 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 15:02:12 -0400 Subject: [PATCH 03/25] Fix more tests. --- datafusion/core/tests/memory_limit/mod.rs | 15 +- datafusion/physical-plan/src/sorts/sort.rs | 155 +++++++++++++++++---- 2 files changed, 130 insertions(+), 40 deletions(-) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 912d92332fe9a..e0fb5682babdb 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -311,26 +311,21 @@ async fn sort_spill_reservation() { ] ); + // With low reservation, the sort should still succeed because + // the chunked sort pipeline eagerly sorts and the multi-level merge + // handles low merge memory by reducing fan-in. let config = base_config .clone() - // provide insufficient reserved space for merging, - // the sort will fail while trying to merge .with_sort_spill_reservation_bytes(1024); test.clone() - .with_expected_errors(vec![ - "Resources exhausted: Additional allocation failed", - "with top memory consumers (across reservations) as:", - "B for ExternalSorterMerge", - ]) + .with_expected_success() .with_config(config) .run() .await; let config = base_config - // reserve sufficient space up front for merge and this time, - // which will force the spills to happen with less buffered - // input and thus with enough to merge. + // reserve sufficient space up front for merge .with_sort_spill_reservation_bytes(mem_limit / 2); test.with_config(config).with_expected_success().run().await; diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index d6f8848194b50..990acdea347ca 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -340,10 +340,25 @@ impl ExternalSorter { return Ok(()); } + let input_rows = input.num_rows(); + let input_mem = get_record_batch_memory_size(&input); + eprintln!( + "[SORT] insert_batch: {} rows, mem={}, reservation_before={}", + input_rows, + input_mem, + self.reservation.size() + ); + self.reserve_memory_for_merge()?; self.reserve_memory_for_batch_and_maybe_spill(&input) .await?; + eprintln!( + "[SORT] after reserve: reservation={}, sorted_runs={}", + self.reservation.size(), + self.sorted_runs.len() + ); + let coalescer = self .coalescer .as_mut() @@ -355,6 +370,12 @@ impl ExternalSorter { // Sort any completed coalesced batches and store as runs self.drain_completed_batches()?; + eprintln!( + "[SORT] after drain: reservation={}, sorted_runs={}", + self.reservation.size(), + self.sorted_runs.len() + ); + Ok(()) } @@ -379,6 +400,15 @@ impl ExternalSorter { /// Uses radix sort when the batch is large enough to amortize encoding /// overhead (more than `batch_size` rows). Otherwise falls back to lexsort. fn sort_and_store_run(&mut self, batch: &RecordBatch) -> Result<()> { + let batch_rows = batch.num_rows(); + let batch_mem = get_record_batch_memory_size(batch); + eprintln!( + "[SORT] sort_and_store_run: {} rows, batch_mem={}, reservation={}", + batch_rows, + batch_mem, + self.reservation.size() + ); + let use_radix_for_this_batch = self.use_radix && batch.num_rows() > self.batch_size; @@ -395,25 +425,68 @@ impl ExternalSorter { vec![sort_batch(batch, &self.expr, None)?] }; + // GC StringView arrays to compact shared buffers from take(). + // Without this, sorted StringView batches reference all buffers + // from the coalesced input, inflating reported memory. + let sorted_chunks = Self::gc_stringview_batches(sorted_chunks)?; + self.sorted_runs.push(sorted_chunks); - // Resize reservation to match actual total of all sorted runs. - // After coalescing + sorting, the sorted runs may differ in size - // from the 2x reservations made for original input batches - // (e.g., StringView GC compacts buffers, slicing changes layout). + // Shrink reservation to match actual total of all sorted runs. let total_sorted_size: usize = self .sorted_runs .iter() .flat_map(|run| run.iter()) .map(get_record_batch_memory_size) .sum(); - self.reservation - .try_resize(total_sorted_size) - .map_err(Self::err_with_oom_context)?; + let new_size = total_sorted_size.min(self.reservation.size()); + self.reservation.shrink(self.reservation.size() - new_size); + + eprintln!( + "[SORT] after sort_and_store: total_sorted_size={}, new_reservation={}, runs={}", + total_sorted_size, + self.reservation.size(), + self.sorted_runs.len() + ); Ok(()) } + /// Compact StringView arrays in sorted batches to eliminate shared + /// buffer references from `take()`. Skips work if no StringView columns. + fn gc_stringview_batches(batches: Vec) -> Result> { + // Fast path: check schema for any StringView columns + if let Some(first) = batches.first() { + let has_stringview = first.schema().fields().iter().any(|f| { + matches!(f.data_type(), DataType::Utf8View | DataType::BinaryView) + }); + if !has_stringview { + return Ok(batches); + } + } + + let mut result = Vec::with_capacity(batches.len()); + for batch in batches { + let mut new_columns: Vec> = + Vec::with_capacity(batch.num_columns()); + let mut mutated = false; + for array in batch.columns() { + if let Some(sv) = array.as_any().downcast_ref::() { + new_columns.push(Arc::new(sv.gc())); + mutated = true; + } else { + new_columns.push(Arc::clone(array)); + } + } + if mutated { + result.push(RecordBatch::try_new(batch.schema(), new_columns)?); + } else { + result.push(batch); + } + } + Ok(result) + } + /// Flushes any partially accumulated rows from the coalescer, sorts them, /// and stores as a run. Called before spilling and at sort() time. fn flush_coalescer(&mut self) -> Result<()> { @@ -449,21 +522,27 @@ impl ExternalSorter { self.flush_coalescer()?; self.coalescer = None; + eprintln!( + "[SORT] sort(): spilled_before={}, has_sorted_runs={}, runs={}, reservation={}, merge_reservation={}", + self.spilled_before(), + self.has_sorted_runs(), + self.sorted_runs.len(), + self.reservation.size(), + self.merge_reservation.size() + ); + if self.spilled_before() { - // Merge and spill remaining sorted runs first. If there are many - // sorted runs and the memory limit is almost reached, merging - // them with the spilled files at the same time might cause OOM. if self.has_sorted_runs() { self.spill_sorted_runs().await?; } - // Transfer the pre-reserved merge memory to the streaming merge - // using `take()` instead of `new_empty()`. This ensures the merge - // stream starts with `sort_spill_reservation_bytes` already - // allocated, preventing starvation when concurrent sort partitions - // compete for pool memory. `take()` moves the bytes atomically - // without releasing them back to the pool, so other partitions - // cannot race to consume the freed memory. + eprintln!( + "[SORT] sort() spill path: reservation={}, merge_reservation={}, spill_files={}", + self.reservation.size(), + self.merge_reservation.size(), + self.finished_spill_files.len() + ); + StreamingMergeBuilder::new() .with_sorted_spill_files(std::mem::take(&mut self.finished_spill_files)) .with_spill_manager(self.spill_manager.clone()) @@ -475,10 +554,6 @@ impl ExternalSorter { .with_reservation(self.merge_reservation.take()) .build() } else { - // Release the memory reserved for merge back to the pool so - // there is some left when `merge_sorted_runs` requests an - // allocation. Only needed for the non-spill path; the spill - // path transfers the reservation to the merge stream instead. self.merge_reservation.free(); self.merge_sorted_runs(self.metrics.baseline.clone()) } @@ -694,6 +769,12 @@ impl ExternalSorter { ); // Reserve headroom for next sort/merge + eprintln!( + "[SORT] spill_sorted_runs end: reservation={}, merge_reservation={}, about to re-reserve {}", + self.reservation.size(), + self.merge_reservation.size(), + self.sort_spill_reservation_bytes + ); self.reserve_memory_for_merge()?; Ok(()) @@ -753,12 +834,14 @@ impl ExternalSorter { ))); } - // Multiple runs: create one stream per run and merge + // Multiple runs: create one stream per run and merge. + // The merge stream holds the main reservation and frees memory + // as runs are consumed. This ensures the RowCursorStream can + // allocate for row encoding from the freed memory. + let reservation_for_merge = self.reservation.take(); let streams = all_runs .into_iter() .map(|run| { - let run_size: usize = run.iter().map(get_record_batch_memory_size).sum(); - let reservation = self.reservation.split(run_size); let schema = Arc::clone(&self.schema); let intermediate_metrics = self.metrics.baseline.intermediate(); let output_rows = intermediate_metrics.output_rows().clone(); @@ -772,11 +855,8 @@ impl ExternalSorter { Err(e) => Err(e), } }); - let boxed: SendableRecordBatchStream = Box::pin(ReservationStream::new( - Arc::clone(&schema), - Box::pin(RecordBatchStreamAdapter::new(schema, stream)), - reservation, - )); + let boxed: SendableRecordBatchStream = + Box::pin(RecordBatchStreamAdapter::new(schema, stream)); Ok(spawn_buffered(boxed, 1)) }) .collect::>()?; @@ -788,7 +868,7 @@ impl ExternalSorter { .with_metrics(metrics) .with_batch_size(self.batch_size) .with_fetch(None) - .with_reservation(self.merge_reservation.new_empty()) + .with_reservation(reservation_for_merge) .build() } @@ -821,6 +901,11 @@ impl ExternalSorter { match self.reservation.try_grow(size) { Ok(_) => Ok(()), Err(e) => { + eprintln!( + "[SORT] reserve failed: need={}, reservation={}, spilling...", + size, + self.reservation.size() + ); // Flush coalescer: sort whatever's accumulated (even if partial) self.flush_coalescer()?; @@ -828,8 +913,18 @@ impl ExternalSorter { return Err(Self::err_with_oom_context(e)); } + eprintln!( + "[SORT] after flush: reservation={}, runs={}", + self.reservation.size(), + self.sorted_runs.len() + ); + // Spill sorted runs and try again. self.spill_sorted_runs().await?; + eprintln!( + "[SORT] after spill: reservation={}", + self.reservation.size() + ); self.reservation .try_grow(size) .map_err(Self::err_with_oom_context) From 54be4756d359b1c7fa48b76ba3e0732105e1a1a4 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 16:27:50 -0400 Subject: [PATCH 04/25] Tests pass. --- datafusion/core/tests/memory_limit/mod.rs | 4 +- datafusion/physical-plan/src/sorts/sort.rs | 127 ++++++++++++--------- 2 files changed, 71 insertions(+), 60 deletions(-) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index e0fb5682babdb..a3729cb9fca32 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -314,9 +314,7 @@ async fn sort_spill_reservation() { // With low reservation, the sort should still succeed because // the chunked sort pipeline eagerly sorts and the multi-level merge // handles low merge memory by reducing fan-in. - let config = base_config - .clone() - .with_sort_spill_reservation_bytes(1024); + let config = base_config.clone().with_sort_spill_reservation_bytes(1024); test.clone() .with_expected_success() diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 990acdea347ca..43c9e8c859b3e 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -531,7 +531,27 @@ impl ExternalSorter { self.merge_reservation.size() ); - if self.spilled_before() { + // Determine if we must take the spill path. + // + // We must spill if: + // 1. We already spilled during the insert phase, OR + // 2. We have multiple sorted runs but merge_reservation is 0. + // + // Case 2 matters because the in-memory merge needs to allocate + // cursor infrastructure (RowCursorStream / FieldCursorStream) + // at build time, before any run data is consumed. The cursor + // allocation comes from merge_reservation. If that's 0, the + // pool is fully occupied by sorted run data and the cursor + // can't allocate. Spilling to disk frees pool memory, and + // MultiLevelMerge handles the merge with dynamic fan-in — + // reading from spill files that don't hold pool memory. + let must_spill = self.spilled_before() + || (self.sorted_runs.len() > 1 && self.merge_reservation.size() == 0); + + if must_spill { + // Merge and spill remaining sorted runs first. If there are many + // sorted runs and the memory limit is almost reached, merging + // them with the spilled files at the same time might cause OOM. if self.has_sorted_runs() { self.spill_sorted_runs().await?; } @@ -554,6 +574,13 @@ impl ExternalSorter { .with_reservation(self.merge_reservation.take()) .build() } else { + // In-memory path: we have 0 runs, 1 run (no merge needed), + // or multiple runs with merge_reservation > 0 providing + // headroom for cursor allocation. + // + // Release merge_reservation back to the pool — in the + // non-spill path, merge_sorted_runs allocates cursor memory + // from the pool directly (freed merge_reservation bytes). self.merge_reservation.free(); self.merge_sorted_runs(self.metrics.baseline.clone()) } @@ -711,70 +738,43 @@ impl ExternalSorter { Ok(()) } - /// Merges the pre-sorted runs and writes the result to spill files. + /// Spills sorted runs to disk. + /// + /// Unlike the old pipeline where `in_mem_batches` held unsorted data + /// that required merging before spilling, our sorted runs are already + /// sorted. We can spill each run directly as its own spill file. + /// MultiLevelMerge handles the fan-in during the final merge. + /// + /// This also avoids the need to allocate merge cursor infrastructure + /// (RowCursorStream / FieldCursorStream) during the spill, which is + /// important when the pool has no headroom (sort_spill_reservation_bytes=0). async fn spill_sorted_runs(&mut self) -> Result<()> { assert_or_internal_err!( self.has_sorted_runs(), "sorted_runs must not be empty when attempting to spill" ); - // Release the memory reserved for merge back to the pool so - // there is some left when `merge_sorted_runs` requests an - // allocation. At the end of this function, memory will be - // reserved again for the next spill. - self.merge_reservation.free(); - - let mut sorted_stream = - self.merge_sorted_runs(self.metrics.baseline.intermediate())?; - // After `merge_sorted_runs()` is constructed, all `sorted_runs` are taken - // to construct a globally sorted stream. - assert_or_internal_err!( - self.sorted_runs.is_empty(), - "sorted_runs should be empty after constructing sorted stream" + eprintln!( + "[SORT] spill_sorted_runs: spilling {} runs individually", + self.sorted_runs.len() ); - // 'global' here refers to all buffered batches when the memory limit is - // reached. This variable will buffer the sorted batches after - // sort-preserving merge and incrementally append to spill files. - let mut globally_sorted_batches: Vec = vec![]; - - while let Some(batch) = sorted_stream.next().await { - let batch = batch?; - let sorted_size = get_reserved_bytes_for_record_batch(&batch)?; - if self.reservation.try_grow(sorted_size).is_err() { - // Although the reservation is not enough, the batch is - // already in memory, so it's okay to combine it with previously - // sorted batches, and spill together. - globally_sorted_batches.push(batch); - self.consume_and_spill_append(&mut globally_sorted_batches) - .await?; // reservation is freed in spill() - } else { - globally_sorted_batches.push(batch); - } - } - - // Drop early to free up memory reserved by the sorted stream, otherwise the - // upcoming `self.reserve_memory_for_merge()` may fail due to insufficient memory. - drop(sorted_stream); - - self.consume_and_spill_append(&mut globally_sorted_batches) - .await?; - self.spill_finish().await?; - // Sanity check after spilling - let buffers_cleared_property = - self.sorted_runs.is_empty() && globally_sorted_batches.is_empty(); - assert_or_internal_err!( - buffers_cleared_property, - "sorted_runs and globally_sorted_batches should be cleared before" - ); + let all_runs = std::mem::take(&mut self.sorted_runs); + for run in all_runs { + let mut batches = run; + Self::organize_stringview_arrays(&mut batches)?; + self.consume_and_spill_append(&mut batches).await?; + self.spill_finish().await?; + } - // Reserve headroom for next sort/merge eprintln!( "[SORT] spill_sorted_runs end: reservation={}, merge_reservation={}, about to re-reserve {}", self.reservation.size(), self.merge_reservation.size(), self.sort_spill_reservation_bytes ); + + // Reserve headroom for next sort/merge self.reserve_memory_for_merge()?; Ok(()) @@ -835,10 +835,17 @@ impl ExternalSorter { } // Multiple runs: create one stream per run and merge. - // The merge stream holds the main reservation and frees memory - // as runs are consumed. This ensures the RowCursorStream can - // allocate for row encoding from the freed memory. - let reservation_for_merge = self.reservation.take(); + // + // Memory model for the multi-run merge: + // - self.reservation holds the sorted run data. It stays allocated + // for the lifetime of the ExternalSorter (freed on drop). This + // over-reserves as runs are consumed, but is conservative/safe. + // - The merge cursor (RowCursorStream/FieldCursorStream) allocates + // from a new_empty() reservation, drawing from pool headroom + // freed by merge_reservation.free() in the caller. + // - This works because sort() only enters this path when + // merge_reservation > 0, guaranteeing pool headroom for cursors. + // When merge_reservation == 0, sort() takes the spill path instead. let streams = all_runs .into_iter() .map(|run| { @@ -861,15 +868,21 @@ impl ExternalSorter { }) .collect::>()?; - StreamingMergeBuilder::new() + let result = StreamingMergeBuilder::new() .with_streams(streams) .with_schema(Arc::clone(&self.schema)) .with_expressions(&self.expr.clone()) .with_metrics(metrics) .with_batch_size(self.batch_size) .with_fetch(None) - .with_reservation(reservation_for_merge) - .build() + .with_reservation(self.reservation.new_empty()) + .build(); + + if let Err(ref e) = result { + eprintln!("[SORT] merge_sorted_runs build failed: {e}"); + } + + result } /// If this sort may spill, pre-allocates From a42ba494b5cd19cc57dedcae97badb79880d43e0 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 16:37:11 -0400 Subject: [PATCH 05/25] Cleanup. --- datafusion/physical-plan/src/sorts/sort.rs | 172 ++++----------------- 1 file changed, 27 insertions(+), 145 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 43c9e8c859b3e..705733646b963 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -41,7 +41,6 @@ use crate::projection::{ProjectionExec, make_with_child, update_ordering}; use crate::sorts::IncrementalSortIterator; use crate::sorts::streaming_merge::{SortedSpillFile, StreamingMergeBuilder}; use crate::spill::get_record_batch_memory_size; -use crate::spill::in_progress_spill_file::InProgressSpillFile; use crate::spill::spill_manager::{GetSlicedSize, SpillManager}; use crate::stream::RecordBatchStreamAdapter; use crate::stream::ReservationStream; @@ -59,8 +58,7 @@ use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::config::SpillCompression; use datafusion_common::tree_node::TreeNodeRecursion; use datafusion_common::{ - DataFusionError, Result, assert_or_internal_err, internal_datafusion_err, - unwrap_or_internal_err, + DataFusionError, Result, assert_or_internal_err, unwrap_or_internal_err, }; use datafusion_execution::TaskContext; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; @@ -70,7 +68,7 @@ use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; use futures::{StreamExt, TryStreamExt}; -use log::{debug, trace}; +use log::trace; struct ExternalSorterMetrics { /// metrics @@ -232,13 +230,6 @@ struct ExternalSorter { /// `Vec` is a single sorted run produced by sorting one coalesced batch. sorted_runs: Vec>, - /// During external sorting, in-memory intermediate data will be appended to - /// this file incrementally. Once finished, this file will be moved to [`Self::finished_spill_files`]. - /// - /// this is a tuple of: - /// 1. `InProgressSpillFile` - the file that is being written to - /// 2. `max_record_batch_memory` - the maximum memory usage of a single batch in this spill file. - in_progress_spill_file: Option<(InProgressSpillFile, usize)>, /// If data has previously been spilled, the locations of the spill files (in /// Arrow IPC format) /// Within the same spill file, the data might be chunked into multiple batches, @@ -316,7 +307,6 @@ impl ExternalSorter { schema, coalescer: Some(coalescer), sorted_runs: vec![], - in_progress_spill_file: None, finished_spill_files: vec![], expr, metrics, @@ -549,9 +539,8 @@ impl ExternalSorter { || (self.sorted_runs.len() > 1 && self.merge_reservation.size() == 0); if must_spill { - // Merge and spill remaining sorted runs first. If there are many - // sorted runs and the memory limit is almost reached, merging - // them with the spilled files at the same time might cause OOM. + // Spill remaining sorted runs. Since runs are already sorted, + // each is written directly as its own spill file (no merge needed). if self.has_sorted_runs() { self.spill_sorted_runs().await?; } @@ -612,132 +601,6 @@ impl ExternalSorter { self.metrics.spill_metrics.spill_file_count.value() } - /// Appending globally sorted batches to the in-progress spill file, and clears - /// the `globally_sorted_batches` (also its memory reservation) afterwards. - async fn consume_and_spill_append( - &mut self, - globally_sorted_batches: &mut Vec, - ) -> Result<()> { - if globally_sorted_batches.is_empty() { - return Ok(()); - } - - // Lazily initialize the in-progress spill file - if self.in_progress_spill_file.is_none() { - self.in_progress_spill_file = - Some((self.spill_manager.create_in_progress_file("Sorting")?, 0)); - } - - Self::organize_stringview_arrays(globally_sorted_batches)?; - - debug!("Spilling sort data of ExternalSorter to disk whilst inserting"); - - let batches_to_spill = std::mem::take(globally_sorted_batches); - self.reservation.free(); - - let (in_progress_file, max_record_batch_size) = - self.in_progress_spill_file.as_mut().ok_or_else(|| { - internal_datafusion_err!("In-progress spill file should be initialized") - })?; - - for batch in batches_to_spill { - in_progress_file.append_batch(&batch)?; - - *max_record_batch_size = - (*max_record_batch_size).max(batch.get_sliced_size()?); - } - - assert_or_internal_err!( - globally_sorted_batches.is_empty(), - "This function consumes globally_sorted_batches, so it should be empty after taking." - ); - - Ok(()) - } - - /// Finishes the in-progress spill file and moves it to the finished spill files. - async fn spill_finish(&mut self) -> Result<()> { - let (mut in_progress_file, max_record_batch_memory) = - self.in_progress_spill_file.take().ok_or_else(|| { - internal_datafusion_err!("Should be called after `spill_append`") - })?; - let spill_file = in_progress_file.finish()?; - - if let Some(spill_file) = spill_file { - self.finished_spill_files.push(SortedSpillFile { - file: spill_file, - max_record_batch_memory, - }); - } - - Ok(()) - } - - /// Reconstruct `globally_sorted_batches` to organize the payload buffers of each - /// `StringViewArray` in sequential order by calling `gc()` on them. - /// - /// Note this is a workaround until is - /// available - /// - /// # Rationale - /// After (merge-based) sorting, all batches will be sorted into a single run, - /// but physically this sorted run is chunked into many small batches. For - /// `StringViewArray`s inside each sorted run, their inner buffers are not - /// re-constructed by default, leading to non-sequential payload locations - /// (permutated by `interleave()` Arrow kernel). A single payload buffer might - /// be shared by multiple `RecordBatch`es. - /// When writing each batch to disk, the writer has to write all referenced buffers, - /// because they have to be read back one by one to reduce memory usage. This - /// causes extra disk reads and writes, and potentially execution failure. - /// - /// # Example - /// Before sorting: - /// batch1 -> buffer1 - /// batch2 -> buffer2 - /// - /// sorted_batch1 -> buffer1 - /// -> buffer2 - /// sorted_batch2 -> buffer1 - /// -> buffer2 - /// - /// Then when spilling each batch, the writer has to write all referenced buffers - /// repeatedly. - fn organize_stringview_arrays( - globally_sorted_batches: &mut Vec, - ) -> Result<()> { - let mut organized_batches = Vec::with_capacity(globally_sorted_batches.len()); - - for batch in globally_sorted_batches.drain(..) { - let mut new_columns: Vec> = - Vec::with_capacity(batch.num_columns()); - - let mut arr_mutated = false; - for array in batch.columns() { - if let Some(string_view_array) = - array.as_any().downcast_ref::() - { - let new_array = string_view_array.gc(); - new_columns.push(Arc::new(new_array)); - arr_mutated = true; - } else { - new_columns.push(Arc::clone(array)); - } - } - - let organized_batch = if arr_mutated { - RecordBatch::try_new(batch.schema(), new_columns)? - } else { - batch - }; - - organized_batches.push(organized_batch); - } - - *globally_sorted_batches = organized_batches; - - Ok(()) - } - /// Spills sorted runs to disk. /// /// Unlike the old pipeline where `in_mem_batches` held unsorted data @@ -761,10 +624,29 @@ impl ExternalSorter { let all_runs = std::mem::take(&mut self.sorted_runs); for run in all_runs { - let mut batches = run; - Self::organize_stringview_arrays(&mut batches)?; - self.consume_and_spill_append(&mut batches).await?; - self.spill_finish().await?; + let run_size: usize = run.iter().map(get_record_batch_memory_size).sum(); + + // Initialize a new spill file for this run + let mut in_progress = + self.spill_manager.create_in_progress_file("Sorting")?; + let mut max_batch_memory = 0usize; + for batch in &run { + in_progress.append_batch(batch)?; + max_batch_memory = max_batch_memory.max(batch.get_sliced_size()?); + } + + let spill_file = in_progress.finish()?; + if let Some(spill_file) = spill_file { + self.finished_spill_files.push(SortedSpillFile { + file: spill_file, + max_record_batch_memory: max_batch_memory, + }); + } + + // Drop the run data and release its reservation + drop(run); + self.reservation + .shrink(run_size.min(self.reservation.size())); } eprintln!( From 24dbdb7ca1be567351d1d07c6f008c00773307fb Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 16:50:11 -0400 Subject: [PATCH 06/25] More cleanup. --- datafusion/physical-plan/src/sorts/sort.rs | 283 ++++++--------------- 1 file changed, 77 insertions(+), 206 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 705733646b963..df8d018e03433 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -93,115 +93,82 @@ impl ExternalSorterMetrics { /// /// # Algorithm /// -/// 1. get a non-empty new batch from input +/// Incoming batches are coalesced via [`BatchCoalescer`] to a target +/// row count before sorting. For radix-eligible schemas (primitives, +/// strings) the target is `sort_coalesce_target_rows` (default 32768); +/// for non-radix schemas (all-dictionary, nested types) it falls back +/// to `batch_size`. This gives sort kernels enough rows to amortize +/// overhead — radix sort is 2-3x faster than lexsort at 32K+ rows +/// but slower at small batch sizes. /// -/// 2. check with the memory manager there is sufficient space to -/// buffer the batch in memory. +/// Each coalesced batch is sorted immediately (radix or lexsort) and +/// stored as a pre-sorted run. Under memory pressure the coalescer +/// flushes early, producing smaller runs that fall back to lexsort. /// -/// 2.1 if memory is sufficient, buffer batch in memory, go to 1. +/// 1. For each incoming batch: +/// - Reserve memory (2x batch size). If reservation fails, flush +/// the coalescer, spill all sorted runs to disk, then retry. +/// - Push batch into the coalescer. +/// - If the coalescer reached its target: sort the coalesced batch +/// and store as a new sorted run. /// -/// 2.2 if no more memory is available, sort all buffered batches and -/// spill to file. buffer the next batch in memory, go to 1. -/// -/// 3. when input is exhausted, merge all in memory batches and spills -/// to get a total order. +/// 2. When input is exhausted, merge all sorted runs (and any spill +/// files) to produce a total order. /// /// # When data fits in available memory /// -/// If there is sufficient memory, data is sorted in memory to produce the output +/// Sorted runs are merged in memory using a loser-tree k-way merge +/// (via [`StreamingMergeBuilder`]). /// /// ```text -/// ┌─────┐ -/// │ 2 │ -/// │ 3 │ -/// │ 1 │─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ -/// │ 4 │ -/// │ 2 │ │ -/// └─────┘ ▼ -/// ┌─────┐ -/// │ 1 │ In memory -/// │ 4 │─ ─ ─ ─ ─ ─▶ sort/merge ─ ─ ─ ─ ─▶ total sorted output -/// │ 1 │ -/// └─────┘ ▲ -/// ... │ -/// -/// ┌─────┐ │ -/// │ 4 │ -/// │ 3 │─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ -/// └─────┘ -/// -/// in_mem_batches +/// ┌──────────┐ ┌────────────┐ ┌──────┐ ┌────────────┐ +/// │ Incoming │────▶│ Batch │────▶│ Sort │────▶│ Sorted Run │ +/// │ Batches │ │ Coalescer │ │ │ │ (in memory)│ +/// └──────────┘ └────────────┘ └──────┘ └─────┬──────┘ +/// │ +/// ┌──────────────┘ +/// ▼ +/// k-way merge (loser tree) +/// │ +/// ▼ +/// total sorted output /// ``` /// /// # When data does not fit in available memory /// -/// When memory is exhausted, data is first sorted and written to one -/// or more spill files on disk: -/// -/// ```text -/// ┌─────┐ .─────────────────. -/// │ 2 │ ( ) -/// │ 3 │ │`─────────────────'│ -/// │ 1 │─ ─ ─ ─ ─ ─ ─ │ ┌────┐ │ -/// │ 4 │ │ │ │ 1 │░ │ -/// │ 2 │ │ │... │░ │ -/// └─────┘ ▼ │ │ 4 │░ ┌ ─ ─ │ -/// ┌─────┐ │ └────┘░ 1 │░ │ -/// │ 1 │ In memory │ ░░░░░░ │ ░░ │ -/// │ 4 │─ ─ ▶ sort/merge ─ ─ ─ ─ ┼ ─ ─ ─ ─ ─▶ ... │░ │ -/// │ 1 │ and write to file │ │ ░░ │ -/// └─────┘ │ 4 │░ │ -/// ... ▲ │ └░─░─░░ │ -/// │ │ ░░░░░░ │ -/// ┌─────┐ │.─────────────────.│ -/// │ 4 │ │ ( ) -/// │ 3 │─ ─ ─ ─ ─ ─ ─ `─────────────────' -/// └─────┘ -/// -/// in_mem_batches spills -/// (file on disk in Arrow -/// IPC format) -/// ``` -/// -/// Once the input is completely read, the spill files are read and -/// merged with any in memory batches to produce a single total sorted -/// output: +/// When memory is exhausted, sorted runs are spilled directly to disk +/// (one spill file per run — no merge needed since runs are already +/// sorted). [`MultiLevelMerge`] handles the final merge from disk +/// with dynamic fan-in. /// /// ```text -/// .─────────────────. -/// ( ) -/// │`─────────────────'│ -/// │ ┌────┐ │ -/// │ │ 1 │░ │ -/// │ │... │─ ─ ─ ─ ─ ─│─ ─ ─ ─ ─ ─ -/// │ │ 4 │░ ┌────┐ │ │ -/// │ └────┘░ │ 1 │░ │ ▼ -/// │ ░░░░░░ │ │░ │ -/// │ │... │─ ─│─ ─ ─ ▶ merge ─ ─ ─▶ total sorted output -/// │ │ │░ │ -/// │ │ 4 │░ │ ▲ -/// │ └────┘░ │ │ -/// │ ░░░░░░ │ -/// │.─────────────────.│ │ -/// ( ) -/// `─────────────────' │ -/// spills +/// ┌──────────┐ ┌────────────┐ ┌──────┐ ┌────────────┐ +/// │ Incoming │────▶│ Batch │────▶│ Sort │────▶│ Sorted Run │ +/// │ Batches │ │ Coalescer │ │ │ │ │ +/// └──────────┘ └────────────┘ └──────┘ └─────┬──────┘ +/// │ +/// memory pressure ◀──────────────┘ /// │ -/// +/// ▼ +/// .─────────────────. +/// ( Spill to disk ) +/// │ (one file/run) │ +/// `─────────────────' /// │ +/// ┌───────────────────┘ +/// ▼ +/// MultiLevelMerge (dynamic fan-in) +/// │ +/// ▼ +/// total sorted output +/// ``` /// -/// ┌─────┐ │ -/// │ 1 │ -/// │ 4 │─ ─ ─ ─ │ -/// └─────┘ │ -/// ... In memory -/// └ ─ ─ ─▶ sort/merge -/// ┌─────┐ -/// │ 4 │ ▲ -/// │ 3 │─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ -/// └─────┘ +/// # Graceful degradation /// -/// in_mem_batches +/// The coalesce target (32K rows) is aspirational. Under memory +/// pressure, chunk sizes shrink and radix sort amortizes less — at +/// `batch_size` or below, the pipeline falls back to lexsort, +/// matching the old per-batch sort behavior. /// ``` struct ExternalSorter { // ======================================================================== @@ -330,25 +297,10 @@ impl ExternalSorter { return Ok(()); } - let input_rows = input.num_rows(); - let input_mem = get_record_batch_memory_size(&input); - eprintln!( - "[SORT] insert_batch: {} rows, mem={}, reservation_before={}", - input_rows, - input_mem, - self.reservation.size() - ); - self.reserve_memory_for_merge()?; self.reserve_memory_for_batch_and_maybe_spill(&input) .await?; - eprintln!( - "[SORT] after reserve: reservation={}, sorted_runs={}", - self.reservation.size(), - self.sorted_runs.len() - ); - let coalescer = self .coalescer .as_mut() @@ -357,15 +309,8 @@ impl ExternalSorter { .push_batch(input) .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; - // Sort any completed coalesced batches and store as runs self.drain_completed_batches()?; - eprintln!( - "[SORT] after drain: reservation={}, sorted_runs={}", - self.reservation.size(), - self.sorted_runs.len() - ); - Ok(()) } @@ -390,54 +335,31 @@ impl ExternalSorter { /// Uses radix sort when the batch is large enough to amortize encoding /// overhead (more than `batch_size` rows). Otherwise falls back to lexsort. fn sort_and_store_run(&mut self, batch: &RecordBatch) -> Result<()> { - let batch_rows = batch.num_rows(); - let batch_mem = get_record_batch_memory_size(batch); - eprintln!( - "[SORT] sort_and_store_run: {} rows, batch_mem={}, reservation={}", - batch_rows, - batch_mem, - self.reservation.size() - ); - let use_radix_for_this_batch = self.use_radix && batch.num_rows() > self.batch_size; let sorted_chunks = if use_radix_for_this_batch { - // Radix sort on large coalesced batch, chunk output to batch_size - sort_batch_chunked( - batch, - &self.expr, - self.batch_size, - true, // use_radix - )? + sort_batch_chunked(batch, &self.expr, self.batch_size, true)? } else { - // Lexsort (or small batch from degraded radix path) vec![sort_batch(batch, &self.expr, None)?] }; - // GC StringView arrays to compact shared buffers from take(). - // Without this, sorted StringView batches reference all buffers - // from the coalesced input, inflating reported memory. + // After take(), StringView arrays may reference shared buffers from + // multiple coalesced input batches, inflating reported memory size. + // GC compacts them so reservation tracking stays accurate. let sorted_chunks = Self::gc_stringview_batches(sorted_chunks)?; - self.sorted_runs.push(sorted_chunks); - // Shrink reservation to match actual total of all sorted runs. - let total_sorted_size: usize = self - .sorted_runs - .iter() - .flat_map(|run| run.iter()) - .map(get_record_batch_memory_size) - .sum(); - let new_size = total_sorted_size.min(self.reservation.size()); - self.reservation.shrink(self.reservation.size() - new_size); - - eprintln!( - "[SORT] after sort_and_store: total_sorted_size={}, new_reservation={}, runs={}", - total_sorted_size, - self.reservation.size(), - self.sorted_runs.len() + // The 2x reservation from input batches exceeds the 1x sorted output. + // Shrink to release the excess back to the pool. + let target = self.reservation.size().min( + self.sorted_runs + .iter() + .flat_map(|r| r.iter()) + .map(get_record_batch_memory_size) + .sum(), ); + self.reservation.shrink(self.reservation.size() - target); Ok(()) } @@ -508,19 +430,9 @@ impl ExternalSorter { /// 2. A combined streaming merge incorporating sorted runs /// and data from spill files on disk. async fn sort(&mut self) -> Result { - // Flush remaining data from the coalescer self.flush_coalescer()?; self.coalescer = None; - eprintln!( - "[SORT] sort(): spilled_before={}, has_sorted_runs={}, runs={}, reservation={}, merge_reservation={}", - self.spilled_before(), - self.has_sorted_runs(), - self.sorted_runs.len(), - self.reservation.size(), - self.merge_reservation.size() - ); - // Determine if we must take the spill path. // // We must spill if: @@ -545,18 +457,11 @@ impl ExternalSorter { self.spill_sorted_runs().await?; } - eprintln!( - "[SORT] sort() spill path: reservation={}, merge_reservation={}, spill_files={}", - self.reservation.size(), - self.merge_reservation.size(), - self.finished_spill_files.len() - ); - StreamingMergeBuilder::new() .with_sorted_spill_files(std::mem::take(&mut self.finished_spill_files)) .with_spill_manager(self.spill_manager.clone()) .with_schema(Arc::clone(&self.schema)) - .with_expressions(&self.expr.clone()) + .with_expressions(&self.expr) .with_metrics(self.metrics.baseline.clone()) .with_batch_size(self.batch_size) .with_fetch(None) @@ -617,11 +522,6 @@ impl ExternalSorter { "sorted_runs must not be empty when attempting to spill" ); - eprintln!( - "[SORT] spill_sorted_runs: spilling {} runs individually", - self.sorted_runs.len() - ); - let all_runs = std::mem::take(&mut self.sorted_runs); for run in all_runs { let run_size: usize = run.iter().map(get_record_batch_memory_size).sum(); @@ -649,14 +549,6 @@ impl ExternalSorter { .shrink(run_size.min(self.reservation.size())); } - eprintln!( - "[SORT] spill_sorted_runs end: reservation={}, merge_reservation={}, about to re-reserve {}", - self.reservation.size(), - self.merge_reservation.size(), - self.sort_spill_reservation_bytes - ); - - // Reserve headroom for next sort/merge self.reserve_memory_for_merge()?; Ok(()) @@ -750,21 +642,15 @@ impl ExternalSorter { }) .collect::>()?; - let result = StreamingMergeBuilder::new() + StreamingMergeBuilder::new() .with_streams(streams) .with_schema(Arc::clone(&self.schema)) - .with_expressions(&self.expr.clone()) + .with_expressions(&self.expr) .with_metrics(metrics) .with_batch_size(self.batch_size) .with_fetch(None) .with_reservation(self.reservation.new_empty()) - .build(); - - if let Err(ref e) = result { - eprintln!("[SORT] merge_sorted_runs build failed: {e}"); - } - - result + .build() } /// If this sort may spill, pre-allocates @@ -796,30 +682,15 @@ impl ExternalSorter { match self.reservation.try_grow(size) { Ok(_) => Ok(()), Err(e) => { - eprintln!( - "[SORT] reserve failed: need={}, reservation={}, spilling...", - size, - self.reservation.size() - ); - // Flush coalescer: sort whatever's accumulated (even if partial) + // Sort whatever the coalescer has accumulated, then spill + // all sorted runs to disk to free pool memory. self.flush_coalescer()?; if !self.has_sorted_runs() { return Err(Self::err_with_oom_context(e)); } - eprintln!( - "[SORT] after flush: reservation={}, runs={}", - self.reservation.size(), - self.sorted_runs.len() - ); - - // Spill sorted runs and try again. self.spill_sorted_runs().await?; - eprintln!( - "[SORT] after spill: reservation={}", - self.reservation.size() - ); self.reservation .try_grow(size) .map_err(Self::err_with_oom_context) From e83ffc6c99846e315fb221968e351b543a055908 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 16:56:36 -0400 Subject: [PATCH 07/25] More tests. --- datafusion/physical-plan/src/sorts/sort.rs | 161 +++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index df8d018e03433..092a4148456a9 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -3038,4 +3038,165 @@ mod tests { } } } + + /// Helper to create an ExternalSorter for testing + fn test_sorter( + schema: SchemaRef, + expr: LexOrdering, + batch_size: usize, + sort_coalesce_target_rows: usize, + pool: Arc, + ) -> Result { + let runtime = RuntimeEnvBuilder::new() + .with_memory_pool(pool) + .build_arc()?; + let metrics_set = ExecutionPlanMetricsSet::new(); + ExternalSorter::new( + 0, + schema, + expr, + batch_size, + 10 * 1024 * 1024, + sort_coalesce_target_rows, + SpillCompression::Uncompressed, + &metrics_set, + runtime, + ) + } + + /// Collect sorted output and verify ascending order on column 0. + async fn collect_and_verify_sorted( + sorter: &mut ExternalSorter, + ) -> Result> { + let schema = Arc::clone(&sorter.schema); + let stream = sorter.sort().await?; + let batches: Vec = stream.try_collect().await?; + let merged = concat_batches(&schema, &batches)?; + if merged.num_rows() > 1 { + let col = merged.column(0).as_primitive::(); + for i in 1..col.len() { + assert!( + col.value(i - 1) <= col.value(i), + "Not sorted at index {i}: {} > {}", + col.value(i - 1), + col.value(i) + ); + } + } + Ok(batches) + } + + /// Radix-eligible batches are coalesced to `sort_coalesce_target_rows` + /// and chunked back to `batch_size` after sorting. + #[tokio::test] + async fn test_chunked_sort_radix_coalescing() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); + let expr: LexOrdering = + [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + + let pool: Arc = + Arc::new(GreedyMemoryPool::new(100 * 1024 * 1024)); + let mut sorter = test_sorter(Arc::clone(&schema), expr, 8192, 32768, pool)?; + + // 8 batches × 8192 rows = 65536 rows → 2 coalesced chunks of 32K + for i in 0..8 { + let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(values))], + )?; + sorter.insert_batch(batch).await?; + } + + assert_eq!(sorter.sorted_runs.len(), 2); + // 32K rows / 8K batch_size = 4 chunks per run + assert_eq!(sorter.sorted_runs[0].len(), 4); + assert_eq!(sorter.sorted_runs[1].len(), 4); + + let batches = collect_and_verify_sorted(&mut sorter).await?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 65536); + + Ok(()) + } + + /// When sort() is called before the coalesce target is reached, + /// the partial coalescer contents are flushed and sorted. + #[tokio::test] + async fn test_chunked_sort_partial_flush() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); + let expr: LexOrdering = + [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + + let pool: Arc = + Arc::new(GreedyMemoryPool::new(100 * 1024 * 1024)); + let mut sorter = test_sorter(Arc::clone(&schema), expr, 8192, 32768, pool)?; + + // 2 batches × 8192 = 16384 rows (below 32K target) + for i in 0..2 { + let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(values))], + )?; + sorter.insert_batch(batch).await?; + } + + // Data is in the coalescer, not yet sorted into runs + assert_eq!(sorter.sorted_runs.len(), 0); + + let batches = collect_and_verify_sorted(&mut sorter).await?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 16384); + + Ok(()) + } + + /// Spilling writes one spill file per sorted run (no merge before spill). + #[tokio::test] + async fn test_spill_creates_one_file_per_run() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); + let expr: LexOrdering = + [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + + let pool: Arc = Arc::new(GreedyMemoryPool::new(500 * 1024)); + let runtime = RuntimeEnvBuilder::new() + .with_memory_pool(pool) + .build_arc()?; + let metrics_set = ExecutionPlanMetricsSet::new(); + let mut sorter = ExternalSorter::new( + 0, + Arc::clone(&schema), + expr, + 8192, + 50 * 1024, // sort_spill_reservation + 8192, // coalesce to batch_size → 1 run per batch + SpillCompression::Uncompressed, + &metrics_set, + runtime, + )?; + + for i in 0..20 { + let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(values))], + )?; + sorter.insert_batch(batch).await?; + } + + assert!(sorter.spilled_before()); + // Each run spills as its own file (not merged into one) + assert!( + sorter.spill_count() > 1, + "Expected multiple spill files, got {}", + sorter.spill_count() + ); + + let batches = collect_and_verify_sorted(&mut sorter).await?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 20 * 8192); + + Ok(()) + } } From 60fbdfb9c11898af795ed3c76f8e78e7104994cd Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 17:25:51 -0400 Subject: [PATCH 08/25] More tests. --- datafusion/physical-plan/src/sorts/sort.rs | 126 +++++++++++++++++---- 1 file changed, 107 insertions(+), 19 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 092a4148456a9..4893288485da1 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -508,33 +508,47 @@ impl ExternalSorter { /// Spills sorted runs to disk. /// - /// Unlike the old pipeline where `in_mem_batches` held unsorted data - /// that required merging before spilling, our sorted runs are already - /// sorted. We can spill each run directly as its own spill file. - /// MultiLevelMerge handles the fan-in during the final merge. + /// Two strategies depending on available merge headroom: /// - /// This also avoids the need to allocate merge cursor infrastructure - /// (RowCursorStream / FieldCursorStream) during the spill, which is - /// important when the pool has no headroom (sort_spill_reservation_bytes=0). + /// - **With headroom** (`merge_reservation > 0`): merge all runs into + /// a single globally-sorted stream, then write to one spill file. + /// Fewer spill files = lower fan-in for the final MultiLevelMerge. + /// + /// - **Without headroom** (`merge_reservation == 0`): spill each run + /// as its own file. Avoids allocating merge cursor infrastructure + /// when the pool has no room. MultiLevelMerge handles the higher + /// fan-in with dynamic memory management. async fn spill_sorted_runs(&mut self) -> Result<()> { assert_or_internal_err!( self.has_sorted_runs(), "sorted_runs must not be empty when attempting to spill" ); - let all_runs = std::mem::take(&mut self.sorted_runs); - for run in all_runs { - let run_size: usize = run.iter().map(get_record_batch_memory_size).sum(); + if self.merge_reservation.size() > 0 && self.sorted_runs.len() > 1 { + // Free merge_reservation to provide pool headroom for the + // merge cursor allocation. Re-reserved at the end. + self.merge_reservation.free(); + + let mut sorted_stream = + self.merge_sorted_runs(self.metrics.baseline.intermediate())?; + assert_or_internal_err!( + self.sorted_runs.is_empty(), + "sorted_runs should be empty after constructing sorted stream" + ); - // Initialize a new spill file for this run let mut in_progress = self.spill_manager.create_in_progress_file("Sorting")?; let mut max_batch_memory = 0usize; - for batch in &run { - in_progress.append_batch(batch)?; + + while let Some(batch) = sorted_stream.next().await { + let batch = batch?; max_batch_memory = max_batch_memory.max(batch.get_sliced_size()?); + in_progress.append_batch(&batch)?; } + drop(sorted_stream); + self.reservation.free(); + let spill_file = in_progress.finish()?; if let Some(spill_file) = spill_file { self.finished_spill_files.push(SortedSpillFile { @@ -542,11 +556,32 @@ impl ExternalSorter { max_record_batch_memory: max_batch_memory, }); } + } else { + // No merge headroom or single run: spill each run directly. + let all_runs = std::mem::take(&mut self.sorted_runs); + for run in all_runs { + let run_size: usize = run.iter().map(get_record_batch_memory_size).sum(); + + let mut in_progress = + self.spill_manager.create_in_progress_file("Sorting")?; + let mut max_batch_memory = 0usize; + for batch in &run { + in_progress.append_batch(batch)?; + max_batch_memory = max_batch_memory.max(batch.get_sliced_size()?); + } - // Drop the run data and release its reservation - drop(run); - self.reservation - .shrink(run_size.min(self.reservation.size())); + let spill_file = in_progress.finish()?; + if let Some(spill_file) = spill_file { + self.finished_spill_files.push(SortedSpillFile { + file: spill_file, + max_record_batch_memory: max_batch_memory, + }); + } + + drop(run); + self.reservation + .shrink(run_size.min(self.reservation.size())); + } } self.reserve_memory_for_merge()?; @@ -3169,8 +3204,8 @@ mod tests { Arc::clone(&schema), expr, 8192, - 50 * 1024, // sort_spill_reservation - 8192, // coalesce to batch_size → 1 run per batch + 0, // no merge headroom → per-run spill path + 8192, // coalesce to batch_size → 1 run per batch SpillCompression::Uncompressed, &metrics_set, runtime, @@ -3199,4 +3234,57 @@ mod tests { Ok(()) } + + /// With merge headroom (sort_spill_reservation_bytes > 0), runs are + /// merged into a single sorted stream before spilling to one file. + #[tokio::test] + async fn test_spill_merges_runs_with_headroom() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); + let expr: LexOrdering = + [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + + // Pool sized to trigger spilling after a few coalesced runs but + // leave enough room for the merge-before-spill path to work. + // merge_reservation must cover merge cursor infrastructure (~131KB + // for i32 with spawn_buffered + SortPreservingMergeStream). + let pool: Arc = Arc::new(GreedyMemoryPool::new(600 * 1024)); + let runtime = RuntimeEnvBuilder::new() + .with_memory_pool(pool) + .build_arc()?; + let metrics_set = ExecutionPlanMetricsSet::new(); + let mut sorter = ExternalSorter::new( + 0, + Arc::clone(&schema), + expr, + 8192, + 200 * 1024, // merge headroom: enough for merge cursor infrastructure + 32768, + SpillCompression::Uncompressed, + &metrics_set, + runtime, + )?; + + for i in 0..20 { + let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(values))], + )?; + sorter.insert_batch(batch).await?; + } + + assert!(sorter.spilled_before()); + // Runs merged before spilling → fewer spill files than runs + let spill_count = sorter.spill_count(); + assert!( + spill_count > 0 && spill_count < 20, + "Expected merged spill files, got {spill_count}", + ); + + let batches = collect_and_verify_sorted(&mut sorter).await?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 20 * 8192); + + Ok(()) + } } From 8e8f77474c78e4b0522a55a786292d6b9e1532c6 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 18:07:16 -0400 Subject: [PATCH 09/25] Cleanup before pushing. --- ...spilling_fuzz_in_memory_constrained_env.rs | 8 ++- datafusion/physical-plan/src/sorts/sort.rs | 55 ++++++++++++++----- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs b/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs index d401557e966d6..5c677e68ecc37 100644 --- a/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs +++ b/datafusion/core/tests/fuzz_cases/spilling_fuzz_in_memory_constrained_env.rs @@ -78,10 +78,12 @@ async fn test_sort_with_limited_memory() -> Result<()> { }) .await?; - let total_spill_files_size = spill_count * record_batch_size; + // The chunked sort pipeline is more memory-efficient (shrinks + // reservations after sorting), so total spill size may be less than + // pool size. Just verify that spilling occurred. assert!( - total_spill_files_size > pool_size, - "Total spill files size {total_spill_files_size} should be greater than pool size {pool_size}", + spill_count > 0, + "Expected spilling under memory pressure, but spill_count was 0", ); Ok(()) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 4893288485da1..9e46bea5c036f 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -68,7 +68,7 @@ use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; use futures::{StreamExt, TryStreamExt}; -use log::trace; +use log::{debug, trace}; struct ExternalSorterMetrics { /// metrics @@ -197,6 +197,10 @@ struct ExternalSorter { /// `Vec` is a single sorted run produced by sorting one coalesced batch. sorted_runs: Vec>, + /// Running total of `get_record_batch_memory_size` across all sorted runs. + /// Updated incrementally to avoid O(n) recomputation on every sort. + sorted_runs_memory: usize, + /// If data has previously been spilled, the locations of the spill files (in /// Arrow IPC format) /// Within the same spill file, the data might be chunked into multiple batches, @@ -268,12 +272,20 @@ impl ExternalSorter { batch_size }; - let coalescer = BatchCoalescer::new(Arc::clone(&schema), coalesce_target_rows); + // For non-radix schemas (coalesce_target == batch_size), enable + // the large-batch bypass so incoming batches that are already + // batch_size rows pass through without copying. + let mut coalescer = + BatchCoalescer::new(Arc::clone(&schema), coalesce_target_rows); + if !use_radix { + coalescer = coalescer.with_biggest_coalesce_batch_size(Some(batch_size / 2)); + } Ok(Self { schema, coalescer: Some(coalescer), sorted_runs: vec![], + sorted_runs_memory: 0, finished_spill_files: vec![], expr, metrics, @@ -348,19 +360,23 @@ impl ExternalSorter { // multiple coalesced input batches, inflating reported memory size. // GC compacts them so reservation tracking stays accurate. let sorted_chunks = Self::gc_stringview_batches(sorted_chunks)?; + + let run_size: usize = + sorted_chunks.iter().map(get_record_batch_memory_size).sum(); self.sorted_runs.push(sorted_chunks); + self.sorted_runs_memory += run_size; // The 2x reservation from input batches exceeds the 1x sorted output. // Shrink to release the excess back to the pool. - let target = self.reservation.size().min( - self.sorted_runs - .iter() - .flat_map(|r| r.iter()) - .map(get_record_batch_memory_size) - .sum(), - ); + let target = self.sorted_runs_memory.min(self.reservation.size()); self.reservation.shrink(self.reservation.size() - target); + debug_assert_eq!( + self.reservation.size(), + self.sorted_runs_memory, + "reservation should track sorted_runs_memory after shrink" + ); + Ok(()) } @@ -525,6 +541,10 @@ impl ExternalSorter { ); if self.merge_reservation.size() > 0 && self.sorted_runs.len() > 1 { + debug!( + "Spilling {} sorted runs via merge to single file", + self.sorted_runs.len() + ); // Free merge_reservation to provide pool headroom for the // merge cursor allocation. Re-reserved at the end. self.merge_reservation.free(); @@ -557,8 +577,14 @@ impl ExternalSorter { }); } } else { - // No merge headroom or single run: spill each run directly. + // No merge headroom or single run: spill each run as its own + // file to avoid allocating merge cursor infrastructure. + debug!( + "Spilling {} sorted runs as individual files (no merge headroom)", + self.sorted_runs.len() + ); let all_runs = std::mem::take(&mut self.sorted_runs); + self.sorted_runs_memory = 0; for run in all_runs { let run_size: usize = run.iter().map(get_record_batch_memory_size).sum(); @@ -610,6 +636,7 @@ impl ExternalSorter { metrics: BaselineMetrics, ) -> Result { let all_runs = std::mem::take(&mut self.sorted_runs); + self.sorted_runs_memory = 0; if all_runs.is_empty() { return Ok(Box::pin(EmptyRecordBatchStream::new(Arc::clone( @@ -3133,7 +3160,7 @@ mod tests { Arc::new(GreedyMemoryPool::new(100 * 1024 * 1024)); let mut sorter = test_sorter(Arc::clone(&schema), expr, 8192, 32768, pool)?; - // 8 batches × 8192 rows = 65536 rows → 2 coalesced chunks of 32K + // 8 batches × 8192 rows = 65536 rows -> 2 coalesced chunks of 32K for i in 0..8 { let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); let batch = RecordBatch::try_new( @@ -3204,8 +3231,8 @@ mod tests { Arc::clone(&schema), expr, 8192, - 0, // no merge headroom → per-run spill path - 8192, // coalesce to batch_size → 1 run per batch + 0, // no merge headroom -> per-run spill path + 8192, // coalesce to batch_size -> 1 run per batch SpillCompression::Uncompressed, &metrics_set, runtime, @@ -3274,7 +3301,7 @@ mod tests { } assert!(sorter.spilled_before()); - // Runs merged before spilling → fewer spill files than runs + // Runs merged before spilling -> fewer spill files than runs let spill_count = sorter.spill_count(); assert!( spill_count > 0 && spill_count < 20, From aa93a6ebcc0886d7fc16a87659cd9aab4515c1ce Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 18:57:37 -0400 Subject: [PATCH 10/25] Fix CI failures. --- datafusion/core/tests/sql/runtime_config.rs | 2 +- datafusion/physical-plan/src/sorts/sort.rs | 32 ++++++--------------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/datafusion/core/tests/sql/runtime_config.rs b/datafusion/core/tests/sql/runtime_config.rs index cf5237d725805..2689aa21a0ef4 100644 --- a/datafusion/core/tests/sql/runtime_config.rs +++ b/datafusion/core/tests/sql/runtime_config.rs @@ -180,7 +180,7 @@ async fn test_invalid_memory_limit_when_limit_is_not_numeric() { async fn test_max_temp_directory_size_enforcement() { let ctx = SessionContext::new(); - ctx.sql("SET datafusion.runtime.memory_limit = '1M'") + ctx.sql("SET datafusion.runtime.memory_limit = '256K'") .await .unwrap() .collect() diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 9e46bea5c036f..8c00ce57b24af 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -138,7 +138,7 @@ impl ExternalSorterMetrics { /// /// When memory is exhausted, sorted runs are spilled directly to disk /// (one spill file per run — no merge needed since runs are already -/// sorted). [`MultiLevelMerge`] handles the final merge from disk +/// sorted). `MultiLevelMergeBuilder` handles the final merge from disk /// with dynamic fan-in. /// /// ```text @@ -169,7 +169,6 @@ impl ExternalSorterMetrics { /// pressure, chunk sizes shrink and radix sort amortizes less — at /// `batch_size` or below, the pipeline falls back to lexsort, /// matching the old per-batch sort behavior. -/// ``` struct ExternalSorter { // ======================================================================== // PROPERTIES: @@ -451,20 +450,10 @@ impl ExternalSorter { // Determine if we must take the spill path. // - // We must spill if: - // 1. We already spilled during the insert phase, OR - // 2. We have multiple sorted runs but merge_reservation is 0. - // - // Case 2 matters because the in-memory merge needs to allocate - // cursor infrastructure (RowCursorStream / FieldCursorStream) - // at build time, before any run data is consumed. The cursor - // allocation comes from merge_reservation. If that's 0, the - // pool is fully occupied by sorted run data and the cursor - // can't allocate. Spilling to disk frees pool memory, and - // MultiLevelMerge handles the merge with dynamic fan-in — - // reading from spill files that don't hold pool memory. - let must_spill = self.spilled_before() - || (self.sorted_runs.len() > 1 && self.merge_reservation.size() == 0); + // We must spill if we already spilled during the insert phase. + // The merge-from-disk path handles combining spill files with + // any remaining in-memory runs. + let must_spill = self.spilled_before(); if must_spill { // Spill remaining sorted runs. Since runs are already sorted, @@ -484,13 +473,10 @@ impl ExternalSorter { .with_reservation(self.merge_reservation.take()) .build() } else { - // In-memory path: we have 0 runs, 1 run (no merge needed), - // or multiple runs with merge_reservation > 0 providing - // headroom for cursor allocation. - // - // Release merge_reservation back to the pool — in the - // non-spill path, merge_sorted_runs allocates cursor memory - // from the pool directly (freed merge_reservation bytes). + // In-memory path: no prior spills. We have 0, 1, or multiple + // sorted runs. Release merge_reservation (if any) back to the + // pool — merge_sorted_runs allocates cursor memory from pool + // headroom directly. self.merge_reservation.free(); self.merge_sorted_runs(self.metrics.baseline.clone()) } From af514fe5048f1d87716d4452a5595c26e2d63b4d Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 19:07:06 -0400 Subject: [PATCH 11/25] Fix configs.md. --- docs/source/user-guide/configs.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index b828f0e793d47..2024b64c486fb 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -117,7 +117,8 @@ The following configuration settings are available: | datafusion.execution.skip_physical_aggregate_schema_check | false | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. | | datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | | datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | -| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. Deprecated: this option is no longer used. The sort pipeline now always coalesces batches before sorting. Use `sort_coalesce_target_rows` instead. | +| datafusion.execution.sort_coalesce_target_rows | 32768 | Target number of rows to coalesce before sorting in ExternalSorter. Larger values give radix sort (used for primitives and strings) enough rows to amortize RowConverter encoding overhead. Under memory pressure the actual chunk size may be smaller. For schemas that are not eligible for radix sort (all-dictionary or nested types), the coalesce target falls back to `batch_size` regardless of this setting. | | datafusion.execution.sort_pushdown_buffer_capacity | 1073741824 | Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. | | datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | | datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | From c5275534e5ea29611703fd2d7dd8fea25b5da2b2 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 19:52:46 -0400 Subject: [PATCH 12/25] Fix CI failures. --- datafusion/physical-plan/src/sorts/sort.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 8c00ce57b24af..dffe8568ad6b0 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -362,18 +362,30 @@ impl ExternalSorter { let run_size: usize = sorted_chunks.iter().map(get_record_batch_memory_size).sum(); + self.sorted_runs.push(sorted_chunks); self.sorted_runs_memory += run_size; - // The 2x reservation from input batches exceeds the 1x sorted output. - // Shrink to release the excess back to the pool. - let target = self.sorted_runs_memory.min(self.reservation.size()); - self.reservation.shrink(self.reservation.size() - target); + // Align reservation to actual sorted run memory with a single pool + // interaction. Normally 2x reservation > 1x sorted output, so we + // shrink. For tiny batches (single-digit rows), per-column buffer + // overhead can make the sorted output slightly larger — grow + // unconditionally since the memory is already allocated. This uses + // grow() which bypasses the pool limit, so the pool's tracked total + // may briefly exceed its limit by a small amount (tens of KB). + let reservation_size = self.reservation.size(); + if reservation_size > self.sorted_runs_memory { + self.reservation + .shrink(reservation_size - self.sorted_runs_memory); + } else if self.sorted_runs_memory > reservation_size { + self.reservation + .grow(self.sorted_runs_memory - reservation_size); + } debug_assert_eq!( self.reservation.size(), self.sorted_runs_memory, - "reservation should track sorted_runs_memory after shrink" + "reservation should track sorted_runs_memory after adjustment" ); Ok(()) From 83860c0c5fff83c4b800129215c81ea9fdc227f0 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 20:08:58 -0400 Subject: [PATCH 13/25] Avoid radix sort for decimal types for now. --- datafusion/physical-plan/src/sorts/sort.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index dffe8568ad6b0..95a869cc4c685 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -878,6 +878,12 @@ pub(super) fn use_radix_sort(data_types: &[&DataType]) -> bool { | DataType::Struct(_) | DataType::Map(_, _) | DataType::Union(_, _) => return false, + // Decimal128/256 row-encode to 17/33 bytes with poor prefix + // entropy (values cluster in narrow numeric ranges). MSD radix + // sort burns through prefix bytes doing minimal partitioning, + // then falls back to quicksort — paying radix overhead with no + // benefit over lexsort. + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => return false, _ => all_dict = false, } } @@ -2984,6 +2990,10 @@ mod tests { // Mixed dict + primitive -> radix assert!(use_radix_sort(&[&dict_type, &DataType::Int32])); + // Decimal -> lexsort (poor prefix entropy in row encoding) + assert!(!use_radix_sort(&[&DataType::Decimal128(38, 10)])); + assert!(!use_radix_sort(&[&DataType::Decimal256(76, 20)])); + // Empty -> no radix assert!(!use_radix_sort(&[])); } From a88eacf56548120faa4e5da55da2eaa6ea005bb6 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 20:28:35 -0400 Subject: [PATCH 14/25] Fix information_schema.slt test. --- datafusion/sqllogictest/test_files/information_schema.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 5f42bbe7ff3b0..f32c6d63b39e8 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -416,7 +416,7 @@ datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 Number datafusion.execution.skip_physical_aggregate_schema_check false When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max datafusion.execution.sort_coalesce_target_rows 32768 Target number of rows to coalesce before sorting in ExternalSorter. Larger values give radix sort (used for primitives and strings) enough rows to amortize RowConverter encoding overhead. Under memory pressure the actual chunk size may be smaller. For schemas that are not eligible for radix sort (all-dictionary or nested types), the coalesce target falls back to `batch_size` regardless of this setting. -datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. +datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. Deprecated: this option is no longer used. The sort pipeline now always coalesces batches before sorting. Use `sort_coalesce_target_rows` instead. datafusion.execution.sort_pushdown_buffer_capacity 1073741824 Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). datafusion.execution.spill_compression uncompressed Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. From 175dcf61f66f4fdb9b670c09709ab51ac9118c03 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Mon, 13 Apr 2026 20:43:00 -0400 Subject: [PATCH 15/25] Update to latest radix kernel from arrow-rs PR. --- datafusion/physical-plan/src/sorts/radix.rs | 131 +++++++++++++++----- 1 file changed, 99 insertions(+), 32 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/radix.rs b/datafusion/physical-plan/src/sorts/radix.rs index 377bad33a9d20..b0cb987b7df19 100644 --- a/datafusion/physical-plan/src/sorts/radix.rs +++ b/datafusion/physical-plan/src/sorts/radix.rs @@ -25,11 +25,10 @@ use arrow::row::{RowConverter, Rows, SortField}; use arrow_ord::sort::SortColumn; use std::sync::Arc; -/// 256-bucket histogram + scatter costs more than comparison sort at small n. -const FALLBACK_THRESHOLD: usize = 64; +/// Buckets smaller than this fall back to comparison sort. +const FALLBACK_THRESHOLD: usize = 32; -/// 8 bytes covers the discriminating prefix of most key layouts; deeper -/// recursion hits diminishing returns as buckets become sparse. +/// Maximum number of radix passes before falling back to comparison sort. const MAX_DEPTH: usize = 8; /// Sort row indices using MSD radix sort on row-encoded keys. @@ -59,60 +58,128 @@ pub(crate) fn radix_sort_to_indices( let n = rows.num_rows(); let mut indices: Vec = (0..n as u32).collect(); let mut temp = vec![0u32; n]; - msd_radix_sort(&mut indices, &mut temp, &rows, 0); + let mut bytes = vec![0u8; n]; + msd_radix_sort(&mut indices, &mut temp, &mut bytes, &rows, 0, true); Ok(UInt32Array::from(indices)) } -fn msd_radix_sort(indices: &mut [u32], temp: &mut [u32], rows: &Rows, byte_pos: usize) { - let n = indices.len(); +/// The byte at `offset` in the row, or 0 if past the end. +/// +/// Inline helper until `Row::byte_from` is available in released arrow-row. +#[inline(always)] +unsafe fn row_byte(rows: &Rows, idx: u32, byte_pos: usize) -> u8 { + let row = unsafe { rows.row_unchecked(idx as usize) }; + let data = row.data(); + if byte_pos < data.len() { + unsafe { *data.get_unchecked(byte_pos) } + } else { + 0 + } +} + +/// Row data starting at `offset`, or empty slice if past the end. +/// +/// Inline helper until `Row::data_from` is available in released arrow-row. +#[inline(always)] +unsafe fn row_data_from(rows: &Rows, idx: u32, byte_pos: usize) -> &[u8] { + let row = unsafe { rows.row_unchecked(idx as usize) }; + let data = row.data(); + if byte_pos <= data.len() { + unsafe { data.get_unchecked(byte_pos..) } + } else { + &[] + } +} + +/// MSD radix sort using ping-pong buffers. +/// +/// Each level scatters from `src` into `dst`, then recurses with the +/// roles swapped (dst becomes the next level's src). This avoids an +/// O(n) `copy_from_slice` at every recursion level. +/// +/// `result_in_src` tracks where the caller expects the sorted output: +/// true means `src`, false means `dst`. +fn msd_radix_sort( + src: &mut [u32], + dst: &mut [u32], + bytes: &mut [u8], + rows: &Rows, + byte_pos: usize, + result_in_src: bool, +) { + let n = src.len(); if n <= FALLBACK_THRESHOLD || byte_pos >= MAX_DEPTH { - indices.sort_unstable_by(|&a, &b| { - let ra = unsafe { rows.row_unchecked(a as usize) }; - let rb = unsafe { rows.row_unchecked(b as usize) }; - ra.cmp(&rb) - }); + // Compare only from byte_pos onward — earlier bytes are identical + // within this bucket, having already been discriminated by prior + // radix passes. + if result_in_src { + src.sort_unstable_by(|&a, &b| { + let ra = unsafe { row_data_from(rows, a, byte_pos) }; + let rb = unsafe { row_data_from(rows, b, byte_pos) }; + ra.cmp(rb) + }); + } else { + dst.copy_from_slice(src); + dst.sort_unstable_by(|&a, &b| { + let ra = unsafe { row_data_from(rows, a, byte_pos) }; + let rb = unsafe { row_data_from(rows, b, byte_pos) }; + ra.cmp(rb) + }); + } return; } + // Extract bytes and build histogram in one pass. The bytes buffer + // avoids chasing pointers through Rows a second time during scatter. + let bytes = &mut bytes[..n]; let mut counts = [0u32; 256]; - for &idx in &*indices { - let row = unsafe { rows.row_unchecked(idx as usize) }; - let byte = row.data().get(byte_pos).copied().unwrap_or(0); - counts[byte as usize] += 1; - } - - // Skip scatter when all rows share the same byte - if counts.iter().filter(|&&c| c > 0).count() == 1 { - msd_radix_sort(indices, temp, rows, byte_pos + 1); - return; + for (i, &idx) in src.iter().enumerate() { + let b = unsafe { row_byte(rows, idx, byte_pos) }; + bytes[i] = b; + counts[b as usize] += 1; } let mut offsets = [0u32; 257]; + let mut num_buckets = 0u32; for i in 0..256 { + num_buckets += (counts[i] > 0) as u32; offsets[i + 1] = offsets[i] + counts[i]; } - let temp = &mut temp[..n]; + // All rows share the same byte — no scatter needed, roles unchanged. + if num_buckets == 1 { + msd_radix_sort(src, dst, bytes, rows, byte_pos + 1, result_in_src); + return; + } + + // Scatter src → dst using the pre-extracted bytes let mut write_pos = offsets; - for &idx in &*indices { - let row = unsafe { rows.row_unchecked(idx as usize) }; - let byte = row.data().get(byte_pos).copied().unwrap_or(0) as usize; - temp[write_pos[byte] as usize] = idx; - write_pos[byte] += 1; + for (i, &idx) in src.iter().enumerate() { + let b = bytes[i] as usize; + dst[write_pos[b] as usize] = idx; + write_pos[b] += 1; } - indices.copy_from_slice(temp); + // Recurse with roles swapped: after scatter the data lives in dst, + // so dst becomes the next level's src. for bucket in 0..256 { let start = offsets[bucket] as usize; let end = offsets[bucket + 1] as usize; - if end - start > 1 { + let len = end - start; + if len > 1 { msd_radix_sort( - &mut indices[start..end], - &mut temp[start..end], + &mut dst[start..end], + &mut src[start..end], + &mut bytes[start..end], rows, byte_pos + 1, + !result_in_src, ); + } else if len == 1 && result_in_src { + // Single-element bucket: after scatter it's in dst, copy back + // if the caller expects the result in src. + src[start] = dst[start]; } } } From 4bdf8711b20c357b8ea85d6c80d353ee5ad84887 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 09:41:07 -0400 Subject: [PATCH 16/25] Use lexsort for single columns, radix otherwise. Should help with Q11 regression. --- .../core/tests/fuzz_cases/sort_query_fuzz.rs | 2 + datafusion/physical-plan/src/sorts/sort.rs | 259 +++++++++++++----- 2 files changed, 191 insertions(+), 70 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs index 578076955842b..b3753e1ed971c 100644 --- a/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs @@ -419,6 +419,8 @@ impl SortFuzzerTestGenerator { pub fn generate_random_query(&self, rng_seed: u64) -> (String, Option) { let mut rng = StdRng::seed_from_u64(rng_seed); + // Pick 1-3 ORDER BY columns. Single-column queries exercise the + // lexsort path; multi-column queries exercise the radix sort path. let num_columns = rng.random_range(1..=3).min(self.selected_columns.len()); let selected_columns: Vec<_> = self .selected_columns diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 95a869cc4c685..59af439687512 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -857,14 +857,15 @@ pub fn sort_batch( /// Returns true if radix sort should be used for the given sort column types. /// -/// Radix sort is faster for most multi-column sorts but falls back to -/// lexsort when: +/// Radix sort is faster for multi-column sorts but falls back to lexsort when: +/// - Only one sort column (`lexsort_to_indices` compares native arrays +/// directly without RowConverter encoding overhead) /// - All sort columns are dictionary-typed (long shared row prefixes /// waste radix passes before falling back to comparison sort) /// - Any sort column is a nested type (encoding cost is high and lexsort /// short-circuits comparison on leading columns) pub(super) fn use_radix_sort(data_types: &[&DataType]) -> bool { - if data_types.is_empty() { + if data_types.len() <= 1 { return false; } @@ -1442,8 +1443,8 @@ mod tests { use crate::filter_pushdown::{FilterPushdownPhase, PushedDown}; use crate::test; use crate::test::TestMemoryExec; + use crate::test::assert_is_pending; use crate::test::exec::{BlockingExec, assert_strong_count_converges_to_zero}; - use crate::test::{assert_is_pending, make_partition}; use arrow::array::*; use arrow::compute::{SortOptions, concat_batches}; @@ -1773,17 +1774,48 @@ mod tests { .with_runtime(runtime), ); - // The input has 200 partitions, each partition has a batch containing 100 rows. - // Each row has a single Utf8 column, the Utf8 string values are roughly 42 bytes. - // The total size of the input is roughly 820 KB. - let input = test::scan_partitioned_utf8(200); - let schema = input.schema(); + // Build 200 partitions, each with one 100-row batch containing a + // Utf8 column (~42 bytes per value) and an Int32 column. The second + // column ensures the radix sort path is used (use_radix_sort + // requires > 1 sort column), which keeps the coalesce target at + // sort_coalesce_target_rows instead of falling back to batch_size. + let schema = Arc::new(Schema::new(vec![ + Field::new("i", DataType::Utf8, true), + Field::new("j", DataType::Int32, true), + ])); + let partitions: Vec> = (0..200) + .map(|_| { + let strings: Vec = (0..100i32) + .map(|i| format!("test_long_string_that_is_roughly_42_bytes_{i}")) + .collect(); + let ints: Vec = (0..100).collect(); + let mut string_array = StringArray::from(strings); + string_array.shrink_to_fit(); + vec![ + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(string_array) as ArrayRef, + Arc::new(Int32Array::from(ints)) as ArrayRef, + ], + ) + .unwrap(), + ] + }) + .collect(); + let input = TestMemoryExec::try_new_exec(&partitions, Arc::clone(&schema), None)?; let sort_exec = Arc::new(SortExec::new( - [PhysicalSortExpr { - expr: col("i", &schema)?, - options: SortOptions::default(), - }] + [ + PhysicalSortExpr { + expr: col("i", &schema)?, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: col("j", &schema)?, + options: SortOptions::default(), + }, + ] .into(), Arc::new(CoalescePartitionsExec::new(input)), )); @@ -1803,22 +1835,16 @@ mod tests { let spilled_rows = metrics.spilled_rows().unwrap(); let spilled_bytes = metrics.spilled_bytes().unwrap(); - // This test case is processing 840KB of data using 400KB of memory. Note - // that buffered batches can't be dropped until all sorted batches are - // generated, so we can only buffer `sort_spill_reservation_bytes` of sorted - // batches. - // The number of spills is roughly calculated as: - // `number_of_batches / (sort_spill_reservation_bytes / batch_size)` - - // If this assertion fail with large spill count, make sure the following - // case does not happen: - // During external sorting, one sorted run should be spilled to disk in a - // single file, due to memory limit we might need to append to the file - // multiple times to spill all the data. Make sure we're not writing each - // appending as a separate file. - assert!((4..=8).contains(&spill_count)); - assert!((15000..=20000).contains(&spilled_rows)); - assert!((900000..=1000000).contains(&spilled_bytes)); + // This test case is processing ~840KB of data using ~400KB of memory. + // Note that buffered batches can't be dropped until all sorted batches + // are generated, so we can only buffer `sort_spill_reservation_bytes` + // of sorted batches. + assert!((2..=10).contains(&spill_count), "spill_count={spill_count}"); + assert!( + (10000..=20000).contains(&spilled_rows), + "spilled_rows={spilled_rows}" + ); + assert!(spilled_bytes > 0, "spilled_bytes={spilled_bytes}"); // Verify that the result is sorted let concated_result = concat_batches(&schema, &result)?; @@ -1912,7 +1938,11 @@ mod tests { SessionConfig::new().with_batch_size(batch_size), // Ensure we don't concat batches )); - let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + // Two columns so the radix path is used and sorted runs are chunked. + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])); // Create unsorted data let mut values: Vec = (0..num_rows).collect(); @@ -1920,16 +1950,25 @@ mod tests { let input_batch = RecordBatch::try_new( Arc::clone(&schema), - vec![Arc::new(Int32Array::from(values))], + vec![ + Arc::new(Int32Array::from(values.clone())), + Arc::new(Int32Array::from(values)), + ], )?; let batches = vec![input_batch]; let sort_exec = Arc::new(SortExec::new( - [PhysicalSortExpr { - expr: Arc::new(Column::new("a", 0)), - options: SortOptions::default(), - }] + [ + PhysicalSortExpr { + expr: Arc::new(Column::new("a", 0)), + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("b", 1)), + options: SortOptions::default(), + }, + ] .into(), TestMemoryExec::try_new_exec( std::slice::from_ref(&batches), @@ -2497,15 +2536,31 @@ mod tests { batch_size_to_generate: usize, create_task_ctx: impl Fn(&[RecordBatch]) -> TaskContext, ) -> Result { - let batches = (0..number_of_batches) - .map(|_| make_partition(batch_size_to_generate as i32)) - .collect::>(); + // Two-column batches so use_radix_sort returns true and sorted runs + // are chunked to batch_size, which these tests depend on. + let schema = Arc::new(Schema::new(vec![ + Field::new("i", DataType::Int32, true), + Field::new("j", DataType::Int32, true), + ])); + let batches: Vec = (0..number_of_batches) + .map(|_| { + let values: Vec = (0..batch_size_to_generate as i32).collect(); + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(values.clone())), + Arc::new(Int32Array::from(values)), + ], + ) + .unwrap() + }) + .collect(); let task_ctx = create_task_ctx(batches.as_slice()); let expected_batch_size = task_ctx.session_config().batch_size(); let (mut output_batches, metrics) = - run_sort_on_input(task_ctx, "i", batches).await?; + run_sort_on_input(task_ctx, &["i", "j"], batches).await?; let last_batch = output_batches.pop().unwrap(); @@ -2525,21 +2580,25 @@ mod tests { async fn run_sort_on_input( task_ctx: TaskContext, - order_by_col: &str, + order_by_cols: &[&str], batches: Vec, ) -> Result<(Vec, MetricsSet)> { let task_ctx = Arc::new(task_ctx); - // let task_ctx = env. let schema = batches[0].schema(); - let ordering: LexOrdering = [PhysicalSortExpr { - expr: col(order_by_col, &schema)?, - options: SortOptions { - descending: false, - nulls_first: true, - }, - }] - .into(); + let ordering = LexOrdering::new( + order_by_cols + .iter() + .map(|c| PhysicalSortExpr { + expr: col(c, &schema).unwrap(), + options: SortOptions { + descending: false, + nulls_first: true, + }, + }) + .collect::>(), + ) + .expect("non-empty ordering"); let sort_exec: Arc = Arc::new(SortExec::new( ordering.clone(), TestMemoryExec::try_new_exec(std::slice::from_ref(&batches), schema, None)?, @@ -2756,12 +2815,20 @@ mod tests { .build_arc()?; let metrics_set = ExecutionPlanMetricsSet::new(); - let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); + // Two columns so radix sort is used, matching the coalesce_target_rows config. + let schema = Arc::new(Schema::new(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + ])); let mut sorter = ExternalSorter::new( 0, Arc::clone(&schema), - [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(), + [ + PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0))), + PhysicalSortExpr::new_default(Arc::new(Column::new("y", 1))), + ] + .into(), 128, // batch_size sort_spill_reservation_bytes, 32768, // sort_coalesce_target_rows @@ -2776,7 +2843,10 @@ mod tests { let values: Vec = ((i * 100)..((i + 1) * 100)).rev().collect(); let batch = RecordBatch::try_new( Arc::clone(&schema), - vec![Arc::new(Int32Array::from(values))], + vec![ + Arc::new(Int32Array::from(values.clone())), + Arc::new(Int32Array::from(values)), + ], )?; sorter.insert_batch(batch).await?; } @@ -2974,8 +3044,8 @@ mod tests { #[test] fn test_use_radix_sort_heuristic() { - // Primitive columns -> radix - assert!(use_radix_sort(&[&DataType::Int32])); + // Single column -> lexsort (no RowConverter overhead) + assert!(!use_radix_sort(&[&DataType::Int32])); // All dictionary -> lexsort let dict_type = @@ -2987,6 +3057,9 @@ mod tests { DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); assert!(!use_radix_sort(&[&list_type])); + // Multi-column primitives -> radix + assert!(use_radix_sort(&[&DataType::Int32, &DataType::Int64])); + // Mixed dict + primitive -> radix assert!(use_radix_sort(&[&dict_type, &DataType::Int32])); @@ -2994,6 +3067,12 @@ mod tests { assert!(!use_radix_sort(&[&DataType::Decimal128(38, 10)])); assert!(!use_radix_sort(&[&DataType::Decimal256(76, 20)])); + // Multi-column with decimal -> lexsort + assert!(!use_radix_sort(&[ + &DataType::Int32, + &DataType::Decimal128(38, 10) + ])); + // Empty -> no radix assert!(!use_radix_sort(&[])); } @@ -3160,9 +3239,16 @@ mod tests { /// and chunked back to `batch_size` after sorting. #[tokio::test] async fn test_chunked_sort_radix_coalescing() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); - let expr: LexOrdering = - [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + // Two sort columns required so use_radix_sort returns true. + let schema = Arc::new(Schema::new(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + ])); + let expr: LexOrdering = [ + PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0))), + PhysicalSortExpr::new_default(Arc::new(Column::new("y", 1))), + ] + .into(); let pool: Arc = Arc::new(GreedyMemoryPool::new(100 * 1024 * 1024)); @@ -3173,7 +3259,10 @@ mod tests { let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); let batch = RecordBatch::try_new( Arc::clone(&schema), - vec![Arc::new(Int32Array::from(values))], + vec![ + Arc::new(Int32Array::from(values.clone())), + Arc::new(Int32Array::from(values)), + ], )?; sorter.insert_batch(batch).await?; } @@ -3194,9 +3283,16 @@ mod tests { /// the partial coalescer contents are flushed and sorted. #[tokio::test] async fn test_chunked_sort_partial_flush() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); - let expr: LexOrdering = - [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + // Two sort columns required so use_radix_sort returns true. + let schema = Arc::new(Schema::new(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + ])); + let expr: LexOrdering = [ + PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0))), + PhysicalSortExpr::new_default(Arc::new(Column::new("y", 1))), + ] + .into(); let pool: Arc = Arc::new(GreedyMemoryPool::new(100 * 1024 * 1024)); @@ -3207,7 +3303,10 @@ mod tests { let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); let batch = RecordBatch::try_new( Arc::clone(&schema), - vec![Arc::new(Int32Array::from(values))], + vec![ + Arc::new(Int32Array::from(values.clone())), + Arc::new(Int32Array::from(values)), + ], )?; sorter.insert_batch(batch).await?; } @@ -3225,9 +3324,16 @@ mod tests { /// Spilling writes one spill file per sorted run (no merge before spill). #[tokio::test] async fn test_spill_creates_one_file_per_run() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); - let expr: LexOrdering = - [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + // Two sort columns required so use_radix_sort returns true. + let schema = Arc::new(Schema::new(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + ])); + let expr: LexOrdering = [ + PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0))), + PhysicalSortExpr::new_default(Arc::new(Column::new("y", 1))), + ] + .into(); let pool: Arc = Arc::new(GreedyMemoryPool::new(500 * 1024)); let runtime = RuntimeEnvBuilder::new() @@ -3250,7 +3356,10 @@ mod tests { let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); let batch = RecordBatch::try_new( Arc::clone(&schema), - vec![Arc::new(Int32Array::from(values))], + vec![ + Arc::new(Int32Array::from(values.clone())), + Arc::new(Int32Array::from(values)), + ], )?; sorter.insert_batch(batch).await?; } @@ -3274,9 +3383,16 @@ mod tests { /// merged into a single sorted stream before spilling to one file. #[tokio::test] async fn test_spill_merges_runs_with_headroom() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); - let expr: LexOrdering = - [PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0)))].into(); + // Two sort columns required so use_radix_sort returns true. + let schema = Arc::new(Schema::new(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + ])); + let expr: LexOrdering = [ + PhysicalSortExpr::new_default(Arc::new(Column::new("x", 0))), + PhysicalSortExpr::new_default(Arc::new(Column::new("y", 1))), + ] + .into(); // Pool sized to trigger spilling after a few coalesced runs but // leave enough room for the merge-before-spill path to work. @@ -3303,7 +3419,10 @@ mod tests { let values: Vec = ((i * 8192)..((i + 1) * 8192)).rev().collect(); let batch = RecordBatch::try_new( Arc::clone(&schema), - vec![Arc::new(Int32Array::from(values))], + vec![ + Arc::new(Int32Array::from(values.clone())), + Arc::new(Int32Array::from(values)), + ], )?; sorter.insert_batch(batch).await?; } From a1f01934d327e049ef127b973300352fc7e7cbce Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 10:00:17 -0400 Subject: [PATCH 17/25] Address some PR feedback. --- datafusion/physical-plan/src/sorts/sort.rs | 38 ++++++++++++++++------ 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 59af439687512..35135101d194d 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -53,7 +53,7 @@ use crate::{ }; use arrow::array::{Array, RecordBatch, RecordBatchOptions, StringViewArray}; -use arrow::compute::{BatchCoalescer, lexsort_to_indices, take_arrays}; +use arrow::compute::{BatchCoalescer, SortColumn, lexsort_to_indices, take_arrays}; use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::config::SpillCompression; use datafusion_common::tree_node::TreeNodeRecursion; @@ -346,13 +346,17 @@ impl ExternalSorter { /// Uses radix sort when the batch is large enough to amortize encoding /// overhead (more than `batch_size` rows). Otherwise falls back to lexsort. fn sort_and_store_run(&mut self, batch: &RecordBatch) -> Result<()> { - let use_radix_for_this_batch = - self.use_radix && batch.num_rows() > self.batch_size; + let use_chunked_radix = self.use_radix && batch.num_rows() > self.batch_size; - let sorted_chunks = if use_radix_for_this_batch { + let sorted_chunks = if use_chunked_radix { sort_batch_chunked(batch, &self.expr, self.batch_size, true)? } else { - vec![sort_batch(batch, &self.expr, None)?] + let sort_columns = self + .expr + .iter() + .map(|e| e.evaluate_to_sort_column(batch)) + .collect::>>()?; + vec![sort_batch_with(batch, &sort_columns, None, false)?] }; // After take(), StringView arrays may reference shared buffers from @@ -826,15 +830,29 @@ pub fn sort_batch( .map(|expr| expr.evaluate_to_sort_column(batch)) .collect::>>()?; - if fetch.is_none() + let use_radix = fetch.is_none() && use_radix_sort( &sort_columns .iter() .map(|c| c.values.data_type()) .collect::>(), - ) - { - let indices = super::radix::radix_sort_to_indices(&sort_columns)?; + ); + + sort_batch_with(batch, &sort_columns, fetch, use_radix) +} + +/// Sort a batch with an explicit radix vs lexsort decision. +/// +/// Used by [`ExternalSorter`] which pre-computes the radix eligibility +/// once at construction time to avoid repeated schema checks. +fn sort_batch_with( + batch: &RecordBatch, + sort_columns: &[SortColumn], + fetch: Option, + use_radix: bool, +) -> Result { + if use_radix { + let indices = super::radix::radix_sort_to_indices(sort_columns)?; let columns = take_arrays(batch.columns(), &indices, None)?; let options = RecordBatchOptions::new().with_row_count(Some(indices.len())); return Ok(RecordBatch::try_new_with_options( @@ -844,7 +862,7 @@ pub fn sort_batch( )?); } - let indices = lexsort_to_indices(&sort_columns, fetch)?; + let indices = lexsort_to_indices(sort_columns, fetch)?; let columns = take_arrays(batch.columns(), &indices, None)?; let options = RecordBatchOptions::new().with_row_count(Some(indices.len())); From bbda50f7bd112456cd87e99a705f010fd0608c08 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 10:03:52 -0400 Subject: [PATCH 18/25] Address some PR feedback. --- datafusion/physical-plan/src/sorts/sort.rs | 30 +++++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 35135101d194d..c82cb3b58c348 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -370,13 +370,29 @@ impl ExternalSorter { self.sorted_runs.push(sorted_chunks); self.sorted_runs_memory += run_size; - // Align reservation to actual sorted run memory with a single pool - // interaction. Normally 2x reservation > 1x sorted output, so we - // shrink. For tiny batches (single-digit rows), per-column buffer - // overhead can make the sorted output slightly larger — grow - // unconditionally since the memory is already allocated. This uses - // grow() which bypasses the pool limit, so the pool's tracked total - // may briefly exceed its limit by a small amount (tens of KB). + // Align the pool reservation to match actual sorted run memory. + // + // Before sorting we reserve 2x the input batch size (space for + // both the unsorted input and the sorted output). After sorting + // we drop the input, so normally sorted_runs_memory < reservation + // and we shrink to free the excess back to the pool. + // + // The grow path handles a rare edge case: for very small batches + // (single-digit rows), Arrow's per-column buffer minimums (64 + // bytes each) can make the sorted output slightly larger than + // the reservation. We use grow() rather than try_grow() because: + // + // 1. The memory is already allocated — the sorted run exists + // in self.sorted_runs. This is accounting catch-up, not a + // new allocation request. + // 2. Under-reporting is worse than over-reporting. If we + // swallowed a try_grow() failure, the pool would think + // there is free headroom that doesn't actually exist, + // which could cause other operators to over-allocate and + // trigger a real OOM. + // 3. The overshoot is small and bounded: it is at most the + // per-column buffer overhead for a handful of rows, which + // is tens of KB even with wide schemas. let reservation_size = self.reservation.size(); if reservation_size > self.sorted_runs_memory { self.reservation From 2698b8507b26ecfb7f2b329f7981a00dbcccf9d2 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 10:16:14 -0400 Subject: [PATCH 19/25] Update failing test. --- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index 0d8a066d432dd..84fb5ce34e62b 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -61,7 +61,7 @@ async fn test_sort_10k_mem() { #[cfg_attr(tarpaulin, ignore)] async fn test_sort_100k_mem() { for (batch_size, should_spill) in - [(5, false), (10000, false), (20000, true), (1000000, true)] + [(5, false), (10000, false), (20000, false), (1000000, true)] { let (input, collected) = SortTest::new() .with_int32_batches(batch_size) From d5d3ef7935749b73b8443c63fb8f004838020098 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 10:38:11 -0400 Subject: [PATCH 20/25] Update failing test for realsies. --- datafusion/core/tests/fuzz_cases/sort_fuzz.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs index 84fb5ce34e62b..d27c1be940304 100644 --- a/datafusion/core/tests/fuzz_cases/sort_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_fuzz.rs @@ -61,7 +61,7 @@ async fn test_sort_10k_mem() { #[cfg_attr(tarpaulin, ignore)] async fn test_sort_100k_mem() { for (batch_size, should_spill) in - [(5, false), (10000, false), (20000, false), (1000000, true)] + [(5, false), (10000, false), (50000, true), (1000000, true)] { let (input, collected) = SortTest::new() .with_int32_batches(batch_size) From e0fb0a8feee0473e85b8824ffb052cde3d33ce70 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 11:44:48 -0400 Subject: [PATCH 21/25] ## `datafusion/physical-plan/src/sorts/sort.rs` - Always coalesce to `sort_coalesce_target_rows` regardless of schema (removed conditional that fell back to `batch_size` for non-radix) - Both radix and lexsort paths now go through `sort_batch_chunked` (both chunk output to `batch_size`) - Per-batch radix decision uses `sort_coalesce_target_rows` as threshold instead of `batch_size` - Added `radix_sorted_batches` and `lexsort_sorted_batches` counters to `ExternalSorterMetrics` - Added `sort_coalesce_target_rows` and `sort_use_radix` config fields to `ExternalSorter` - New `sort_use_radix` parameter gates the `use_radix_sort()` schema check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## `datafusion/common/src/config.rs` - New config: `sort_use_radix: bool, default = true` - Updated `sort_coalesce_target_rows` doc ## `datafusion/execution/src/config.rs` - New builder method: `with_sort_use_radix()` ## `datafusion/core/tests/fuzz_cases/sort_fuzz.rs` - `(20000, false)` → `(50000, true)` to fix flaky test ## `datafusion/sqllogictest/test_files/information_schema.slt` + `docs/source/user-guide/configs.md` - Added `sort_use_radix` entry, updated `sort_coalesce_target_rows` description --- datafusion/common/src/config.rs | 18 +++-- datafusion/execution/src/config.rs | 10 +++ datafusion/physical-plan/src/sorts/sort.rs | 80 ++++++++++--------- .../test_files/information_schema.slt | 4 +- docs/source/user-guide/configs.md | 3 +- 5 files changed, 70 insertions(+), 45 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 21d0a52cbb16d..53a556a6260ad 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -563,15 +563,19 @@ config_namespace! { /// Target number of rows to coalesce before sorting in ExternalSorter. /// - /// Larger values give radix sort (used for primitives and strings) - /// enough rows to amortize RowConverter encoding overhead. Under - /// memory pressure the actual chunk size may be smaller. - /// - /// For schemas that are not eligible for radix sort (all-dictionary - /// or nested types), the coalesce target falls back to `batch_size` - /// regardless of this setting. + /// Larger values reduce merge fan-in and give radix sort enough rows + /// to amortize RowConverter encoding overhead. pub sort_coalesce_target_rows: usize, default = 32768 + /// When true, use radix sort for multi-column sorts over eligible + /// types (primitives, strings). When false, always use lexsort. + /// + /// Radix sort is 2-3x faster than lexsort at 32K+ rows for + /// multi-column sorts but has higher overhead for small batches. + /// Set to false to isolate the performance impact of the + /// coalesce-then-sort pipeline from radix sort itself. + pub sort_use_radix: bool, default = true + /// Maximum buffer capacity (in bytes) per partition for BufferExec /// inserted during sort pushdown optimization. /// diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 5e3f187b19fcb..688a6d3c58e12 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -489,6 +489,16 @@ impl SessionConfig { self } + /// Enables or disables radix sort for multi-column sorts over eligible types. + /// + /// When false, always uses lexsort regardless of schema eligibility. + /// + /// [`sort_use_radix`]: datafusion_common::config::ExecutionOptions::sort_use_radix + pub fn with_sort_use_radix(mut self, sort_use_radix: bool) -> Self { + self.options_mut().execution.sort_use_radix = sort_use_radix; + self + } + /// Enables or disables the enforcement of batch size in joins pub fn with_enforce_batch_size_in_joins( mut self, diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index c82cb3b58c348..88de5644e7a02 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -35,7 +35,8 @@ use crate::filter_pushdown::{ }; use crate::limit::LimitStream; use crate::metrics::{ - BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, SpillMetrics, + BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, + SpillMetrics, }; use crate::projection::{ProjectionExec, make_with_child, update_ordering}; use crate::sorts::IncrementalSortIterator; @@ -75,6 +76,12 @@ struct ExternalSorterMetrics { baseline: BaselineMetrics, spill_metrics: SpillMetrics, + + /// Number of batches sorted with radix sort + radix_sorted_batches: Count, + + /// Number of batches sorted with lexsort + lexsort_sorted_batches: Count, } impl ExternalSorterMetrics { @@ -82,6 +89,10 @@ impl ExternalSorterMetrics { Self { baseline: BaselineMetrics::new(metrics, partition), spill_metrics: SpillMetrics::new(metrics, partition), + radix_sorted_batches: MetricBuilder::new(metrics) + .counter("radix_sorted_batches", partition), + lexsort_sorted_batches: MetricBuilder::new(metrics) + .counter("lexsort_sorted_batches", partition), } } } @@ -93,13 +104,11 @@ impl ExternalSorterMetrics { /// /// # Algorithm /// -/// Incoming batches are coalesced via [`BatchCoalescer`] to a target -/// row count before sorting. For radix-eligible schemas (primitives, -/// strings) the target is `sort_coalesce_target_rows` (default 32768); -/// for non-radix schemas (all-dictionary, nested types) it falls back -/// to `batch_size`. This gives sort kernels enough rows to amortize -/// overhead — radix sort is 2-3x faster than lexsort at 32K+ rows -/// but slower at small batch sizes. +/// Incoming batches are coalesced via [`BatchCoalescer`] to +/// `sort_coalesce_target_rows` (default 32768) before sorting. This gives +/// sort kernels enough rows to amortize overhead and reduces merge fan-in. +/// Radix sort is used for multi-column radix-eligible schemas; lexsort +/// is used otherwise (single-column, all-dictionary, nested types). /// /// Each coalesced batch is sorted immediately (radix or lexsort) and /// stored as a pre-sorted run. Under memory pressure the coalescer @@ -180,6 +189,9 @@ struct ExternalSorter { expr: LexOrdering, /// The target number of rows for output batches batch_size: usize, + /// The target number of rows for coalesced batches before sorting. + /// Radix sort is only used when a batch reaches this size. + sort_coalesce_target_rows: usize, /// Whether to use radix sort (decided once from expression types). use_radix: bool, @@ -238,6 +250,7 @@ impl ExternalSorter { batch_size: usize, sort_spill_reservation_bytes: usize, sort_coalesce_target_rows: usize, + sort_use_radix: bool, // Configured via `datafusion.execution.spill_compression`. spill_compression: SpillCompression, metrics: &ExecutionPlanMetricsSet, @@ -263,22 +276,11 @@ impl ExternalSorter { .iter() .map(|e| e.expr.data_type(&schema)) .collect::>()?; - let use_radix = use_radix_sort(&sort_data_types.iter().collect::>()); - - let coalesce_target_rows = if use_radix { - sort_coalesce_target_rows - } else { - batch_size - }; + let use_radix = + sort_use_radix && use_radix_sort(&sort_data_types.iter().collect::>()); - // For non-radix schemas (coalesce_target == batch_size), enable - // the large-batch bypass so incoming batches that are already - // batch_size rows pass through without copying. - let mut coalescer = - BatchCoalescer::new(Arc::clone(&schema), coalesce_target_rows); - if !use_radix { - coalescer = coalescer.with_biggest_coalesce_batch_size(Some(batch_size / 2)); - } + let coalescer = + BatchCoalescer::new(Arc::clone(&schema), sort_coalesce_target_rows); Ok(Self { schema, @@ -293,6 +295,7 @@ impl ExternalSorter { merge_reservation, runtime, batch_size, + sort_coalesce_target_rows, sort_spill_reservation_bytes, use_radix, }) @@ -343,21 +346,21 @@ impl ExternalSorter { /// Sorts a single coalesced batch and stores the result as a new run. /// - /// Uses radix sort when the batch is large enough to amortize encoding - /// overhead (more than `batch_size` rows). Otherwise falls back to lexsort. + /// Uses radix sort for multi-column radix-eligible schemas when the + /// batch is large enough to amortize RowConverter encoding overhead + /// (more than `batch_size` rows). Falls back to lexsort for small + /// batches (e.g. early flush under memory pressure). + /// Both paths chunk output back to `batch_size`. fn sort_and_store_run(&mut self, batch: &RecordBatch) -> Result<()> { - let use_chunked_radix = self.use_radix && batch.num_rows() > self.batch_size; - - let sorted_chunks = if use_chunked_radix { - sort_batch_chunked(batch, &self.expr, self.batch_size, true)? + let use_radix = + self.use_radix && batch.num_rows() >= self.sort_coalesce_target_rows; + if use_radix { + self.metrics.radix_sorted_batches.add(1); } else { - let sort_columns = self - .expr - .iter() - .map(|e| e.evaluate_to_sort_column(batch)) - .collect::>>()?; - vec![sort_batch_with(batch, &sort_columns, None, false)?] - }; + self.metrics.lexsort_sorted_batches.add(1); + } + let sorted_chunks = + sort_batch_chunked(batch, &self.expr, self.batch_size, use_radix)?; // After take(), StringView arrays may reference shared buffers from // multiple coalesced input batches, inflating reported memory size. @@ -1355,6 +1358,7 @@ impl ExecutionPlan for SortExec { context.session_config().batch_size(), execution_options.sort_spill_reservation_bytes, execution_options.sort_coalesce_target_rows, + execution_options.sort_use_radix, context.session_config().spill_compression(), &self.metrics_set, context.runtime_env(), @@ -2866,6 +2870,7 @@ mod tests { 128, // batch_size sort_spill_reservation_bytes, 32768, // sort_coalesce_target_rows + true, // sort_use_radix SpillCompression::Uncompressed, &metrics_set, Arc::clone(&runtime), @@ -3241,6 +3246,7 @@ mod tests { batch_size, 10 * 1024 * 1024, sort_coalesce_target_rows, + true, // sort_use_radix SpillCompression::Uncompressed, &metrics_set, runtime, @@ -3381,6 +3387,7 @@ mod tests { 8192, 0, // no merge headroom -> per-run spill path 8192, // coalesce to batch_size -> 1 run per batch + true, // sort_use_radix SpillCompression::Uncompressed, &metrics_set, runtime, @@ -3444,6 +3451,7 @@ mod tests { 8192, 200 * 1024, // merge headroom: enough for merge cursor infrastructure 32768, + true, // sort_use_radix SpillCompression::Uncompressed, &metrics_set, runtime, diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f32c6d63b39e8..ec12b58273421 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -273,6 +273,7 @@ datafusion.execution.sort_coalesce_target_rows 32768 datafusion.execution.sort_in_place_threshold_bytes 1048576 datafusion.execution.sort_pushdown_buffer_capacity 1073741824 datafusion.execution.sort_spill_reservation_bytes 10485760 +datafusion.execution.sort_use_radix true datafusion.execution.spill_compression uncompressed datafusion.execution.split_file_groups_by_statistics false datafusion.execution.target_partitions 7 @@ -415,10 +416,11 @@ datafusion.execution.skip_partial_aggregation_probe_ratio_threshold 0.8 Aggregat datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode datafusion.execution.skip_physical_aggregate_schema_check false When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max -datafusion.execution.sort_coalesce_target_rows 32768 Target number of rows to coalesce before sorting in ExternalSorter. Larger values give radix sort (used for primitives and strings) enough rows to amortize RowConverter encoding overhead. Under memory pressure the actual chunk size may be smaller. For schemas that are not eligible for radix sort (all-dictionary or nested types), the coalesce target falls back to `batch_size` regardless of this setting. +datafusion.execution.sort_coalesce_target_rows 32768 Target number of rows to coalesce before sorting in ExternalSorter. Larger values reduce merge fan-in and give radix sort enough rows to amortize RowConverter encoding overhead. datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. Deprecated: this option is no longer used. The sort pipeline now always coalesces batches before sorting. Use `sort_coalesce_target_rows` instead. datafusion.execution.sort_pushdown_buffer_capacity 1073741824 Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). +datafusion.execution.sort_use_radix true When true, use radix sort for multi-column sorts over eligible types (primitives, strings). When false, always use lexsort. Radix sort is 2-3x faster than lexsort at 32K+ rows for multi-column sorts but has higher overhead for small batches. Set to false to isolate the performance impact of the coalesce-then-sort pipeline from radix sort itself. datafusion.execution.spill_compression uncompressed Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. datafusion.execution.split_file_groups_by_statistics false Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental datafusion.execution.target_partitions 7 Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 2024b64c486fb..611a279b70fd5 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -118,7 +118,8 @@ The following configuration settings are available: | datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | | datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | | datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. Deprecated: this option is no longer used. The sort pipeline now always coalesces batches before sorting. Use `sort_coalesce_target_rows` instead. | -| datafusion.execution.sort_coalesce_target_rows | 32768 | Target number of rows to coalesce before sorting in ExternalSorter. Larger values give radix sort (used for primitives and strings) enough rows to amortize RowConverter encoding overhead. Under memory pressure the actual chunk size may be smaller. For schemas that are not eligible for radix sort (all-dictionary or nested types), the coalesce target falls back to `batch_size` regardless of this setting. | +| datafusion.execution.sort_coalesce_target_rows | 32768 | Target number of rows to coalesce before sorting in ExternalSorter. Larger values reduce merge fan-in and give radix sort enough rows to amortize RowConverter encoding overhead. | +| datafusion.execution.sort_use_radix | true | When true, use radix sort for multi-column sorts over eligible types (primitives, strings). When false, always use lexsort. Radix sort is 2-3x faster than lexsort at 32K+ rows for multi-column sorts but has higher overhead for small batches. Set to false to isolate the performance impact of the coalesce-then-sort pipeline from radix sort itself. | | datafusion.execution.sort_pushdown_buffer_capacity | 1073741824 | Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. | | datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | | datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | From 73bb06bf08ea382a55b71495eefa1bdfae16f77f Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 11:55:26 -0400 Subject: [PATCH 22/25] Cleanup. --- datafusion/physical-plan/src/sorts/sort.rs | 26 +++++++++------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 88de5644e7a02..26e1238f5091d 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -347,9 +347,8 @@ impl ExternalSorter { /// Sorts a single coalesced batch and stores the result as a new run. /// /// Uses radix sort for multi-column radix-eligible schemas when the - /// batch is large enough to amortize RowConverter encoding overhead - /// (more than `batch_size` rows). Falls back to lexsort for small - /// batches (e.g. early flush under memory pressure). + /// batch reached `sort_coalesce_target_rows`. Falls back to lexsort + /// for smaller batches (e.g. the final partial flush). /// Both paths chunk output back to `batch_size`. fn sort_and_store_run(&mut self, batch: &RecordBatch) -> Result<()> { let use_radix = @@ -1813,10 +1812,8 @@ mod tests { ); // Build 200 partitions, each with one 100-row batch containing a - // Utf8 column (~42 bytes per value) and an Int32 column. The second - // column ensures the radix sort path is used (use_radix_sort - // requires > 1 sort column), which keeps the coalesce target at - // sort_coalesce_target_rows instead of falling back to batch_size. + // Utf8 column (~42 bytes per value) and an Int32 column. Two sort + // columns so use_radix_sort returns true and the radix path is exercised. let schema = Arc::new(Schema::new(vec![ Field::new("i", DataType::Utf8, true), Field::new("j", DataType::Int32, true), @@ -1976,7 +1973,7 @@ mod tests { SessionConfig::new().with_batch_size(batch_size), // Ensure we don't concat batches )); - // Two columns so the radix path is used and sorted runs are chunked. + // Two columns so use_radix_sort returns true. let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), @@ -2574,8 +2571,7 @@ mod tests { batch_size_to_generate: usize, create_task_ctx: impl Fn(&[RecordBatch]) -> TaskContext, ) -> Result { - // Two-column batches so use_radix_sort returns true and sorted runs - // are chunked to batch_size, which these tests depend on. + // Two-column batches so use_radix_sort returns true. let schema = Arc::new(Schema::new(vec![ Field::new("i", DataType::Int32, true), Field::new("j", DataType::Int32, true), @@ -2853,7 +2849,7 @@ mod tests { .build_arc()?; let metrics_set = ExecutionPlanMetricsSet::new(); - // Two columns so radix sort is used, matching the coalesce_target_rows config. + // Two columns so use_radix_sort returns true. let schema = Arc::new(Schema::new(vec![ Field::new("x", DataType::Int32, false), Field::new("y", DataType::Int32, false), @@ -3279,7 +3275,7 @@ mod tests { /// and chunked back to `batch_size` after sorting. #[tokio::test] async fn test_chunked_sort_radix_coalescing() -> Result<()> { - // Two sort columns required so use_radix_sort returns true. + // Two sort columns so use_radix_sort returns true. let schema = Arc::new(Schema::new(vec![ Field::new("x", DataType::Int32, false), Field::new("y", DataType::Int32, false), @@ -3323,7 +3319,7 @@ mod tests { /// the partial coalescer contents are flushed and sorted. #[tokio::test] async fn test_chunked_sort_partial_flush() -> Result<()> { - // Two sort columns required so use_radix_sort returns true. + // Two sort columns so use_radix_sort returns true. let schema = Arc::new(Schema::new(vec![ Field::new("x", DataType::Int32, false), Field::new("y", DataType::Int32, false), @@ -3364,7 +3360,7 @@ mod tests { /// Spilling writes one spill file per sorted run (no merge before spill). #[tokio::test] async fn test_spill_creates_one_file_per_run() -> Result<()> { - // Two sort columns required so use_radix_sort returns true. + // Two sort columns so use_radix_sort returns true. let schema = Arc::new(Schema::new(vec![ Field::new("x", DataType::Int32, false), Field::new("y", DataType::Int32, false), @@ -3424,7 +3420,7 @@ mod tests { /// merged into a single sorted stream before spilling to one file. #[tokio::test] async fn test_spill_merges_runs_with_headroom() -> Result<()> { - // Two sort columns required so use_radix_sort returns true. + // Two sort columns so use_radix_sort returns true. let schema = Arc::new(Schema::new(vec![ Field::new("x", DataType::Int32, false), Field::new("y", DataType::Int32, false), From 7a75c386d7faf57f46f503884e728fb6aa34ec5b Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 12:35:15 -0400 Subject: [PATCH 23/25] Fix copy.slt test to show new metrics. --- datafusion/sqllogictest/test_files/copy.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 402ac8e8512bf..cf1aeb3a622f8 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -207,7 +207,7 @@ EXPLAIN ANALYZE COPY (SELECT col1, upper(col2) AS col2_upper FROM source_table O ---- Plan with Metrics 01)DataSinkExec: sink=ParquetSink(file_groups=[]), metrics=[elapsed_compute=, bytes_written=, rows_written=2] -02)--SortExec: expr=[col1@0 ASC NULLS LAST], preserve_partitioning=[false], metrics=[output_rows=2, elapsed_compute=, output_bytes=, output_batches=, spill_count=0, spilled_bytes=0.0 B, spilled_rows=0] +02)--SortExec: expr=[col1@0 ASC NULLS LAST], preserve_partitioning=[false], metrics=[output_rows=2, elapsed_compute=, output_bytes=, output_batches=, spill_count=0, spilled_bytes=0.0 B, spilled_rows=0, lexsort_sorted_batches=1, radix_sorted_batches=0] 03)----ProjectionExec: expr=[col1@0 as col1, upper(col2@1) as col2_upper], metrics=[output_rows=2, elapsed_compute=, output_bytes=, output_batches=1, expr_0_eval_time=, expr_1_eval_time=] 04)------DataSourceExec: partitions=1, partition_sizes=[1], metrics=[] From 482e72c0c26e43eaf517de5fd638825ed1ba5ad1 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 12:42:18 -0400 Subject: [PATCH 24/25] Temporarily default radix to false to get benchmarks. --- datafusion/common/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 53a556a6260ad..2113877de2d94 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -574,7 +574,7 @@ config_namespace! { /// multi-column sorts but has higher overhead for small batches. /// Set to false to isolate the performance impact of the /// coalesce-then-sort pipeline from radix sort itself. - pub sort_use_radix: bool, default = true + pub sort_use_radix: bool, default = false /// Maximum buffer capacity (in bytes) per partition for BufferExec /// inserted during sort pushdown optimization. From 96dd0dfb0de32dea607c381f4e043af0e11087d8 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 14 Apr 2026 12:43:56 -0400 Subject: [PATCH 25/25] Revert "Temporarily default radix to false to get benchmarks." This reverts commit 482e72c0c26e43eaf517de5fd638825ed1ba5ad1. --- datafusion/common/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 2113877de2d94..53a556a6260ad 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -574,7 +574,7 @@ config_namespace! { /// multi-column sorts but has higher overhead for small batches. /// Set to false to isolate the performance impact of the /// coalesce-then-sort pipeline from radix sort itself. - pub sort_use_radix: bool, default = false + pub sort_use_radix: bool, default = true /// Maximum buffer capacity (in bytes) per partition for BufferExec /// inserted during sort pushdown optimization.