Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ main() {
sort_pushdown|sort_pushdown_sorted)
data_sort_pushdown
;;
sort_pushdown_inexact)
data_sort_pushdown_inexact
;;
sort_tpch)
# same data as for tpch
data_tpch "1" "parquet"
Expand Down Expand Up @@ -522,6 +525,9 @@ main() {
sort_pushdown_sorted)
run_sort_pushdown_sorted
;;
sort_pushdown_inexact)
run_sort_pushdown_inexact
;;
sort_tpch)
run_sort_tpch "1"
;;
Expand Down Expand Up @@ -1137,6 +1143,77 @@ run_sort_pushdown_sorted() {
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}

# Generates data for the sort pushdown Inexact benchmark.
#
# Unlike sort_pushdown (Exact path), this benchmark targets the Inexact path
# where row group reorder by statistics is beneficial. It produces a single
# large lineitem parquet file where row groups have NON-OVERLAPPING but
# OUT-OF-ORDER l_orderkey ranges (each RG internally sorted, RGs shuffled).
#
# This simulates append-heavy workloads where data is written in batches at
# different times, producing segments with tight value ranges but in arbitrary
# row-group order.
#
# Reads (set by the surrounding script): DATA_DIR, SCRIPT_DIR.
# Side effects: generates sort_pushdown source data if missing, then writes
# ${DATA_DIR}/sort_pushdown_inexact/lineitem/shuffled.parquet.
data_sort_pushdown_inexact() {
    INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact/lineitem"
    # Idempotency check: skip generation if any parquet file already exists.
    # Test the glob via `ls` exit status on a quoted directory instead of
    # word-splitting unquoted output — robust against spaces in DATA_DIR.
    if [ -d "${INEXACT_DIR}" ] && ls "${INEXACT_DIR}"/*.parquet >/dev/null 2>&1; then
        echo "Sort pushdown Inexact data already exists at ${INEXACT_DIR}"
        return
    fi

    echo "Generating sort pushdown Inexact benchmark data (single file, shuffled RGs)..."

    # Re-use the sort_pushdown data as the source (generate if missing)
    data_sort_pushdown

    mkdir -p "${INEXACT_DIR}"
    SRC_DIR="${DATA_DIR}/sort_pushdown/lineitem"

    # Use datafusion-cli to:
    # 1. Read the source files as one table
    # 2. Split the l_orderkey RANGE into 64 contiguous chunks, scramble the
    #    chunk order, but sort within each chunk. This produces ~64 segments
    #    where each segment has a tight orderkey range but the segments appear
    #    in scrambled (non-sorted) order in the file.
    # 3. Write a single parquet file with a small max_row_group_size so we get
    #    many RGs per file.
    #
    # Chunk index: (l_orderkey * 64) / (MAX(l_orderkey) + 1) maps each key to a
    # range chunk in [0, 63]. NOTE: bucketing the raw key with `% 64` would be
    # wrong here — that groups by residue class, so every bucket spans the full
    # key range (stride-64 interleave) and RG min/max ranges overlap completely.
    #
    # Chunk scrambler: (chunk * 1664525 + 1013904223) % 64 is a linear
    # congruential step; 1664525 is odd (coprime to 64), so this is a
    # deterministic bijection on [0, 63] — effectively random but reproducible.
    (cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
CREATE EXTERNAL TABLE src
STORED AS PARQUET
LOCATION '${SRC_DIR}';

COPY (
  SELECT * FROM src
  ORDER BY
    (((l_orderkey * 64) / ((SELECT MAX(l_orderkey) FROM src) + 1)) * 1664525 + 1013904223) % 64,
    l_orderkey
)
TO '${INEXACT_DIR}/shuffled.parquet'
STORED AS PARQUET
OPTIONS ('format.max_row_group_size' '100000');
")

    echo "Sort pushdown Inexact data generated at ${INEXACT_DIR}"
    ls -la "${INEXACT_DIR}"
}

# Runs the sort pushdown Inexact benchmark (tests RG reorder by statistics).
#
# Expects data generated by data_sort_pushdown_inexact. Reads script-global
# variables: DATA_DIR, RESULTS_DIR, SCRIPT_DIR, CARGO_COMMAND, QUERY_ARG,
# LATENCY_ARG. Writes results to ${RESULTS_DIR}/sort_pushdown_inexact.json.
run_sort_pushdown_inexact() {
INEXACT_DIR="${DATA_DIR}/sort_pushdown_inexact"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact.json"
echo "Running sort pushdown Inexact benchmark (row group reorder by statistics)..."
# NOTE(review): `--sorted` declares the data as ordered on the sort key even
# though the row groups are deliberately shuffled — presumably intentional so
# the scan takes the Inexact ordering path instead of a full sort; confirm
# against the dfbench sort-pushdown flag semantics.
# QUERY_ARG and LATENCY_ARG are intentionally unquoted: they may be empty or
# expand to multiple words.
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}

# Runs the sort integration benchmark
run_sort_tpch() {
SCALE_FACTOR=$1
Expand Down
8 changes: 8 additions & 0 deletions benchmarks/queries/sort_pushdown_inexact/q1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Inexact path: small TopK (LIMIT 100) in DESC order on a file declared ASC.
-- Reordering row groups by statistics lets the scan visit the RG with the
-- largest max value first, so the TopK threshold tightens right away and the
-- dynamic filter pushed into the reader prunes the remaining RGs efficiently.
SELECT l.l_orderkey, l.l_partkey, l.l_suppkey
FROM lineitem AS l
ORDER BY l.l_orderkey DESC
LIMIT 100
7 changes: 7 additions & 0 deletions benchmarks/queries/sort_pushdown_inexact/q2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Inexact path: same DESC TopK as q1 but with a larger fetch (LIMIT 1000).
-- A bigger LIMIT means more heap replacements in TopK; visiting the
-- highest-max row group first tightens the threshold sooner and reduces the
-- total number of replacements.
SELECT l.l_orderkey, l.l_partkey, l.l_suppkey
FROM lineitem AS l
ORDER BY l.l_orderkey DESC
LIMIT 1000
8 changes: 8 additions & 0 deletions benchmarks/queries/sort_pushdown_inexact/q3.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Inexact path: select every column with a DESC TopK (LIMIT 100).
-- Exercises the row-level filter benefit: once the first row group yields a
-- tight threshold, later row groups can skip decoding the non-sort columns
-- for filtered-out rows — the wider the table, the bigger the win.
SELECT l.*
FROM lineitem AS l
ORDER BY l.l_orderkey DESC
LIMIT 100
7 changes: 7 additions & 0 deletions benchmarks/queries/sort_pushdown_inexact/q4.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Inexact path: all columns plus the larger fetch (LIMIT 1000).
-- Stacks the wide-row late-materialization benefit of q3 on top of the
-- larger-LIMIT effect of q2 to show the cumulative gain from reordering
-- row groups by statistics.
SELECT l.*
FROM lineitem AS l
ORDER BY l.l_orderkey DESC
LIMIT 1000
9 changes: 8 additions & 1 deletion benchmarks/src/sort_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,14 @@ impl RunOpt {
async fn benchmark_query(&self, query_id: usize) -> Result<Vec<QueryResult>> {
let sql = self.load_query(query_id)?;

let config = self.common.config()?;
let mut config = self.common.config()?;
// Enable parquet filter pushdown + late materialization. This is
// essential for the Inexact sort pushdown path: TopK's dynamic
// filter is pushed down to the parquet reader, so only the sort
// column is decoded to evaluate the filter, and decoding of the
// non-sort columns is skipped for rows that fail it — this is
// where RG reorder's tight-threshold-first strategy pays off for
// wide-row queries.
config.options_mut().execution.parquet.pushdown_filters = true;
let rt = self.common.build_runtime()?;
let state = SessionStateBuilder::new()
.with_config(config)
Expand Down
Loading
Loading