From 5c4121526a73be75fe3d02bb710bf1029ed1ce5a Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Thu, 28 May 2026 20:21:47 -0400
Subject: [PATCH 01/13] Passes CometFuzzTestSuite, CometNativeShuffleSuite,
 CometExecSuite.

---
 .../shuffle/CometNativeShuffleWriter.scala    | 341 +++++++++---------
 .../shuffle/CometShuffleDependency.scala      |  13 +-
 .../shuffle/CometShuffleExchangeExec.scala    | 170 ++++++++-
 .../shuffle/CometShuffleManager.scala         |   3 +
 .../apache/spark/sql/comet/operators.scala    | 320 ++++++++--------
 5 files changed, 528 insertions(+), 319 deletions(-)

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
index f27d021ac4..68078e8507 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
@@ -32,19 +32,34 @@ import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriteMetricsR
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal}
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning, SinglePartition}
-import org.apache.spark.sql.comet.{CometExec, CometMetricNode}
+import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, NativeExecContext, PlanDataInjector}
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.vectorized.ColumnarBatch
 
-import org.apache.comet.CometConf
+import org.apache.comet.{CometConf, CometExecIterator}
 import org.apache.comet.serde.{OperatorOuterClass, PartitioningOuterClass, QueryPlanSerde}
 import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, Operator}
-import org.apache.comet.serde.QueryPlanSerde.serializeDataType
 
 /**
- * A [[ShuffleWriter]] that will delegate shuffle write to native shuffle.
+ * A [[ShuffleWriter]] that drives the native shuffle write in a single [[CometExecIterator]] per
+ * partition. The unified plan it executes has [[OperatorOuterClass.ShuffleWriter]] at the root
+ * with `childNativeOp` as its only child. Leaf input iterators come from
+ * [[CometNativeShuffleInputIterator]] (constructed by [[CometNativeShuffleInputRDD.compute]]).
+ *
+ * Two flavors of `childNativeOp` are in use:
+ *   - rich Comet native subtree (e.g. HashAgg / Filter / ShuffleScan), supplied by
+ *     [[CometShuffleExchangeExec]] when its child is a
+ *     [[org.apache.spark.sql.comet.CometNativeExec]].
+ *   - synthetic `Scan("ShuffleWriterInput")` placeholder, supplied by the convenience overload of
+ *     [[CometShuffleExchangeExec.prepareShuffleDependency]] for callers that already hold an
+ *     `RDD[ColumnarBatch]` of native-driven batches (e.g.
+ *     [[org.apache.spark.sql.comet.CometCollectLimitExec]]).
+ *
+ * The writer treats both shapes identically.
  */
 class CometNativeShuffleWriter[K, V](
+    childNativeOp: Operator,
+    childMetricNode: CometMetricNode,
+    nativeContext: NativeExecContext,
     outputPartitioning: Partitioning,
     outputAttributes: Seq[Attribute],
     metrics: Map[String, SQLMetric],
@@ -72,8 +87,22 @@ class CometNativeShuffleWriter[K, V](
     val tempDataFilePath = Paths.get(tempDataFilename)
     val tempIndexFilePath = Paths.get(tempIndexFilename)
 
-    // Call native shuffle write
-    val nativePlan = getNativePlan(tempDataFilename, tempIndexFilename)
+    // Pull the per-partition leaf iterators and partition index from the iterator handed to us
+    // by Spark's ShuffleMapTask. CometNativeShuffleInputRDD.compute always returns this exact
+    // iterator type; no other RDD layers between produce a Product2[Int, ColumnarBatch].
+    val shuffleInputIter = inputs.asInstanceOf[CometNativeShuffleInputIterator]
+    val partitionIdx = shuffleInputIter.partitionIndex
+    val leafIterators = shuffleInputIter.leafIterators
+    val shuffleBlockIters = shuffleInputIter.shuffleBlockIterators
+
+    val unifiedPlan = buildUnifiedPlan(tempDataFilename, tempIndexFilename)
+    val finalNativePlan = if (nativeContext.commonByKey.nonEmpty) {
+      val partitionDataByKey =
+        nativeContext.perPartitionByKey.map { case (k, arr) => k -> arr(partitionIdx) }
+      PlanDataInjector.injectPlanData(unifiedPlan, nativeContext.commonByKey, partitionDataByKey)
+    } else {
+      unifiedPlan
+    }
 
     val detailedMetrics = Seq(
       "elapsed_compute",
@@ -82,29 +111,42 @@ class CometNativeShuffleWriter[K, V](
       "input_batches",
       "spill_count",
       "spilled_bytes")
-
-    // Maps native metrics to SQL metrics
     val metricsOutputRows = new SQLMetric("outputRows")
     val metricsWriteTime = new SQLMetric("writeTime")
-    val nativeSQLMetrics = Map(
+    val shuffleWriterSQLMetrics = Map(
       "output_rows" -> metricsOutputRows,
       "data_size" -> metrics("dataSize"),
       "write_time" -> metricsWriteTime) ++
       metrics.filterKeys(detailedMetrics.contains)
-    val nativeMetrics = CometMetricNode(nativeSQLMetrics)
 
-    // Getting rid of the fake partitionId
-    val newInputs = inputs.asInstanceOf[Iterator[_ <: Product2[Any, Any]]].map(_._2)
+    // ShuffleWriter metrics live at the root of the metric tree; the child operator's metric
+    // tree (rich subtree or empty leaf for the Scan placeholder) is attached underneath so the
+    // SQL UI sees the same per-node breakdown the split-driver flow produced.
+    val nativeMetrics = CometMetricNode(shuffleWriterSQLMetrics, Seq(childMetricNode))
 
-    val cometIter = CometExec.getCometIterator(
-      Seq(newInputs.asInstanceOf[Iterator[ColumnarBatch]]),
+    val cometIter = new CometExecIterator(
+      CometExec.newIterId,
+      leafIterators,
       outputAttributes.length,
-      nativePlan,
+      PlanDataInjector.serializeOperator(finalNativePlan),
       nativeMetrics,
       numParts,
-      context.partitionId(),
-      broadcastedHadoopConfForEncryption = None,
-      encryptedFilePaths = Seq.empty)
+      partitionIdx,
+      nativeContext.broadcastedHadoopConfForEncryption,
+      nativeContext.encryptedFilePaths,
+      shuffleBlockIters)
+
+    // Register subqueries against the iterator id so native callbacks resolve them to values.
+    nativeContext.subqueries.foreach { sub =>
+      CometScalarSubquery.setSubquery(cometIter.id, sub)
+    }
+    Option(context).foreach { taskCtx =>
+      taskCtx.addTaskCompletionListener[Unit] { _ =>
+        nativeContext.subqueries.foreach { sub =>
+          CometScalarSubquery.removeSubquery(cometIter.id, sub)
+        }
+      }
+    }
 
     while (cometIter.hasNext) {
       cometIter.next()
@@ -134,7 +176,7 @@ class CometNativeShuffleWriter[K, V](
 
     // Report spill metrics to Spark's task metrics so they appear in
     // Spark UI task summaries (not just SQL metrics)
-    val spilledBytes = nativeSQLMetrics.get("spilled_bytes").map(_.value).getOrElse(0L)
+    val spilledBytes = shuffleWriterSQLMetrics.get("spilled_bytes").map(_.value).getOrElse(0L)
     if (spilledBytes > 0) {
       context.taskMetrics().incMemoryBytesSpilled(spilledBytes)
       context.taskMetrics().incDiskBytesSpilled(spilledBytes)
@@ -162,163 +204,138 @@ class CometNativeShuffleWriter[K, V](
     case _ => false
   }
 
-  private def getNativePlan(dataFile: String, indexFile: String): Operator = {
-    val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput")
-    val opBuilder = OperatorOuterClass.Operator.newBuilder()
-
-    val scanTypes = outputAttributes.flatten { attr =>
-      serializeDataType(attr.dataType)
+  /**
+   * Build the unified `ShuffleWriter(child = childNativeOp)` plan with the partitioning serde,
+   * compression settings, and output file paths.
+   */
+  private def buildUnifiedPlan(dataFile: String, indexFile: String): Operator = {
+    val shuffleWriterBuilder = OperatorOuterClass.ShuffleWriter.newBuilder()
+    shuffleWriterBuilder.setOutputDataFile(dataFile)
+    shuffleWriterBuilder.setOutputIndexFile(indexFile)
+
+    if (SparkEnv.get.conf.getBoolean("spark.shuffle.compress", true)) {
+      val codec = CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_CODEC.get() match {
+        case "zstd" => CompressionCodec.Zstd
+        case "lz4" => CompressionCodec.Lz4
+        case "snappy" => CompressionCodec.Snappy
+        case other => throw new UnsupportedOperationException(s"invalid codec: $other")
+      }
+      shuffleWriterBuilder.setCodec(codec)
+    } else {
+      shuffleWriterBuilder.setCodec(CompressionCodec.None)
     }
-
-    if (scanTypes.length == outputAttributes.length) {
-      scanBuilder.addAllFields(scanTypes.asJava)
-
-      val shuffleWriterBuilder = OperatorOuterClass.ShuffleWriter.newBuilder()
-      shuffleWriterBuilder.setOutputDataFile(dataFile)
-      shuffleWriterBuilder.setOutputIndexFile(indexFile)
-
-      if (SparkEnv.get.conf.getBoolean("spark.shuffle.compress", true)) {
-        val codec = CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_CODEC.get() match {
-          case "zstd" => CompressionCodec.Zstd
-          case "lz4" => CompressionCodec.Lz4
-          case "snappy" => CompressionCodec.Snappy
-          case other => throw new UnsupportedOperationException(s"invalid codec: $other")
+    shuffleWriterBuilder.setCompressionLevel(
+      CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get)
+    shuffleWriterBuilder.setWriteBufferSize(
+      CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().min(Int.MaxValue).toInt)
+
+    outputPartitioning match {
+      case p if isSinglePartitioning(p) =>
+        val partitioning = PartitioningOuterClass.SinglePartition.newBuilder()
+        val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
+        shuffleWriterBuilder.setPartitioning(
+          partitioningBuilder.setSinglePartition(partitioning).build())
+      case _: HashPartitioning =>
+        val hashPartitioning = outputPartitioning.asInstanceOf[HashPartitioning]
+        val partitioning = PartitioningOuterClass.HashPartition.newBuilder()
+        partitioning.setNumPartitions(outputPartitioning.numPartitions)
+
+        val partitionExprs = hashPartitioning.expressions
+          .flatMap(e => QueryPlanSerde.exprToProto(e, outputAttributes))
+
+        if (partitionExprs.length != hashPartitioning.expressions.length) {
+          throw new UnsupportedOperationException(
+            s"Partitioning $hashPartitioning is not supported.")
         }
-        shuffleWriterBuilder.setCodec(codec)
-      } else {
-        shuffleWriterBuilder.setCodec(CompressionCodec.None)
-      }
-      shuffleWriterBuilder.setCompressionLevel(
-        CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get)
-      shuffleWriterBuilder.setWriteBufferSize(
-        CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().min(Int.MaxValue).toInt)
-
-      outputPartitioning match {
-        case p if isSinglePartitioning(p) =>
-          val partitioning = PartitioningOuterClass.SinglePartition.newBuilder()
 
-          val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
-          shuffleWriterBuilder.setPartitioning(
-            partitioningBuilder.setSinglePartition(partitioning).build())
-        case _: HashPartitioning =>
-          val hashPartitioning = outputPartitioning.asInstanceOf[HashPartitioning]
-
-          val partitioning = PartitioningOuterClass.HashPartition.newBuilder()
-          partitioning.setNumPartitions(outputPartitioning.numPartitions)
+        partitioning.addAllHashExpression(partitionExprs.asJava)
+
+        val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
+        shuffleWriterBuilder.setPartitioning(
+          partitioningBuilder.setHashPartition(partitioning).build())
+      case _: RangePartitioning =>
+        val rangePartitioning = outputPartitioning.asInstanceOf[RangePartitioning]
+        val partitioning = PartitioningOuterClass.RangePartition.newBuilder()
+        partitioning.setNumPartitions(outputPartitioning.numPartitions)
+
+        // Detect duplicates by tracking expressions directly, similar to DataFusion's LexOrdering
+        // DataFusion will deduplicate identical sort expressions in LexOrdering,
+        // so we need to transform boundary rows to match the deduplicated structure
+        val seenExprs = mutable.HashSet[Expression]()
+        val deduplicationMap = mutable.ArrayBuffer[(Int, Boolean)]()
+
+        rangePartitioning.ordering.zipWithIndex.foreach { case (sortOrder, idx) =>
+          if (seenExprs.contains(sortOrder.child)) {
+            deduplicationMap += (idx -> false)
+          } else {
+            seenExprs += sortOrder.child
+            deduplicationMap += (idx -> true)
+          }
+        }
 
-          val partitionExprs = hashPartitioning.expressions
+        {
+          val orderingExprs = rangePartitioning.ordering
             .flatMap(e => QueryPlanSerde.exprToProto(e, outputAttributes))
-
-          if (partitionExprs.length != hashPartitioning.expressions.length) {
+          if (orderingExprs.length != rangePartitioning.ordering.length) {
             throw new UnsupportedOperationException(
-              s"Partitioning $hashPartitioning is not supported.")
-          }
-
-          partitioning.addAllHashExpression(partitionExprs.asJava)
-
-          val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
-          shuffleWriterBuilder.setPartitioning(
-            partitioningBuilder.setHashPartition(partitioning).build())
-        case _: RangePartitioning =>
-          val rangePartitioning = outputPartitioning.asInstanceOf[RangePartitioning]
-
-          val partitioning = PartitioningOuterClass.RangePartition.newBuilder()
-          partitioning.setNumPartitions(outputPartitioning.numPartitions)
-
-          // Detect duplicates by tracking expressions directly, similar to DataFusion's LexOrdering
-          // DataFusion will deduplicate identical sort expressions in LexOrdering,
-          // so we need to transform boundary rows to match the deduplicated structure
-          val seenExprs = mutable.HashSet[Expression]()
-          val deduplicationMap = mutable.ArrayBuffer[(Int, Boolean)]() // (originalIndex, isKept)
-
-          rangePartitioning.ordering.zipWithIndex.foreach { case (sortOrder, idx) =>
-            if (seenExprs.contains(sortOrder.child)) {
-              deduplicationMap += (idx -> false) // Will be deduplicated by DataFusion
-            } else {
-              seenExprs += sortOrder.child
-              deduplicationMap += (idx -> true) // Will be kept by DataFusion
-            }
-          }
-
-          {
-            // Serialize the ordering expressions for comparisons
-            val orderingExprs = rangePartitioning.ordering
-              .flatMap(e => QueryPlanSerde.exprToProto(e, outputAttributes))
-            if (orderingExprs.length != rangePartitioning.ordering.length) {
-              throw new UnsupportedOperationException(
-                s"Partitioning $rangePartitioning is not supported.")
-            }
-            partitioning.addAllSortOrders(orderingExprs.asJava)
+              s"Partitioning $rangePartitioning is not supported.")
           }
+          partitioning.addAllSortOrders(orderingExprs.asJava)
+        }
 
-          // Convert Spark's sequence of InternalRows that represent partitioning boundaries to
-          // sequences of Literals, where each outer entry represents a boundary row, and each
-          // internal entry is a value in that row. In other words, these are stored in row major
-          // order, not column major
-          val boundarySchema = rangePartitioning.ordering.flatMap(e => Some(e.dataType))
-
-          // Transform boundary rows to match DataFusion's deduplicated structure
-          val transformedBoundaryExprs: Seq[Seq[Literal]] =
-            rangePartitionBounds.get.map((row: InternalRow) => {
-              // For every InternalRow, map its values to Literals
-              val allLiterals =
-                row.toSeq(boundarySchema).zip(boundarySchema).map { case (value, valueType) =>
-                  Literal(value, valueType)
-                }
-
-              // Keep only the literals that correspond to non-deduplicated expressions
-              allLiterals
-                .zip(deduplicationMap)
-                .filter(_._2._2) // Keep only where isKept = true
-                .map(_._1) // Extract the literal
+        val boundarySchema = rangePartitioning.ordering.flatMap(e => Some(e.dataType))
+
+        val transformedBoundaryExprs: Seq[Seq[Literal]] =
+          rangePartitionBounds.get.map((row: InternalRow) => {
+            val allLiterals =
+              row.toSeq(boundarySchema).zip(boundarySchema).map { case (value, valueType) =>
+                Literal(value, valueType)
+              }
+            allLiterals
+              .zip(deduplicationMap)
+              .filter(_._2._2)
+              .map(_._1)
+          })
+
+        {
+          val boundaryRows: Seq[PartitioningOuterClass.BoundaryRow] = transformedBoundaryExprs
+            .map((rowLiterals: Seq[Literal]) => {
+              val rowBuilder = PartitioningOuterClass.BoundaryRow.newBuilder();
+              val serializedExprs =
+                rowLiterals.map(lit_value =>
+                  QueryPlanSerde.exprToProto(lit_value, outputAttributes).get)
+              rowBuilder.addAllPartitionBounds(serializedExprs.asJava)
+              rowBuilder.build()
             })
+          partitioning.addAllBoundaryRows(boundaryRows.asJava)
+        }
 
-          {
-            // Convert the sequences of Literals to a collection of serialized BoundaryRows
-            val boundaryRows: Seq[PartitioningOuterClass.BoundaryRow] = transformedBoundaryExprs
-              .map((rowLiterals: Seq[Literal]) => {
-                // Serialize each sequence of Literals as a BoundaryRow
-                val rowBuilder = PartitioningOuterClass.BoundaryRow.newBuilder();
-                val serializedExprs =
-                  rowLiterals.map(lit_value =>
-                    QueryPlanSerde.exprToProto(lit_value, outputAttributes).get)
-                rowBuilder.addAllPartitionBounds(serializedExprs.asJava)
-                rowBuilder.build()
-              })
-            partitioning.addAllBoundaryRows(boundaryRows.asJava)
-          }
-
-          val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
-          shuffleWriterBuilder.setPartitioning(
-            partitioningBuilder.setRangePartition(partitioning).build())
+        val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
+        shuffleWriterBuilder.setPartitioning(
+          partitioningBuilder.setRangePartition(partitioning).build())
 
-        case _: RoundRobinPartitioning =>
-          val partitioning = PartitioningOuterClass.RoundRobinPartition.newBuilder()
-          partitioning.setNumPartitions(outputPartitioning.numPartitions)
-          partitioning.setMaxHashColumns(
-            CometConf.COMET_EXEC_SHUFFLE_WITH_ROUND_ROBIN_PARTITIONING_MAX_HASH_COLUMNS.get())
+      case _: RoundRobinPartitioning =>
+        val partitioning = PartitioningOuterClass.RoundRobinPartition.newBuilder()
+        partitioning.setNumPartitions(outputPartitioning.numPartitions)
+        partitioning.setMaxHashColumns(
+          CometConf.COMET_EXEC_SHUFFLE_WITH_ROUND_ROBIN_PARTITIONING_MAX_HASH_COLUMNS.get())
 
-          val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
-          shuffleWriterBuilder.setPartitioning(
-            partitioningBuilder.setRoundRobinPartition(partitioning).build())
+        val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder()
+        shuffleWriterBuilder.setPartitioning(
+          partitioningBuilder.setRoundRobinPartition(partitioning).build())
 
-        case _ =>
-          throw new UnsupportedOperationException(
-            s"Partitioning $outputPartitioning is not supported.")
-      }
+      case _ =>
+        throw new UnsupportedOperationException(
+          s"Partitioning $outputPartitioning is not supported.")
+    }
 
-      shuffleWriterBuilder.setTracingEnabled(CometConf.COMET_TRACING_ENABLED.get())
+    shuffleWriterBuilder.setTracingEnabled(CometConf.COMET_TRACING_ENABLED.get())
 
-      val shuffleWriterOpBuilder = OperatorOuterClass.Operator.newBuilder()
-      shuffleWriterOpBuilder
-        .setShuffleWriter(shuffleWriterBuilder)
-        .addChildren(opBuilder.setScan(scanBuilder).build())
-        .build()
-    } else {
-      // There are unsupported scan type
-      throw new UnsupportedOperationException(
-        s"$outputAttributes contains unsupported data types for CometShuffleExchangeExec.")
-    }
+    OperatorOuterClass.Operator
+      .newBuilder()
+      .setShuffleWriter(shuffleWriterBuilder)
+      .addChildren(childNativeOp)
+      .build()
   }
 
   override def stop(success: Boolean): Option[MapStatus] = {
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala
index 2b74e5a168..9ec25c49ec 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala
@@ -28,11 +28,19 @@ import org.apache.spark.shuffle.ShuffleWriteProcessor
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
+import org.apache.spark.sql.comet.{CometMetricNode, NativeExecContext}
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.types.StructType
 
+import org.apache.comet.serde.OperatorOuterClass
+
 /**
  * A [[ShuffleDependency]] that allows us to identify the shuffle dependency as a Comet shuffle.
+ *
+ * On the native-shuffle path, also carries the child plan's per-partition execution context, root
+ * native operator, and metric node so [[CometNativeShuffleWriter]] can drive the unified
+ * `ShuffleWriter(child = childNativeOp)` plan in a single [[org.apache.comet.CometExecIterator]]
+ * per partition. These three fields are populated only when `shuffleType == CometNativeShuffle`.
  */
 class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     @transient private val _rdd: RDD[_ <: Product2[K, V]],
@@ -49,7 +57,10 @@ class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     val outputAttributes: Seq[Attribute] = Seq.empty,
     val shuffleWriteMetrics: Map[String, SQLMetric] = Map.empty,
     val numParts: Int = 0,
-    val rangePartitionBounds: Option[Seq[InternalRow]] = None)
+    val rangePartitionBounds: Option[Seq[InternalRow]] = None,
+    val nativeExecContext: Option[NativeExecContext] = None,
+    val childNativeOp: Option[OperatorOuterClass.Operator] = None,
+    val childMetricNode: Option[CometMetricNode] = None)
     extends ShuffleDependency[K, V, C](
       _rdd,
       partitioner,
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
index 493c20f8b7..16bd40d402 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
@@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, Exp
 import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering
 import org.apache.spark.sql.catalyst.plans.logical.Statistics
 import org.apache.spark.sql.catalyst.plans.physical._
-import org.apache.spark.sql.comet.{CometMetricNode, CometNativeExec, CometPlan, CometSinkPlaceHolder}
+import org.apache.spark.sql.comet.{CometMetricNode, CometNativeExec, CometPlan, CometSinkPlaceHolder, NativeExecContext}
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec
 import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleExchangeExec, ShuffleExchangeLike, ShuffleOrigin}
@@ -101,9 +101,38 @@ case class CometShuffleExchangeExec(
   private lazy val serializer: Serializer =
     new UnsafeRowSerializer(child.output.size, longMetric("dataSize"))
 
+  /**
+   * Per-partition execution context for the child native subtree, computed once and shared
+   * between [[inputRDD]] (which uses it to wire DAGScheduler dependencies) and
+   * [[shuffleDependency]] (which threads it through to [[CometNativeShuffleWriter]] for
+   * single-iterator native execution). Only populated when `shuffleType == CometNativeShuffle`
+   * AND the child is a [[CometNativeExec]] subtree we can inline. When the child is a non-native
+   * Comet plan (e.g. [[org.apache.spark.sql.comet.CometSparkToColumnarExec]]), this stays `None`
+   * and the shuffle falls back to the legacy `Scan("ShuffleWriterInput") -> ShuffleWriter` plan
+   * via the convenience overload of `prepareShuffleDependency`.
+   */
+  @transient private lazy val nativeChildContext: Option[NativeExecContext] = child match {
+    case nativeChild: CometNativeExec if shuffleType == CometNativeShuffle =>
+      Some(nativeChild.buildNativeContext())
+    case _ => None
+  }
+
   @transient lazy val inputRDD: RDD[_] = if (shuffleType == CometNativeShuffle) {
-    // CometNativeShuffle assumes that the input plan is Comet plan.
-    child.executeColumnar()
+    nativeChildContext match {
+      case Some(ctx) =>
+        // Single-driver path: thin scheduling anchor; CometNativeShuffleWriter drives the
+        // unified ShuffleWriter + child plan in a single CometExecIterator per partition.
+        new CometNativeShuffleInputRDD(
+          sparkContext,
+          ctx.inputs,
+          ctx.numPartitions,
+          ctx.shuffleScanIndices)
+      case None =>
+        // Child is a Comet plan but not a CometNativeExec subtree (e.g. CometSparkToColumnarExec).
+        // No native subtree to inline; the writer's plan is `Scan("ShuffleWriterInput") ->
+        // ShuffleWriter` and JVM batches flow into native through Arrow C Stream Interface.
+        child.executeColumnar()
+    }
   } else if (shuffleType == CometColumnarShuffle) {
     // CometColumnarShuffle uses Spark's row-based execute() API. For Spark row-based plans,
     // rows flow directly. For Comet native plans, their doExecute() wraps with ColumnarToRowExec
@@ -149,12 +178,36 @@ case class CometShuffleExchangeExec(
   @transient
   lazy val shuffleDependency: ShuffleDependency[Int, _, _] =
     if (shuffleType == CometNativeShuffle) {
-      val dep = CometShuffleExchangeExec.prepareShuffleDependency(
-        inputRDD.asInstanceOf[RDD[ColumnarBatch]],
-        child.output,
-        outputPartitioning,
-        serializer,
-        metrics)
+      val dep = nativeChildContext match {
+        case Some(ctx) =>
+          // Single-driver path: child is a CometNativeExec subtree. RangePartitioning needs real
+          // rows to compute partition bounds; use a regular columnar execution of the child for
+          // sampling only. The actual shuffle still goes through the single-iterator path.
+          val nativeChild = child.asInstanceOf[CometNativeExec]
+          val samplingRDD: Option[RDD[ColumnarBatch]] = outputPartitioning match {
+            case _: RangePartitioning => Some(child.executeColumnar())
+            case _ => None
+          }
+          CometShuffleExchangeExec.prepareNativeShuffleDependency(
+            inputRDD.asInstanceOf[CometNativeShuffleInputRDD],
+            samplingRDD,
+            child.output,
+            outputPartitioning,
+            serializer,
+            metrics,
+            ctx,
+            nativeChild.nativeOp,
+            CometMetricNode.fromCometPlan(nativeChild))
+        case None =>
+          // Child is a non-native Comet plan; the writer falls back to its Scan-placeholder
+          // path via the convenience overload of prepareShuffleDependency.
+          CometShuffleExchangeExec.prepareShuffleDependency(
+            inputRDD.asInstanceOf[RDD[ColumnarBatch]],
+            child.output,
+            outputPartitioning,
+            serializer,
+            metrics)
+      }
       metrics("numPartitions").set(dep.partitioner.numPartitions)
       val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
       SQLMetrics.postDriverMetricUpdates(
@@ -623,21 +676,107 @@ object CometShuffleExchangeExec
     }
   }
 
+  /**
+   * Build a Comet native shuffle dependency around an existing `RDD[ColumnarBatch]` of real
+   * batches. Used by [[org.apache.spark.sql.comet.CometCollectLimitExec]] and
+   * [[org.apache.spark.sql.comet.CometTakeOrderedAndProjectExec]] where the input is the result
+   * of a local-limit / topK transform and there is no separate child native subtree to inline.
+   *
+   * Implemented as a thin wrapper around [[prepareNativeShuffleDependency]]: synthesizes a
+   * `Scan("ShuffleWriterInput")` as the child native op (so the writer's plan is still
+   * `ShuffleWriter -> Scan`, consuming JVM batches via Arrow C Stream), wraps `rdd` as the single
+   * leaf input of a thin scheduling RDD, and supplies a minimal [[NativeExecContext]]. Same wire
+   * shape as before; one writer code path for both this case and the [[CometShuffleExchangeExec]]
+   * case.
+   */
   def prepareShuffleDependency(
       rdd: RDD[ColumnarBatch],
       outputAttributes: Seq[Attribute],
       outputPartitioning: Partitioning,
       serializer: Serializer,
       metrics: Map[String, SQLMetric]): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = {
-    val numParts = rdd.getNumPartitions
+
+    val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput")
+    val scanTypes = outputAttributes.flatten { attr =>
+      QueryPlanSerde.serializeDataType(attr.dataType)
+    }
+    if (scanTypes.length != outputAttributes.length) {
+      throw new UnsupportedOperationException(
+        s"$outputAttributes contains unsupported data types for CometShuffleExchangeExec.")
+    }
+    scanBuilder.addAllFields(scanTypes.asJava)
+    val scanOp = OperatorOuterClass.Operator.newBuilder().setScan(scanBuilder).build()
+
+    val thinRDD = new CometNativeShuffleInputRDD(
+      rdd.sparkContext,
+      Seq(rdd),
+      rdd.getNumPartitions,
+      shuffleScanIndices = Set.empty)
+
+    val ctx = NativeExecContext(
+      inputs = Seq(rdd),
+      numPartitions = rdd.getNumPartitions,
+      subqueries = Seq.empty,
+      broadcastedHadoopConfForEncryption = None,
+      encryptedFilePaths = Seq.empty,
+      commonByKey = Map.empty,
+      perPartitionByKey = Map.empty,
+      shuffleScanIndices = Set.empty,
+      hasScanInput = false)
+
+    // The Scan placeholder has no per-operator metrics, so the metric tree for the unified plan
+    // is `shuffleWriterMetrics` at the root with one empty leaf for the Scan child.
+    prepareNativeShuffleDependency(
+      thinRDD,
+      Some(rdd),
+      outputAttributes,
+      outputPartitioning,
+      serializer,
+      metrics,
+      ctx,
+      scanOp,
+      CometMetricNode(Map.empty))
+  }
+
+  /**
+   * Build a Comet native shuffle dependency for the [[CometShuffleExchangeExec]] case where the
+   * shuffle is fed by a [[CometNativeExec]] child. The writer drives the unified
+   * `ShuffleWriter(child = childNativeOp)` plan in a single
+   * [[org.apache.comet.CometExecIterator]] per partition. The returned dep carries the child's
+   * per-partition execution context, root native operator, and metric node so
+   * [[CometNativeShuffleWriter]] can reach them at task time.
+   *
+   * @param thinRDD
+   *   scheduling-anchor RDD whose `compute` returns a [[CometNativeShuffleInputIterator]];
+   *   produces no batches itself.
+   * @param samplingRDD
+   *   regular columnar execution of the child, only required for [[RangePartitioning]] (sampling
+   *   needs real rows). `None` for hash / single / round-robin.
+   */
+  def prepareNativeShuffleDependency(
+      thinRDD: CometNativeShuffleInputRDD,
+      samplingRDD: Option[RDD[ColumnarBatch]],
+      outputAttributes: Seq[Attribute],
+      outputPartitioning: Partitioning,
+      serializer: Serializer,
+      metrics: Map[String, SQLMetric],
+      nativeExecContext: NativeExecContext,
+      childNativeOp: OperatorOuterClass.Operator,
+      childMetricNode: CometMetricNode): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = {
+    val numParts = thinRDD.getNumPartitions
 
     // The code block below is mostly brought over from
     // ShuffleExchangeExec::prepareShuffleDependency
     val (partitioner, rangePartitionBounds) = outputPartitioning match {
       case rangePartitioning: RangePartitioning =>
+        // Sampling needs real rows; use the dedicated samplingRDD (a regular columnar execution
+        // of the child). The thin RDD itself yields nothing.
+        val samplingInput = samplingRDD.getOrElse(
+          throw new IllegalStateException(
+            "RangePartitioning requires a samplingRDD on the native-shuffle path"))
         // Extract only fields used for sorting to avoid collecting large fields that does not
         // affect sorting result when deciding partition bounds in RangePartitioner
-        val rddForSampling = rdd.mapPartitionsInternal { iter =>
+        val rddForSampling = samplingInput.mapPartitionsInternal { iter =>
           val projection =
             UnsafeProjection.create(rangePartitioning.ordering.map(_.child), outputAttributes)
           val mutablePair = new MutablePair[InternalRow, Null]()
@@ -683,9 +822,7 @@ object CometShuffleExchangeExec
     }
 
     val dependency = new CometShuffleDependency[Int, ColumnarBatch, ColumnarBatch](
-      rdd.map(
-        (0, _)
-      ), // adding fake partitionId that is always 0 because ShuffleDependency requires it
+      thinRDD,
       serializer = serializer,
       shuffleWriterProcessor = ShuffleExchangeExec.createShuffleWriteProcessor(metrics),
       shuffleType = CometNativeShuffle,
@@ -695,7 +832,10 @@ object CometShuffleExchangeExec
       outputAttributes = outputAttributes,
       shuffleWriteMetrics = metrics,
       numParts = numParts,
-      rangePartitionBounds = rangePartitionBounds)
+      rangePartitionBounds = rangePartitionBounds,
+      nativeExecContext = Some(nativeExecContext),
+      childNativeOp = Some(childNativeOp),
+      childMetricNode = Some(childMetricNode))
     dependency
   }
 
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala
index c8f2199d53..16a21bbfd9 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala
@@ -231,6 +231,9 @@ class CometShuffleManager(conf: SparkConf) extends ShuffleManager with Logging {
       case cometShuffleHandle: CometNativeShuffleHandle[K @unchecked, V @unchecked] =>
         val dep = cometShuffleHandle.dependency.asInstanceOf[CometShuffleDependency[_, _, _]]
         new CometNativeShuffleWriter(
+          dep.childNativeOp.get,
+          dep.childMetricNode.get,
+          dep.nativeExecContext.get,
           dep.outputPartitioning.get,
           dep.outputAttributes,
           dep.shuffleWriteMetrics,
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index 7d5398ae62..748362cd65 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -381,6 +381,29 @@ object CometExec {
   }
 }
 
+/**
+ * Per-partition execution context for a native subtree rooted at a [[CometNativeExec]] boundary.
+ *
+ * Built once on the driver from the SparkPlan tree and consumed by either:
+ *   - [[CometNativeExec.doExecuteColumnar]] to construct a [[CometExecRDD]], or
+ *   - the native-shuffle path, where [[CometNativeShuffleWriter]] drives the same child plan with
+ *     a `ShuffleWriter` operator as its root.
+ *
+ * The fields capture everything that depends on tree-walking the SparkPlan and aligning leaf
+ * input RDDs (broadcast partition counts, plan-data, subqueries, encryption options) so callers
+ * do not have to re-derive them.
+ */
+private[comet] case class NativeExecContext(
+    inputs: Seq[RDD[ColumnarBatch]],
+    numPartitions: Int,
+    subqueries: Seq[ScalarSubquery],
+    broadcastedHadoopConfForEncryption: Option[Broadcast[SerializableConfiguration]],
+    encryptedFilePaths: Seq[String],
+    commonByKey: Map[String, Array[Byte]],
+    perPartitionByKey: Map[String, Array[Array[Byte]]],
+    shuffleScanIndices: Set[Int],
+    hasScanInput: Boolean)
+
 /**
  * A Comet native physical operator.
  */
@@ -426,157 +449,30 @@ abstract class CometNativeExec extends CometExec {
         throw new CometRuntimeException(
           s"CometNativeExec should not be executed directly without a serialized plan: $this")
       case Some(serializedPlan) =>
-        val serializedPlanCopy = serializedPlan
         // TODO: support native metrics for all operators.
         val nativeMetrics = CometMetricNode.fromCometPlan(this)
+        val ctx = buildNativeContext()
 
-        // Go over all the native scans, in order to see if they need encryption options.
-        // For each relation in a CometNativeScan generate a hadoopConf,
-        // for each file path in a relation associate with hadoopConf
-        // This is done per native plan, so only count scans until a comet input is reached.
-        val encryptionOptions =
-          mutable.ArrayBuffer.empty[(Broadcast[SerializableConfiguration], Seq[String])]
-        foreachUntilCometInput(this) {
-          case scan: CometNativeScanExec =>
-            // This creates a hadoopConf that brings in any SQLConf "spark.hadoop.*" configs and
-            // per-relation configs since different tables might have different decryption
-            // properties.
-            val hadoopConf = scan.relation.sparkSession.sessionState
-              .newHadoopConfWithOptions(scan.relation.options)
-            val encryptionEnabled = CometParquetUtils.encryptionEnabled(hadoopConf)
-            if (encryptionEnabled) {
-              // hadoopConf isn't serializable, so we have to do a broadcasted config.
-              val broadcastedConf =
-                scan.relation.sparkSession.sparkContext
-                  .broadcast(new SerializableConfiguration(hadoopConf))
-
-              val optsTuple: (Broadcast[SerializableConfiguration], Seq[String]) =
-                (broadcastedConf, scan.relation.inputFiles.toSeq)
-              encryptionOptions += optsTuple
-            }
-          case _ => // no-op
-        }
-        assert(
-          encryptionOptions.size <= 1,
-          "We expect one native scan that requires encryption reading in a Comet plan," +
-            " since we will broadcast one hadoopConf.")
-        // If this assumption changes in the future, you can look at the commit history of #2447
-        // to see how there used to be a map of relations to broadcasted confs in case multiple
-        // relations in a single plan. The example that came up was UNION. See discussion at:
-        // https://github.com/apache/datafusion-comet/pull/2447#discussion_r2406118264
-        val (broadcastedHadoopConfForEncryption, encryptedFilePaths) =
-          encryptionOptions.headOption match {
-            case Some((conf, paths)) => (Some(conf), paths)
-            case None => (None, Seq.empty)
-          }
-
-        // Find planning data within this stage (stops at shuffle boundaries).
-        val (commonByKey, perPartitionByKey) = findAllPlanData(this)
-
-        // Collect the input ColumnarBatches from the child operators and create a CometExecIterator
-        // to execute the native plan.
-        val sparkPlans = ArrayBuffer.empty[SparkPlan]
-        val inputs = ArrayBuffer.empty[RDD[ColumnarBatch]]
-
-        foreachUntilCometInput(this)(sparkPlans += _)
-
-        // Find the first non broadcast plan
-        val firstNonBroadcastPlan = sparkPlans.zipWithIndex.find {
-          case (_: CometBroadcastExchangeExec, _) => false
-          case (BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _), _) => false
-          case (BroadcastQueryStageExec(_, _: ReusedExchangeExec, _), _) => false
-          case (ReusedExchangeExec(_, _: CometBroadcastExchangeExec), _) => false
-          case _ => true
-        }
-
-        val containsBroadcastInput = sparkPlans.exists {
-          case _: CometBroadcastExchangeExec => true
-          case BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _) => true
-          case BroadcastQueryStageExec(_, _: ReusedExchangeExec, _) => true
-          case ReusedExchangeExec(_, _: CometBroadcastExchangeExec) => true
-          case _ => false
-        }
-
-        // If the first non broadcast plan is not found, it means all the plans are broadcast plans.
-        // This is not expected, so throw an exception.
-        if (containsBroadcastInput && firstNonBroadcastPlan.isEmpty) {
-          throw new CometRuntimeException(s"Cannot find the first non broadcast plan: $this")
-        }
-
-        // If the first non broadcast plan is found, we need to adjust the partition number of
-        // the broadcast plans to make sure they have the same partition number as the first non
-        // broadcast plan.
-        val (firstNonBroadcastPlanRDD, firstNonBroadcastPlanNumPartitions) =
-          firstNonBroadcastPlan.get._1 match {
-            case plan: CometNativeExec =>
-              (null, plan.outputPartitioning.numPartitions)
-            case plan =>
-              val rdd = plan.executeColumnar()
-              (rdd, rdd.getNumPartitions)
-          }
-
-        // Spark doesn't need to zip Broadcast RDDs, so it doesn't schedule Broadcast RDDs with
-        // same partition number. But for Comet, we need to zip them so we need to adjust the
-        // partition number of Broadcast RDDs to make sure they have the same partition number.
-        sparkPlans.zipWithIndex.foreach { case (plan, idx) =>
-          plan match {
-            case c: CometBroadcastExchangeExec =>
-              inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
-            case BroadcastQueryStageExec(_, c: CometBroadcastExchangeExec, _) =>
-              inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
-            case ReusedExchangeExec(_, c: CometBroadcastExchangeExec) =>
-              inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
-            case BroadcastQueryStageExec(
-                  _,
-                  ReusedExchangeExec(_, c: CometBroadcastExchangeExec),
-                  _) =>
-              inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
-            case _: CometNativeExec =>
-            // no-op
-            case _ if idx == firstNonBroadcastPlan.get._2 =>
-              inputs += firstNonBroadcastPlanRDD
-            case _ =>
-              val rdd = plan.executeColumnar()
-              if (rdd.getNumPartitions != firstNonBroadcastPlanNumPartitions) {
-                throw new CometRuntimeException(
-                  s"Partition number mismatch: ${rdd.getNumPartitions} != " +
-                    s"$firstNonBroadcastPlanNumPartitions")
-              } else {
-                inputs += rdd
-              }
-          }
-        }
-
-        if (inputs.isEmpty && !sparkPlans.forall(_.isInstanceOf[CometNativeExec])) {
-          throw new CometRuntimeException(s"No input for CometNativeExec:\n $this")
-        }
-
-        // Detect ShuffleScan indices for direct read in CometExecRDD
-        val shuffleScanIndices = findShuffleScanIndices(serializedPlanCopy)
-
-        // Unified RDD creation - CometExecRDD handles all cases
-        val subqueries = collectSubqueries(this)
-        val hasScanInput = sparkPlans.exists(_.isInstanceOf[CometNativeScanExec])
         new CometExecRDD(
           sparkContext,
-          inputs.toSeq,
-          commonByKey,
-          perPartitionByKey,
-          serializedPlanCopy,
-          firstNonBroadcastPlanNumPartitions,
+          ctx.inputs,
+          ctx.commonByKey,
+          ctx.perPartitionByKey,
+          serializedPlan,
+          ctx.numPartitions,
           output.length,
           nativeMetrics,
-          subqueries,
-          broadcastedHadoopConfForEncryption,
-          encryptedFilePaths,
-          shuffleScanIndices) {
+          ctx.subqueries,
+          ctx.broadcastedHadoopConfForEncryption,
+          ctx.encryptedFilePaths,
+          ctx.shuffleScanIndices) {
           override def compute(
               split: Partition,
               context: TaskContext): Iterator[ColumnarBatch] = {
             val res = super.compute(split, context)
 
             // Report scan input metrics only when the native plan contains a scan.
-            if (hasScanInput) {
+            if (ctx.hasScanInput) {
               Option(context).foreach(nativeMetrics.reportScanInputMetrics)
             }
 
@@ -586,6 +482,149 @@ abstract class CometNativeExec extends CometExec {
     }
   }
 
+  /**
+   * Walk this CometNativeExec subtree once and gather everything needed to launch native
+   * execution: leaf input RDDs (with broadcast partition counts aligned to the first
+   * non-broadcast plan), per-partition planning data, subqueries, encryption options, and
+   * shuffle-scan indices. See [[NativeExecContext]] for the full set of fields.
+   *
+   * Used by [[doExecuteColumnar]] (CometExecRDD path) and by the native-shuffle path
+   * (CometShuffleExchangeExec) so both observe the same input alignment.
+   */
+  private[comet] def buildNativeContext(): NativeExecContext = {
+    // Go over all the native scans, in order to see if they need encryption options.
+    // For each relation in a CometNativeScan generate a hadoopConf,
+    // for each file path in a relation associate with hadoopConf
+    // This is done per native plan, so only count scans until a comet input is reached.
+    val encryptionOptions =
+      mutable.ArrayBuffer.empty[(Broadcast[SerializableConfiguration], Seq[String])]
+    foreachUntilCometInput(this) {
+      case scan: CometNativeScanExec =>
+        // This creates a hadoopConf that brings in any SQLConf "spark.hadoop.*" configs and
+        // per-relation configs since different tables might have different decryption
+        // properties.
+        val hadoopConf = scan.relation.sparkSession.sessionState
+          .newHadoopConfWithOptions(scan.relation.options)
+        val encryptionEnabled = CometParquetUtils.encryptionEnabled(hadoopConf)
+        if (encryptionEnabled) {
+          // hadoopConf isn't serializable, so we have to do a broadcasted config.
+          val broadcastedConf =
+            scan.relation.sparkSession.sparkContext
+              .broadcast(new SerializableConfiguration(hadoopConf))
+
+          val optsTuple: (Broadcast[SerializableConfiguration], Seq[String]) =
+            (broadcastedConf, scan.relation.inputFiles.toSeq)
+          encryptionOptions += optsTuple
+        }
+      case _ => // no-op
+    }
+    assert(
+      encryptionOptions.size <= 1,
+      "We expect one native scan that requires encryption reading in a Comet plan," +
+        " since we will broadcast one hadoopConf.")
+    // If this assumption changes in the future, you can look at the commit history of #2447
+    // to see how there used to be a map of relations to broadcasted confs in case multiple
+    // relations in a single plan. The example that came up was UNION. See discussion at:
+    // https://github.com/apache/datafusion-comet/pull/2447#discussion_r2406118264
+    val (broadcastedHadoopConfForEncryption, encryptedFilePaths) =
+      encryptionOptions.headOption match {
+        case Some((conf, paths)) => (Some(conf), paths)
+        case None => (None, Seq.empty)
+      }
+
+    // Find planning data within this stage (stops at shuffle boundaries).
+    val (commonByKey, perPartitionByKey) = findAllPlanData(this)
+
+    // Collect the input ColumnarBatches from the child operators and create a CometExecIterator
+    // to execute the native plan.
+    val sparkPlans = ArrayBuffer.empty[SparkPlan]
+    val inputs = ArrayBuffer.empty[RDD[ColumnarBatch]]
+
+    foreachUntilCometInput(this)(sparkPlans += _)
+
+    // Find the first non broadcast plan
+    val firstNonBroadcastPlan = sparkPlans.zipWithIndex.find {
+      case (_: CometBroadcastExchangeExec, _) => false
+      case (BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _), _) => false
+      case (BroadcastQueryStageExec(_, _: ReusedExchangeExec, _), _) => false
+      case (ReusedExchangeExec(_, _: CometBroadcastExchangeExec), _) => false
+      case _ => true
+    }
+
+    val containsBroadcastInput = sparkPlans.exists {
+      case _: CometBroadcastExchangeExec => true
+      case BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _) => true
+      case BroadcastQueryStageExec(_, _: ReusedExchangeExec, _) => true
+      case ReusedExchangeExec(_, _: CometBroadcastExchangeExec) => true
+      case _ => false
+    }
+
+    // If the first non broadcast plan is not found, it means all the plans are broadcast plans.
+    // This is not expected, so throw an exception.
+    if (containsBroadcastInput && firstNonBroadcastPlan.isEmpty) {
+      throw new CometRuntimeException(s"Cannot find the first non broadcast plan: $this")
+    }
+
+    // If the first non broadcast plan is found, we need to adjust the partition number of
+    // the broadcast plans to make sure they have the same partition number as the first non
+    // broadcast plan.
+    val (firstNonBroadcastPlanRDD, firstNonBroadcastPlanNumPartitions) =
+      firstNonBroadcastPlan.get._1 match {
+        case plan: CometNativeExec =>
+          (null, plan.outputPartitioning.numPartitions)
+        case plan =>
+          val rdd = plan.executeColumnar()
+          (rdd, rdd.getNumPartitions)
+      }
+
+    // Spark doesn't need to zip Broadcast RDDs, so it doesn't schedule Broadcast RDDs with
+    // same partition number. But for Comet, we need to zip them so we need to adjust the
+    // partition number of Broadcast RDDs to make sure they have the same partition number.
+    sparkPlans.zipWithIndex.foreach { case (plan, idx) =>
+      plan match {
+        case c: CometBroadcastExchangeExec =>
+          inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
+        case BroadcastQueryStageExec(_, c: CometBroadcastExchangeExec, _) =>
+          inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
+        case ReusedExchangeExec(_, c: CometBroadcastExchangeExec) =>
+          inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
+        case BroadcastQueryStageExec(
+              _,
+              ReusedExchangeExec(_, c: CometBroadcastExchangeExec),
+              _) =>
+          inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
+        case _: CometNativeExec =>
+        // no-op
+        case _ if idx == firstNonBroadcastPlan.get._2 =>
+          inputs += firstNonBroadcastPlanRDD
+        case _ =>
+          val rdd = plan.executeColumnar()
+          if (rdd.getNumPartitions != firstNonBroadcastPlanNumPartitions) {
+            throw new CometRuntimeException(
+              s"Partition number mismatch: ${rdd.getNumPartitions} != " +
+                s"$firstNonBroadcastPlanNumPartitions")
+          } else {
+            inputs += rdd
+          }
+      }
+    }
+
+    if (inputs.isEmpty && !sparkPlans.forall(_.isInstanceOf[CometNativeExec])) {
+      throw new CometRuntimeException(s"No input for CometNativeExec:\n $this")
+    }
+
+    NativeExecContext(
+      inputs = inputs.toSeq,
+      numPartitions = firstNonBroadcastPlanNumPartitions,
+      subqueries = collectSubqueries(this),
+      broadcastedHadoopConfForEncryption = broadcastedHadoopConfForEncryption,
+      encryptedFilePaths = encryptedFilePaths,
+      commonByKey = commonByKey,
+      perPartitionByKey = perPartitionByKey,
+      shuffleScanIndices = findShuffleScanIndices(nativeOp),
+      hasScanInput = sparkPlans.exists(_.isInstanceOf[CometNativeScanExec]))
+  }
+
   /**
    * Traverse the tree of Comet physical operators until reaching the input sources operators and
    * apply the given function to each operator.
@@ -623,11 +662,10 @@ abstract class CometNativeExec extends CometExec {
   }
 
   /**
-   * Walk the serialized protobuf plan depth-first to find which input indices correspond to
+   * Walk the protobuf operator tree depth-first to find which input indices correspond to
    * ShuffleScan vs Scan leaf nodes. Each Scan or ShuffleScan leaf consumes one input in order.
    */
-  private def findShuffleScanIndices(planBytes: Array[Byte]): Set[Int] = {
-    val plan = OperatorOuterClass.Operator.parseFrom(planBytes)
+  private def findShuffleScanIndices(plan: OperatorOuterClass.Operator): Set[Int] = {
     var scanIndex = 0
     val indices = mutable.Set.empty[Int]
     def walk(op: OperatorOuterClass.Operator): Unit = {

From 8c99fc5a9b4f5301994787debedd5c9c6a77e50f Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Thu, 28 May 2026 20:29:02 -0400
Subject: [PATCH 02/13] Passes CometFuzzTestSuite, CometNativeShuffleSuite,
 CometExecSuite.

---
 .../shuffle/CometNativeShuffleInputRDD.scala  | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala
new file mode 100644
index 0000000000..29b9745366
--- /dev/null
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.execution.shuffle
+
+import org.apache.spark._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+import org.apache.comet.CometShuffleBlockIterator
+
+/**
+ * Thin RDD that anchors Spark scheduling for the native-shuffle path. Native execution itself is
+ * driven by [[CometNativeShuffleWriter]] using the unified `ShuffleWriter(child = childNativeOp)`
+ * plan.
+ *
+ * The RDD's role here is twofold:
+ *   - declare `OneToOneDependency` on each leaf input RDD so the DAGScheduler walks the lineage
+ *     and triggers prior stages, broadcast materialization, etc.
+ *   - construct the per-partition leaf iterators (and shuffle-block iterators where applicable)
+ *     in `compute`, packaged into a [[CometNativeShuffleInputIterator]] that the writer downcasts
+ *     to extract the inputs it needs to feed the native plan.
+ *
+ * The iterator returned by `compute` always reports `hasNext = false`. Spark's `ShuffleMapTask`
+ * will hand it to `writer.write`; the writer ignores it as an iterator and reads its exposed
+ * fields directly.
+ */
+private[shuffle] class CometNativeShuffleInputRDD(
+    sc: SparkContext,
+    var inputRDDs: Seq[RDD[ColumnarBatch]],
+    numPartitionsParam: Int,
+    shuffleScanIndices: Set[Int])
+    extends RDD[Product2[Int, ColumnarBatch]](
+      sc,
+      inputRDDs.map(rdd => new OneToOneDependency(rdd))) {
+
+  override protected def getPartitions: Array[Partition] =
+    (0 until numPartitionsParam).map { i =>
+      // Resolve leaf-RDD partitions on the driver here (where their @transient fields are still
+      // populated). Stashing them on the partition lets `compute` avoid touching
+      // `leafRdd.partitions` on the executor, which would otherwise trigger getPartitions and
+      // hit the @transient-null trap (e.g. CometExecRDD.perPartitionByKey).
+      val inputParts = inputRDDs.map(_.partitions(i)).toArray
+      new CometNativeShuffleInputPartition(i, inputParts)
+    }.toArray
+
+  override def compute(
+      split: Partition,
+      context: TaskContext): Iterator[Product2[Int, ColumnarBatch]] = {
+    val partition = split.asInstanceOf[CometNativeShuffleInputPartition]
+    val leafIterators = inputRDDs.zip(partition.inputPartitions).map { case (rdd, part) =>
+      rdd.iterator(part, context)
+    }
+    val shuffleBlockIters: Map[Int, CometShuffleBlockIterator] =
+      shuffleScanIndices.flatMap { si =>
+        inputRDDs(si) match {
+          case rdd: CometShuffledBatchRDD =>
+            Some(si -> rdd.computeAsShuffleBlockIterator(partition.inputPartitions(si), context))
+          case _ => None
+        }
+      }.toMap
+    new CometNativeShuffleInputIterator(partition.index, leafIterators, shuffleBlockIters)
+  }
+
+  override def getPreferredLocations(split: Partition): Seq[String] = {
+    if (inputRDDs == null || inputRDDs.isEmpty) return Nil
+    val partition = split.asInstanceOf[CometNativeShuffleInputPartition]
+    val prefs = inputRDDs.zip(partition.inputPartitions).map { case (rdd, part) =>
+      rdd.preferredLocations(part)
+    }
+    val intersection = prefs.reduce((a, b) => a.intersect(b))
+    if (intersection.nonEmpty) intersection else prefs.flatten.distinct
+  }
+
+  override def clearDependencies(): Unit = {
+    super.clearDependencies()
+    inputRDDs = null
+  }
+}
+
+private[shuffle] class CometNativeShuffleInputPartition(
+    override val index: Int,
+    val inputPartitions: Array[Partition])
+    extends Partition
+
+/**
+ * Iterator handed to [[CometNativeShuffleWriter.write]] via Spark's ShuffleMapTask. Reports no
+ * elements; the writer downcasts and reads `partitionIndex`, `leafIterators`, and
+ * `shuffleBlockIterators` directly to drive the unified native plan.
+ */
+private[shuffle] class CometNativeShuffleInputIterator(
+    val partitionIndex: Int,
+    val leafIterators: Seq[Iterator[ColumnarBatch]],
+    val shuffleBlockIterators: Map[Int, CometShuffleBlockIterator])
+    extends Iterator[Product2[Int, ColumnarBatch]] {
+
+  override def hasNext: Boolean = false
+
+  override def next(): Product2[Int, ColumnarBatch] =
+    throw new NoSuchElementException(
+      "CometNativeShuffleInputIterator does not produce elements; CometNativeShuffleWriter " +
+        "drives native execution via the iterator's exposed fields.")
+}

From 443a1c7866ff4dd80664425fecd06f4cbfa211e3 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Thu, 28 May 2026 20:49:43 -0400
Subject: [PATCH 03/13] Cleanup, update docs.

---
 .../contributor-guide/native_shuffle.md       |  38 ++++--
 .../shuffle/CometNativeShuffleInputRDD.scala  |  23 +---
 .../shuffle/CometNativeShuffleWriter.scala    |  72 +++++-----
 .../shuffle/CometShuffleDependency.scala      |  22 ++-
 .../shuffle/CometShuffleExchangeExec.scala    |  62 ++++-----
 .../shuffle/CometShuffleManager.scala         |   4 +-
 .../apache/spark/sql/comet/operators.scala    | 128 ++++++++----------
 7 files changed, 160 insertions(+), 189 deletions(-)

diff --git a/docs/source/contributor-guide/native_shuffle.md b/docs/source/contributor-guide/native_shuffle.md
index 18e80a90c8..c46b8b45a8 100644
--- a/docs/source/contributor-guide/native_shuffle.md
+++ b/docs/source/contributor-guide/native_shuffle.md
@@ -69,8 +69,9 @@ Native shuffle (`CometExchange`) is selected when all of the following condition
                                       ▼
 ┌─────────────────────────────────────────────────────────────────────────────┐
 │                         CometNativeShuffleWriter                             │
-│  - Constructs protobuf operator plan                                         │
-│  - Invokes native execution via CometExec.getCometIterator()                 │
+│  - Builds protobuf operator plan: ShuffleWriter(child = childNativeOp)       │
+│  - Reads per-partition leaf iterators from CometNativeShuffleInputIterator   │
+│  - Drives one CometExecIterator per partition                                │
 └─────────────────────────────────────────────────────────────────────────────┘
                                       │
                                       ▼ (JNI)
@@ -103,13 +104,14 @@ Native shuffle (`CometExchange`) is selected when all of the following condition
 
 ### Scala Side
 
-| Class                          | Location                                         | Description                                                                                   |
-| ------------------------------ | ------------------------------------------------ | --------------------------------------------------------------------------------------------- |
-| `CometShuffleExchangeExec`     | `.../shuffle/CometShuffleExchangeExec.scala`     | Physical plan node. Validates types and partitioning, creates `CometShuffleDependency`.       |
-| `CometNativeShuffleWriter`     | `.../shuffle/CometNativeShuffleWriter.scala`     | Implements `ShuffleWriter`. Builds protobuf plan and invokes native execution.                |
-| `CometShuffleDependency`       | `.../shuffle/CometShuffleDependency.scala`       | Extends `ShuffleDependency`. Holds shuffle type, schema, and range partition bounds.          |
-| `CometBlockStoreShuffleReader` | `.../shuffle/CometBlockStoreShuffleReader.scala` | Reads shuffle blocks via `ShuffleBlockFetcherIterator`. Decodes Arrow IPC to `ColumnarBatch`. |
-| `NativeBatchDecoderIterator`   | `.../shuffle/NativeBatchDecoderIterator.scala`   | Reads compressed Arrow IPC from input stream. Calls native decode via JNI.                    |
+| Class                          | Location                                         | Description                                                                                                                                         |
+| ------------------------------ | ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `CometShuffleExchangeExec`     | `.../shuffle/CometShuffleExchangeExec.scala`     | Physical plan node. Validates types and partitioning, creates `CometShuffleDependency`.                                                             |
+| `CometNativeShuffleWriter`     | `.../shuffle/CometNativeShuffleWriter.scala`     | Implements `ShuffleWriter`. Builds the unified `ShuffleWriter(child = childNativeOp)` plan and runs it in one `CometExecIterator` per partition.    |
+| `CometShuffleDependency`       | `.../shuffle/CometShuffleDependency.scala`       | Extends `ShuffleDependency`. Holds shuffle type, schema, range partition bounds, and (native shuffle only) a `NativeShuffleSpec`.                   |
+| `CometNativeShuffleInputRDD`   | `.../shuffle/CometNativeShuffleInputRDD.scala`   | Thin scheduling-anchor RDD on the native-shuffle path. `compute` returns a `CometNativeShuffleInputIterator` carrying per-partition leaf iterators. |
+| `CometBlockStoreShuffleReader` | `.../shuffle/CometBlockStoreShuffleReader.scala` | Reads shuffle blocks via `ShuffleBlockFetcherIterator`. Decodes Arrow IPC to `ColumnarBatch`.                                                       |
+| `NativeBatchDecoderIterator`   | `.../shuffle/NativeBatchDecoderIterator.scala`   | Reads compressed Arrow IPC from input stream. Calls native decode via JNI.                                                                          |
 
 ### Rust Side
 
@@ -123,11 +125,19 @@ Native shuffle (`CometExchange`) is selected when all of the following condition
 
 ### Write Path
 
-1. **Plan construction**: `CometNativeShuffleWriter` builds a protobuf operator plan containing:
-   - A scan operator reading from the input iterator
-   - A `ShuffleWriter` operator with partitioning config and compression codec
-
-2. **Native execution**: `CometExec.getCometIterator()` executes the plan in Rust.
+1. **Plan construction**: `CometNativeShuffleWriter` builds a protobuf operator tree with a
+   `ShuffleWriter` operator at the root and `childNativeOp` as its child. `childNativeOp` takes
+   one of two shapes:
+   - The child plan's `nativeOp` directly, when `CometShuffleExchangeExec`'s child is a
+     `CometNativeExec` subtree. The upstream operators run inside the same `CometExecIterator`
+     as the writer, with no JVM-to-native batch boundary between them.
+   - A synthetic `Scan("ShuffleWriterInput")` placeholder, when the dep was built via the
+     convenience `prepareShuffleDependency(rdd, ...)` overload (used by
+     `CometCollectLimitExec` and `CometTakeOrderedAndProjectExec`, or when the
+     exchange's child is a non-native `CometPlan` such as `CometSparkToColumnarExec`). Native
+     code reads `ColumnarBatch`es from the JVM input iterator via Arrow C Stream Interface.
+
+2. **Native execution**: A single `CometExecIterator` per partition runs the unified plan.
 
 3. **Partitioning**: `ShuffleWriterExec` receives batches and routes to the appropriate partitioner:
    - `MultiPartitionShuffleRepartitioner`: For hash/range/round-robin partitioning
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala
index 29b9745366..811c5bd3be 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala
@@ -26,20 +26,11 @@ import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.comet.CometShuffleBlockIterator
 
 /**
- * Thin RDD that anchors Spark scheduling for the native-shuffle path. Native execution itself is
- * driven by [[CometNativeShuffleWriter]] using the unified `ShuffleWriter(child = childNativeOp)`
- * plan.
- *
- * The RDD's role here is twofold:
- *   - declare `OneToOneDependency` on each leaf input RDD so the DAGScheduler walks the lineage
- *     and triggers prior stages, broadcast materialization, etc.
- *   - construct the per-partition leaf iterators (and shuffle-block iterators where applicable)
- *     in `compute`, packaged into a [[CometNativeShuffleInputIterator]] that the writer downcasts
- *     to extract the inputs it needs to feed the native plan.
- *
- * The iterator returned by `compute` always reports `hasNext = false`. Spark's `ShuffleMapTask`
- * will hand it to `writer.write`; the writer ignores it as an iterator and reads its exposed
- * fields directly.
+ * Thin scheduling-anchor RDD for the native-shuffle path. Declares `OneToOneDependency` on each
+ * leaf input RDD (so the DAGScheduler triggers prior stages, broadcasts, etc.) and constructs
+ * per-partition leaf iterators in `compute`, packaged into a [[CometNativeShuffleInputIterator]].
+ * The iterator reports `hasNext = false`; [[CometNativeShuffleWriter]] downcasts it and reads the
+ * leaf iterators directly to drive the unified `ShuffleWriter(child = childNativeOp)` plan.
  */
 private[shuffle] class CometNativeShuffleInputRDD(
     sc: SparkContext,
@@ -114,6 +105,6 @@ private[shuffle] class CometNativeShuffleInputIterator(
 
   override def next(): Product2[Int, ColumnarBatch] =
     throw new NoSuchElementException(
-      "CometNativeShuffleInputIterator does not produce elements; CometNativeShuffleWriter " +
-        "drives native execution via the iterator's exposed fields.")
+      "CometNativeShuffleInputIterator should never be drained as an iterator. Reaching this " +
+        "code means a non-Comet ShuffleWriter is consuming the input, which is a bug.")
 }
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
index 68078e8507..21395c58fd 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
@@ -32,7 +32,7 @@ import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriteMetricsR
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal}
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning, SinglePartition}
-import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, NativeExecContext, PlanDataInjector}
+import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, PlanDataInjector}
 import org.apache.spark.sql.execution.metric.SQLMetric
 
 import org.apache.comet.{CometConf, CometExecIterator}
@@ -40,26 +40,16 @@ import org.apache.comet.serde.{OperatorOuterClass, PartitioningOuterClass, Query
 import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, Operator}
 
 /**
- * A [[ShuffleWriter]] that drives the native shuffle write in a single [[CometExecIterator]] per
- * partition. The unified plan it executes has [[OperatorOuterClass.ShuffleWriter]] at the root
- * with `childNativeOp` as its only child. Leaf input iterators come from
- * [[CometNativeShuffleInputIterator]] (constructed by [[CometNativeShuffleInputRDD.compute]]).
- *
- * Two flavors of `childNativeOp` are in use:
- *   - rich Comet native subtree (e.g. HashAgg / Filter / ShuffleScan), supplied by
- *     [[CometShuffleExchangeExec]] when its child is a
- *     [[org.apache.spark.sql.comet.CometNativeExec]].
- *   - synthetic `Scan("ShuffleWriterInput")` placeholder, supplied by the convenience overload of
- *     [[CometShuffleExchangeExec.prepareShuffleDependency]] for callers that already hold an
- *     `RDD[ColumnarBatch]` of native-driven batches (e.g.
- *     [[org.apache.spark.sql.comet.CometCollectLimitExec]]).
- *
- * The writer treats both shapes identically.
+ * Drives the native shuffle write in a single [[CometExecIterator]] per partition. The plan is
+ * `ShuffleWriter(child = childNativeOp)`; leaf iterators come from a
+ * [[CometNativeShuffleInputIterator]]. `childNativeOp` is either a rich Comet native subtree
+ * (when fed by [[CometShuffleExchangeExec]] with a [[org.apache.spark.sql.comet.CometNativeExec]]
+ * child) or a synthetic `Scan("ShuffleWriterInput")` placeholder (the
+ * [[CometShuffleExchangeExec.prepareShuffleDependency]] convenience overload). Same handling
+ * either way.
  */
 class CometNativeShuffleWriter[K, V](
-    childNativeOp: Operator,
-    childMetricNode: CometMetricNode,
-    nativeContext: NativeExecContext,
+    spec: NativeShuffleSpec,
     outputPartitioning: Partitioning,
     outputAttributes: Seq[Attribute],
     metrics: Map[String, SQLMetric],
@@ -87,19 +77,28 @@ class CometNativeShuffleWriter[K, V](
     val tempDataFilePath = Paths.get(tempDataFilename)
     val tempIndexFilePath = Paths.get(tempIndexFilename)
 
-    // Pull the per-partition leaf iterators and partition index from the iterator handed to us
-    // by Spark's ShuffleMapTask. CometNativeShuffleInputRDD.compute always returns this exact
-    // iterator type; no other RDD layers between produce a Product2[Int, ColumnarBatch].
-    val shuffleInputIter = inputs.asInstanceOf[CometNativeShuffleInputIterator]
+    // The dep's _rdd is always a CometNativeShuffleInputRDD on this path. Pattern-match instead
+    // of asInstanceOf so a future RDD-layering change produces a clear error here rather than a
+    // bare ClassCastException deeper in the stack.
+    val shuffleInputIter = inputs match {
+      case it: CometNativeShuffleInputIterator => it
+      case other =>
+        throw new IllegalStateException(
+          "CometNativeShuffleWriter expects its input iterator to be a " +
+            "CometNativeShuffleInputIterator (produced by CometNativeShuffleInputRDD), got " +
+            s"${other.getClass.getName}")
+    }
     val partitionIdx = shuffleInputIter.partitionIndex
     val leafIterators = shuffleInputIter.leafIterators
     val shuffleBlockIters = shuffleInputIter.shuffleBlockIterators
 
     val unifiedPlan = buildUnifiedPlan(tempDataFilename, tempIndexFilename)
-    val finalNativePlan = if (nativeContext.commonByKey.nonEmpty) {
-      val partitionDataByKey =
-        nativeContext.perPartitionByKey.map { case (k, arr) => k -> arr(partitionIdx) }
-      PlanDataInjector.injectPlanData(unifiedPlan, nativeContext.commonByKey, partitionDataByKey)
+    val ctx = spec.execContext
+    val finalNativePlan = if (ctx.commonByKey.nonEmpty) {
+      val partitionDataByKey = ctx.perPartitionByKey.map { case (k, arr) =>
+        k -> arr(partitionIdx)
+      }
+      PlanDataInjector.injectPlanData(unifiedPlan, ctx.commonByKey, partitionDataByKey)
     } else {
       unifiedPlan
     }
@@ -119,30 +118,29 @@ class CometNativeShuffleWriter[K, V](
       "write_time" -> metricsWriteTime) ++
       metrics.filterKeys(detailedMetrics.contains)
 
-    // ShuffleWriter metrics live at the root of the metric tree; the child operator's metric
-    // tree (rich subtree or empty leaf for the Scan placeholder) is attached underneath so the
-    // SQL UI sees the same per-node breakdown the split-driver flow produced.
-    val nativeMetrics = CometMetricNode(shuffleWriterSQLMetrics, Seq(childMetricNode))
+    // ShuffleWriter metrics at the root; child's metric tree underneath so the SQL UI's per-node
+    // breakdown matches what the split-driver flow showed.
+    val nativeMetrics = CometMetricNode(shuffleWriterSQLMetrics, Seq(spec.childMetricNode))
 
     val cometIter = new CometExecIterator(
       CometExec.newIterId,
       leafIterators,
       outputAttributes.length,
-      PlanDataInjector.serializeOperator(finalNativePlan),
+      CometExec.serializeNativePlan(finalNativePlan),
       nativeMetrics,
       numParts,
       partitionIdx,
-      nativeContext.broadcastedHadoopConfForEncryption,
-      nativeContext.encryptedFilePaths,
+      ctx.broadcastedHadoopConfForEncryption,
+      ctx.encryptedFilePaths,
       shuffleBlockIters)
 
     // Register subqueries against the iterator id so native callbacks resolve them to values.
-    nativeContext.subqueries.foreach { sub =>
+    ctx.subqueries.foreach { sub =>
       CometScalarSubquery.setSubquery(cometIter.id, sub)
     }
     Option(context).foreach { taskCtx =>
       taskCtx.addTaskCompletionListener[Unit] { _ =>
-        nativeContext.subqueries.foreach { sub =>
+        ctx.subqueries.foreach { sub =>
           CometScalarSubquery.removeSubquery(cometIter.id, sub)
         }
       }
@@ -334,7 +332,7 @@ class CometNativeShuffleWriter[K, V](
     OperatorOuterClass.Operator
       .newBuilder()
       .setShuffleWriter(shuffleWriterBuilder)
-      .addChildren(childNativeOp)
+      .addChildren(spec.childNativeOp)
       .build()
   }
 
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala
index 9ec25c49ec..2a05843007 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala
@@ -34,13 +34,23 @@ import org.apache.spark.sql.types.StructType
 
 import org.apache.comet.serde.OperatorOuterClass
 
+/**
+ * Bundle of context the native shuffle write path needs at task time. Co-populated for native
+ * shuffles only; consolidated into a single field on [[CometShuffleDependency]] so it cannot be
+ * partially set.
+ */
+case class NativeShuffleSpec(
+    childNativeOp: OperatorOuterClass.Operator,
+    childMetricNode: CometMetricNode,
+    execContext: NativeExecContext)
+
 /**
  * A [[ShuffleDependency]] that allows us to identify the shuffle dependency as a Comet shuffle.
  *
- * On the native-shuffle path, also carries the child plan's per-partition execution context, root
- * native operator, and metric node so [[CometNativeShuffleWriter]] can drive the unified
- * `ShuffleWriter(child = childNativeOp)` plan in a single [[org.apache.comet.CometExecIterator]]
- * per partition. These three fields are populated only when `shuffleType == CometNativeShuffle`.
+ * On the native-shuffle path, also carries a [[NativeShuffleSpec]] so
+ * [[CometNativeShuffleWriter]] can drive the unified `ShuffleWriter(child = childNativeOp)` plan
+ * in a single [[org.apache.comet.CometExecIterator]] per partition. `nativeShuffleSpec` is
+ * populated only when `shuffleType == CometNativeShuffle`.
  */
 class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     @transient private val _rdd: RDD[_ <: Product2[K, V]],
@@ -58,9 +68,7 @@ class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     val shuffleWriteMetrics: Map[String, SQLMetric] = Map.empty,
     val numParts: Int = 0,
     val rangePartitionBounds: Option[Seq[InternalRow]] = None,
-    val nativeExecContext: Option[NativeExecContext] = None,
-    val childNativeOp: Option[OperatorOuterClass.Operator] = None,
-    val childMetricNode: Option[CometMetricNode] = None)
+    val nativeShuffleSpec: Option[NativeShuffleSpec] = None)
     extends ShuffleDependency[K, V, C](
       _rdd,
       partitioner,
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
index 16bd40d402..1470d637d9 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
@@ -102,14 +102,11 @@ case class CometShuffleExchangeExec(
     new UnsafeRowSerializer(child.output.size, longMetric("dataSize"))
 
   /**
-   * Per-partition execution context for the child native subtree, computed once and shared
-   * between [[inputRDD]] (which uses it to wire DAGScheduler dependencies) and
-   * [[shuffleDependency]] (which threads it through to [[CometNativeShuffleWriter]] for
-   * single-iterator native execution). Only populated when `shuffleType == CometNativeShuffle`
-   * AND the child is a [[CometNativeExec]] subtree we can inline. When the child is a non-native
-   * Comet plan (e.g. [[org.apache.spark.sql.comet.CometSparkToColumnarExec]]), this stays `None`
-   * and the shuffle falls back to the legacy `Scan("ShuffleWriterInput") -> ShuffleWriter` plan
-   * via the convenience overload of `prepareShuffleDependency`.
+   * Single-driver native-shuffle context, computed once and shared between [[inputRDD]] and
+   * [[shuffleDependency]]. `Some` only when `shuffleType == CometNativeShuffle` AND the child is
+   * a [[CometNativeExec]] subtree. Otherwise the dep is built via the
+   * [[CometShuffleExchangeExec.prepareShuffleDependency]] convenience overload (synthetic Scan
+   * placeholder).
    */
   @transient private lazy val nativeChildContext: Option[NativeExecContext] = child match {
     case nativeChild: CometNativeExec if shuffleType == CometNativeShuffle =>
@@ -120,23 +117,19 @@ case class CometShuffleExchangeExec(
   @transient lazy val inputRDD: RDD[_] = if (shuffleType == CometNativeShuffle) {
     nativeChildContext match {
       case Some(ctx) =>
-        // Single-driver path: thin scheduling anchor; CometNativeShuffleWriter drives the
-        // unified ShuffleWriter + child plan in a single CometExecIterator per partition.
         new CometNativeShuffleInputRDD(
           sparkContext,
           ctx.inputs,
           ctx.numPartitions,
           ctx.shuffleScanIndices)
       case None =>
-        // Child is a Comet plan but not a CometNativeExec subtree (e.g. CometSparkToColumnarExec).
-        // No native subtree to inline; the writer's plan is `Scan("ShuffleWriterInput") ->
-        // ShuffleWriter` and JVM batches flow into native through Arrow C Stream Interface.
+        // Non-native child (e.g. CometSparkToColumnarExec): no subtree to inline. The dep gets
+        // built via the legacy convenience overload below; we just need a real RDD of batches.
         child.executeColumnar()
     }
   } else if (shuffleType == CometColumnarShuffle) {
-    // CometColumnarShuffle uses Spark's row-based execute() API. For Spark row-based plans,
-    // rows flow directly. For Comet native plans, their doExecute() wraps with ColumnarToRowExec
-    // to convert columnar batches to rows.
+    // Row-based shuffle. CometNativeExec.doExecute wraps columnar output with
+    // ColumnarToRowExec; non-Comet children flow through directly.
     child.execute()
   } else {
     throw new UnsupportedOperationException(
@@ -180,12 +173,11 @@ case class CometShuffleExchangeExec(
     if (shuffleType == CometNativeShuffle) {
       val dep = nativeChildContext match {
         case Some(ctx) =>
-          // Single-driver path: child is a CometNativeExec subtree. RangePartitioning needs real
-          // rows to compute partition bounds; use a regular columnar execution of the child for
-          // sampling only. The actual shuffle still goes through the single-iterator path.
           val nativeChild = child.asInstanceOf[CometNativeExec]
+          // RangePartitioner needs real rows for sampling. Reuse the precomputed context so we
+          // don't re-walk the SparkPlan tree or re-broadcast the encryption Hadoop conf.
           val samplingRDD: Option[RDD[ColumnarBatch]] = outputPartitioning match {
-            case _: RangePartitioning => Some(child.executeColumnar())
+            case _: RangePartitioning => Some(nativeChild.executeColumnarWithContext(ctx))
             case _ => None
           }
           CometShuffleExchangeExec.prepareNativeShuffleDependency(
@@ -195,12 +187,11 @@ case class CometShuffleExchangeExec(
             outputPartitioning,
             serializer,
             metrics,
-            ctx,
-            nativeChild.nativeOp,
-            CometMetricNode.fromCometPlan(nativeChild))
+            NativeShuffleSpec(
+              nativeChild.nativeOp,
+              CometMetricNode.fromCometPlan(nativeChild),
+              ctx))
         case None =>
-          // Child is a non-native Comet plan; the writer falls back to its Scan-placeholder
-          // path via the convenience overload of prepareShuffleDependency.
           CometShuffleExchangeExec.prepareShuffleDependency(
             inputRDD.asInstanceOf[RDD[ColumnarBatch]],
             child.output,
@@ -733,18 +724,16 @@ object CometShuffleExchangeExec
       outputPartitioning,
       serializer,
       metrics,
-      ctx,
-      scanOp,
-      CometMetricNode(Map.empty))
+      NativeShuffleSpec(scanOp, CometMetricNode(Map.empty), ctx))
   }
 
   /**
    * Build a Comet native shuffle dependency for the [[CometShuffleExchangeExec]] case where the
    * shuffle is fed by a [[CometNativeExec]] child. The writer drives the unified
    * `ShuffleWriter(child = childNativeOp)` plan in a single
-   * [[org.apache.comet.CometExecIterator]] per partition. The returned dep carries the child's
-   * per-partition execution context, root native operator, and metric node so
-   * [[CometNativeShuffleWriter]] can reach them at task time.
+   * [[org.apache.comet.CometExecIterator]] per partition. The returned dep carries the
+   * [[NativeShuffleSpec]] so [[CometNativeShuffleWriter]] can reach the child's per-partition
+   * execution context, root native operator, and metric node at task time.
    *
    * @param thinRDD
    *   scheduling-anchor RDD whose `compute` returns a [[CometNativeShuffleInputIterator]];
@@ -760,9 +749,7 @@ object CometShuffleExchangeExec
       outputPartitioning: Partitioning,
       serializer: Serializer,
       metrics: Map[String, SQLMetric],
-      nativeExecContext: NativeExecContext,
-      childNativeOp: OperatorOuterClass.Operator,
-      childMetricNode: CometMetricNode): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = {
+      spec: NativeShuffleSpec): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = {
     val numParts = thinRDD.getNumPartitions
 
     // The code block below is mostly brought over from
@@ -821,7 +808,7 @@ object CometShuffleExchangeExec
           None)
     }
 
-    val dependency = new CometShuffleDependency[Int, ColumnarBatch, ColumnarBatch](
+    new CometShuffleDependency[Int, ColumnarBatch, ColumnarBatch](
       thinRDD,
       serializer = serializer,
       shuffleWriterProcessor = ShuffleExchangeExec.createShuffleWriteProcessor(metrics),
@@ -833,10 +820,7 @@ object CometShuffleExchangeExec
       shuffleWriteMetrics = metrics,
       numParts = numParts,
       rangePartitionBounds = rangePartitionBounds,
-      nativeExecContext = Some(nativeExecContext),
-      childNativeOp = Some(childNativeOp),
-      childMetricNode = Some(childMetricNode))
-    dependency
+      nativeShuffleSpec = Some(spec))
   }
 
   /**
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala
index 16a21bbfd9..bd69e91898 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala
@@ -231,9 +231,7 @@ class CometShuffleManager(conf: SparkConf) extends ShuffleManager with Logging {
       case cometShuffleHandle: CometNativeShuffleHandle[K @unchecked, V @unchecked] =>
         val dep = cometShuffleHandle.dependency.asInstanceOf[CometShuffleDependency[_, _, _]]
         new CometNativeShuffleWriter(
-          dep.childNativeOp.get,
-          dep.childMetricNode.get,
-          dep.nativeExecContext.get,
+          dep.nativeShuffleSpec.get,
           dep.outputPartitioning.get,
           dep.outputAttributes,
           dep.shuffleWriteMetrics,
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index 748362cd65..fa7aa6f1b2 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -383,15 +383,11 @@ object CometExec {
 
 /**
  * Per-partition execution context for a native subtree rooted at a [[CometNativeExec]] boundary.
- *
- * Built once on the driver from the SparkPlan tree and consumed by either:
- *   - [[CometNativeExec.doExecuteColumnar]] to construct a [[CometExecRDD]], or
- *   - the native-shuffle path, where [[CometNativeShuffleWriter]] drives the same child plan with
- *     a `ShuffleWriter` operator as its root.
- *
- * The fields capture everything that depends on tree-walking the SparkPlan and aligning leaf
- * input RDDs (broadcast partition counts, plan-data, subqueries, encryption options) so callers
- * do not have to re-derive them.
+ * Built once on the driver from the SparkPlan tree, then consumed by either
+ * [[CometNativeExec.executeColumnarWithContext]] (to build a [[CometExecRDD]]) or the
+ * native-shuffle path (to drive [[CometNativeShuffleWriter]]). Captures broadcast partition
+ * alignment, plan-data, subqueries, and encryption options so each consumer doesn't re-walk the
+ * tree.
  */
 private[comet] case class NativeExecContext(
     inputs: Seq[RDD[ColumnarBatch]],
@@ -402,7 +398,15 @@ private[comet] case class NativeExecContext(
     commonByKey: Map[String, Array[Byte]],
     perPartitionByKey: Map[String, Array[Array[Byte]]],
     shuffleScanIndices: Set[Int],
-    hasScanInput: Boolean)
+    hasScanInput: Boolean) {
+  // Catch shape divergence (e.g. broadcast scans with different partition counts after DPP
+  // filtering) at construction so consumers don't trip ArrayIndexOutOfBoundsException at
+  // partition idx access time.
+  require(
+    perPartitionByKey.values.forall(_.length == numPartitions),
+    s"All per-partition arrays must have length $numPartitions, but found: " +
+      perPartitionByKey.map { case (key, arr) => s"$key -> ${arr.length}" }.mkString(", "))
+}
 
 /**
  * A Comet native physical operator.
@@ -442,90 +446,68 @@ abstract class CometNativeExec extends CometExec {
     runningSubqueries.clear()
   }
 
-  override def doExecuteColumnar(): RDD[ColumnarBatch] = {
-    serializedPlanOpt.plan match {
-      case None =>
-        // This is in the middle of a native execution, it should not be executed directly.
-        throw new CometRuntimeException(
-          s"CometNativeExec should not be executed directly without a serialized plan: $this")
-      case Some(serializedPlan) =>
-        // TODO: support native metrics for all operators.
-        val nativeMetrics = CometMetricNode.fromCometPlan(this)
-        val ctx = buildNativeContext()
-
-        new CometExecRDD(
-          sparkContext,
-          ctx.inputs,
-          ctx.commonByKey,
-          ctx.perPartitionByKey,
-          serializedPlan,
-          ctx.numPartitions,
-          output.length,
-          nativeMetrics,
-          ctx.subqueries,
-          ctx.broadcastedHadoopConfForEncryption,
-          ctx.encryptedFilePaths,
-          ctx.shuffleScanIndices) {
-          override def compute(
-              split: Partition,
-              context: TaskContext): Iterator[ColumnarBatch] = {
-            val res = super.compute(split, context)
-
-            // Report scan input metrics only when the native plan contains a scan.
-            if (ctx.hasScanInput) {
-              Option(context).foreach(nativeMetrics.reportScanInputMetrics)
-            }
+  override def doExecuteColumnar(): RDD[ColumnarBatch] =
+    executeColumnarWithContext(buildNativeContext())
 
-            res
-          }
+  /**
+   * Build a [[CometExecRDD]] from a precomputed [[NativeExecContext]]. Public so the native
+   * shuffle path can sample (RangePartitioning) without re-walking the SparkPlan tree and
+   * re-broadcasting the encryption Hadoop conf.
+   */
+  private[comet] def executeColumnarWithContext(ctx: NativeExecContext): RDD[ColumnarBatch] = {
+    val serializedPlan = serializedPlanOpt.plan.getOrElse(
+      throw new CometRuntimeException(
+        s"CometNativeExec should not be executed directly without a serialized plan: $this"))
+    val nativeMetrics = CometMetricNode.fromCometPlan(this)
+
+    new CometExecRDD(
+      sparkContext,
+      ctx.inputs,
+      ctx.commonByKey,
+      ctx.perPartitionByKey,
+      serializedPlan,
+      ctx.numPartitions,
+      output.length,
+      nativeMetrics,
+      ctx.subqueries,
+      ctx.broadcastedHadoopConfForEncryption,
+      ctx.encryptedFilePaths,
+      ctx.shuffleScanIndices) {
+      override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
+        val res = super.compute(split, context)
+        if (ctx.hasScanInput) {
+          Option(context).foreach(nativeMetrics.reportScanInputMetrics)
         }
+        res
+      }
     }
   }
 
   /**
    * Walk this CometNativeExec subtree once and gather everything needed to launch native
-   * execution: leaf input RDDs (with broadcast partition counts aligned to the first
-   * non-broadcast plan), per-partition planning data, subqueries, encryption options, and
-   * shuffle-scan indices. See [[NativeExecContext]] for the full set of fields.
-   *
-   * Used by [[doExecuteColumnar]] (CometExecRDD path) and by the native-shuffle path
-   * (CometShuffleExchangeExec) so both observe the same input alignment.
+   * execution. See [[NativeExecContext]] for the field set.
    */
   private[comet] def buildNativeContext(): NativeExecContext = {
-    // Go over all the native scans, in order to see if they need encryption options.
-    // For each relation in a CometNativeScan generate a hadoopConf,
-    // for each file path in a relation associate with hadoopConf
-    // This is done per native plan, so only count scans until a comet input is reached.
+    // Find native scans that need encryption: build a hadoopConf per relation, broadcast it once
+    // so executors can decrypt on read. Capped at one because we only broadcast one conf per
+    // CometExecIterator (see #2447 for history of the per-relation map approach).
     val encryptionOptions =
       mutable.ArrayBuffer.empty[(Broadcast[SerializableConfiguration], Seq[String])]
     foreachUntilCometInput(this) {
       case scan: CometNativeScanExec =>
-        // This creates a hadoopConf that brings in any SQLConf "spark.hadoop.*" configs and
-        // per-relation configs since different tables might have different decryption
-        // properties.
         val hadoopConf = scan.relation.sparkSession.sessionState
           .newHadoopConfWithOptions(scan.relation.options)
-        val encryptionEnabled = CometParquetUtils.encryptionEnabled(hadoopConf)
-        if (encryptionEnabled) {
-          // hadoopConf isn't serializable, so we have to do a broadcasted config.
-          val broadcastedConf =
-            scan.relation.sparkSession.sparkContext
-              .broadcast(new SerializableConfiguration(hadoopConf))
-
-          val optsTuple: (Broadcast[SerializableConfiguration], Seq[String]) =
-            (broadcastedConf, scan.relation.inputFiles.toSeq)
-          encryptionOptions += optsTuple
+        if (CometParquetUtils.encryptionEnabled(hadoopConf)) {
+          val broadcastedConf = scan.relation.sparkSession.sparkContext
+            .broadcast(new SerializableConfiguration(hadoopConf))
+          encryptionOptions += ((broadcastedConf, scan.relation.inputFiles.toSeq))
         }
-      case _ => // no-op
+      case _ =>
     }
     assert(
       encryptionOptions.size <= 1,
       "We expect one native scan that requires encryption reading in a Comet plan," +
         " since we will broadcast one hadoopConf.")
-    // If this assumption changes in the future, you can look at the commit history of #2447
-    // to see how there used to be a map of relations to broadcasted confs in case multiple
-    // relations in a single plan. The example that came up was UNION. See discussion at:
-    // https://github.com/apache/datafusion-comet/pull/2447#discussion_r2406118264
     val (broadcastedHadoopConfForEncryption, encryptedFilePaths) =
       encryptionOptions.headOption match {
         case Some((conf, paths)) => (Some(conf), paths)

From cc7c5bedf9665f6671038cdeac273c09ca5c7fa9 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 10:09:16 -0400
Subject: [PATCH 04/13] handle arrow type mismatches on child stream in native
 shuffle.

---
 native/core/src/execution/operators/mod.rs    |   2 +
 .../src/execution/operators/schema_align.rs   | 266 ++++++++++++++++++
 native/core/src/execution/planner.rs          |  27 +-
 native/proto/src/proto/operator.proto         |   6 +
 .../shuffle/CometNativeShuffleWriter.scala    |   9 +
 5 files changed, 307 insertions(+), 3 deletions(-)
 create mode 100644 native/core/src/execution/operators/schema_align.rs

diff --git a/native/core/src/execution/operators/mod.rs b/native/core/src/execution/operators/mod.rs
index 4b2c06575d..b679d35b19 100644
--- a/native/core/src/execution/operators/mod.rs
+++ b/native/core/src/execution/operators/mod.rs
@@ -32,6 +32,8 @@ pub use parquet_writer::ParquetWriterExec;
 mod csv_scan;
 pub mod projection;
 mod scan;
+mod schema_align;
 mod shuffle_scan;
 pub use csv_scan::init_csv_datasource_exec;
+pub use schema_align::SchemaAlignExec;
 pub use shuffle_scan::ShuffleScanExec;
diff --git a/native/core/src/execution/operators/schema_align.rs b/native/core/src/execution/operators/schema_align.rs
new file mode 100644
index 0000000000..3d207e0202
--- /dev/null
+++ b/native/core/src/execution/operators/schema_align.rs
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! `SchemaAlignExec` reshapes its child's output so the per-column Arrow type and field-level
+//! nullability match what Spark catalyst declared. Used between an inlined native subtree and
+//! `ShuffleWriterExec` when the FFI deep-copy + `ScanExec` cast in `build_record_batch` are both
+//! gone, so DataFusion / `datafusion-spark` return-type drift would otherwise be written into
+//! shuffle blocks. See <https://github.com/apache/datafusion-comet/issues/4515> for the running
+//! list of mismatched functions.
+
+use arrow::array::{ArrayRef, RecordBatch, RecordBatchOptions};
+use arrow::compute::{cast_with_options, CastOptions};
+use arrow::datatypes::{Field, Schema, SchemaRef};
+use datafusion::common::DataFusionError;
+use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
+use datafusion::{
+    execution::TaskContext,
+    physical_plan::{
+        DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
+        RecordBatchStream, SendableRecordBatchStream,
+    },
+};
+use futures::{Stream, StreamExt};
+use std::{
+    any::Any,
+    collections::HashSet,
+    pin::Pin,
+    sync::{Arc, Mutex, OnceLock},
+    task::{Context, Poll},
+};
+
+/// Process-wide set of `(column, actual, expected)` signatures we have already warned about.
+/// Each schema drift produces the same warning on every partition of every query that runs
+/// the offending expression; deduping here keeps logs readable while still surfacing each
+/// distinct mismatch once.
+fn warn_dedup() -> &'static Mutex<HashSet<String>> {
+    static SET: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
+    SET.get_or_init(|| Mutex::new(HashSet::new()))
+}
+
+#[derive(Debug)]
+pub struct SchemaAlignExec {
+    child: Arc<dyn ExecutionPlan>,
+    target_schema: SchemaRef,
+    column_actions: Arc<Vec<ColumnAction>>,
+    cache: Arc<PlanProperties>,
+}
+
+#[derive(Debug, Clone)]
+enum ColumnAction {
+    /// Pass the input column through unchanged. Any nullability/metadata difference is
+    /// absorbed when the batch is re-stamped via `RecordBatch::try_new_with_options`.
+    Passthrough,
+    /// Cast the input column to the target data_type.
+    Cast,
+}
+
+impl SchemaAlignExec {
+    /// Build a SchemaAlignExec that aligns `child`'s output to `expected`. Returns
+    /// `Ok(child)` unchanged when no per-column reshape is needed; otherwise wraps `child`
+    /// in a SchemaAlignExec whose target schema preserves `expected`'s data_type and metadata
+    /// but widens nullability to `actual.nullable || expected.nullable` (matching the
+    /// reconciliation rule used at the FFI boundary on `main`).
+    pub fn try_new_or_passthrough(
+        child: Arc<dyn ExecutionPlan>,
+        expected: &SchemaRef,
+    ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+        let actual = child.schema();
+        if actual.fields().len() != expected.fields().len() {
+            return Err(DataFusionError::Plan(format!(
+                "SchemaAlignExec: expected {} fields, child produces {}",
+                expected.fields().len(),
+                actual.fields().len()
+            )));
+        }
+        let mut needs_alignment = false;
+        let mut actions = Vec::with_capacity(actual.fields().len());
+        let mut target_fields = Vec::with_capacity(actual.fields().len());
+        for (idx, (actual_field, expected_field)) in actual
+            .fields()
+            .iter()
+            .zip(expected.fields().iter())
+            .enumerate()
+        {
+            let action = if actual_field.data_type() == expected_field.data_type() {
+                ColumnAction::Passthrough
+            } else {
+                let signature = format!(
+                    "{}|{:?}|{:?}",
+                    expected_field.name(),
+                    actual_field.data_type(),
+                    expected_field.data_type()
+                );
+                if warn_dedup().lock().unwrap().insert(signature) {
+                    log::warn!(
+                        "ShuffleWriter input schema mismatch on col[{idx}] '{}': child produced \
+                         {:?}, catalyst declared {:?}. Inserting a cast; please file the upstream \
+                         function bug at https://github.com/apache/datafusion-comet/issues/4515.",
+                        expected_field.name(),
+                        actual_field.data_type(),
+                        expected_field.data_type()
+                    );
+                }
+                ColumnAction::Cast
+            };
+            let target_nullable = actual_field.is_nullable() || expected_field.is_nullable();
+            let field_changed = !matches!(action, ColumnAction::Passthrough)
+                || target_nullable != actual_field.is_nullable()
+                || expected_field.metadata() != actual_field.metadata()
+                || expected_field.name() != actual_field.name();
+            if field_changed {
+                needs_alignment = true;
+            }
+            target_fields.push(
+                Field::new(
+                    expected_field.name(),
+                    expected_field.data_type().clone(),
+                    target_nullable,
+                )
+                .with_metadata(expected_field.metadata().clone()),
+            );
+            actions.push(action);
+        }
+        if !needs_alignment {
+            return Ok(child);
+        }
+        let target_schema: SchemaRef = Arc::new(Schema::new(target_fields));
+        let cache = Arc::new(PlanProperties::new(
+            EquivalenceProperties::new(Arc::clone(&target_schema)),
+            child.output_partitioning().clone(),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        ));
+        Ok(Arc::new(Self {
+            child,
+            target_schema,
+            column_actions: Arc::new(actions),
+            cache,
+        }))
+    }
+}
+
+impl DisplayAs for SchemaAlignExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(f, "CometSchemaAlignExec")
+            }
+            DisplayFormatType::TreeRender => unimplemented!(),
+        }
+    }
+}
+
+impl ExecutionPlan for SchemaAlignExec {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.target_schema)
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.child]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> datafusion::common::Result<Arc<dyn ExecutionPlan>> {
+        // Rebuild PlanProperties from the new child since `output_partitioning` may differ.
+        let new_child = Arc::clone(&children[0]);
+        let cache = Arc::new(PlanProperties::new(
+            EquivalenceProperties::new(Arc::clone(&self.target_schema)),
+            new_child.output_partitioning().clone(),
+            EmissionType::Incremental,
+            Boundedness::Bounded,
+        ));
+        Ok(Arc::new(Self {
+            child: new_child,
+            target_schema: Arc::clone(&self.target_schema),
+            column_actions: Arc::clone(&self.column_actions),
+            cache,
+        }))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> datafusion::common::Result<SendableRecordBatchStream> {
+        let child_stream = self.child.execute(partition, context)?;
+        Ok(Box::pin(SchemaAlignStream {
+            child_stream,
+            target_schema: Arc::clone(&self.target_schema),
+            column_actions: Arc::clone(&self.column_actions),
+        }))
+    }
+
+    fn properties(&self) -> &Arc<PlanProperties> {
+        &self.cache
+    }
+
+    fn name(&self) -> &str {
+        "CometSchemaAlignExec"
+    }
+}
+
+struct SchemaAlignStream {
+    child_stream: SendableRecordBatchStream,
+    target_schema: SchemaRef,
+    column_actions: Arc<Vec<ColumnAction>>,
+}
+
+impl SchemaAlignStream {
+    fn align(&self, batch: RecordBatch) -> Result<RecordBatch, DataFusionError> {
+        let mut columns: Vec<ArrayRef> = Vec::with_capacity(batch.num_columns());
+        for (idx, action) in self.column_actions.iter().enumerate() {
+            let column = batch.column(idx);
+            let aligned = match action {
+                ColumnAction::Passthrough => Arc::clone(column),
+                ColumnAction::Cast => cast_with_options(
+                    column,
+                    self.target_schema.field(idx).data_type(),
+                    &CastOptions::default(),
+                )?,
+            };
+            columns.push(aligned);
+        }
+        let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
+        RecordBatch::try_new_with_options(Arc::clone(&self.target_schema), columns, &options)
+            .map_err(DataFusionError::from)
+    }
+}
+
+impl Stream for SchemaAlignStream {
+    type Item = datafusion::common::Result<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        match self.child_stream.poll_next_unpin(cx) {
+            Poll::Ready(Some(Ok(batch))) => Poll::Ready(Some(self.align(batch))),
+            other => other,
+        }
+    }
+}
+
+impl RecordBatchStream for SchemaAlignStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.target_schema)
+    }
+}
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index c6160bddd4..dff8275f84 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -26,7 +26,9 @@ use crate::execution::operators::IcebergScanExec;
 use crate::execution::{
     expressions::list_positions::ListPositionsExpr,
     expressions::subquery::Subquery,
-    operators::{ExecutionError, ExpandExec, ParquetWriterExec, ScanExec, ShuffleScanExec},
+    operators::{
+        ExecutionError, ExpandExec, ParquetWriterExec, ScanExec, SchemaAlignExec, ShuffleScanExec,
+    },
     planner::expression_registry::ExpressionRegistry,
     planner::operator_registry::OperatorRegistry,
     serde::to_arrow_datatype,
@@ -1515,9 +1517,14 @@ impl PhysicalPlanner {
                 let (scans, shuffle_scans, child) =
                     self.create_plan(&children[0], inputs, partition_count)?;
 
+                let writer_input = align_shuffle_writer_input(
+                    Arc::clone(&child.native_plan),
+                    &writer.expected_output_schema,
+                )?;
+
                 let partitioning = self.create_partitioning(
                     writer.partitioning.as_ref().unwrap(),
-                    child.native_plan.schema(),
+                    writer_input.schema(),
                 )?;
 
                 let codec = match writer.codec.try_into() {
@@ -1535,7 +1542,7 @@ impl PhysicalPlanner {
 
                 let write_buffer_size = writer.write_buffer_size as usize;
                 let shuffle_writer = Arc::new(ShuffleWriterExec::try_new(
-                    Arc::clone(&child.native_plan),
+                    writer_input,
                     partitioning,
                     codec,
                     writer.output_data_file.clone(),
@@ -3124,6 +3131,20 @@ fn convert_spark_types_to_arrow_schema(
     arrow_schema
 }
 
+/// Wrap `child` in a `SchemaAlignExec` when its output drifts from what Spark catalyst
+/// declared. See <https://github.com/apache/datafusion-comet/issues/4515>.
+fn align_shuffle_writer_input(
+    child: Arc<dyn ExecutionPlan>,
+    expected_proto: &[spark_operator::SparkStructField],
+) -> Result<Arc<dyn ExecutionPlan>, ExecutionError> {
+    if expected_proto.is_empty() {
+        return Ok(child);
+    }
+    let expected = convert_spark_types_to_arrow_schema(expected_proto);
+    SchemaAlignExec::try_new_or_passthrough(child, &expected)
+        .map_err(|e| ExecutionError::DataFusionError(e.to_string()))
+}
+
 /// Converts a protobuf PartitionValue to an iceberg Literal.
 ///
 fn partition_value_to_literal(
diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto
index 7f50aa928c..da498087b9 100644
--- a/native/proto/src/proto/operator.proto
+++ b/native/proto/src/proto/operator.proto
@@ -327,6 +327,12 @@ message ShuffleWriter {
   // Size of the write buffer in bytes used when writing shuffle data to disk.
   // Larger values may improve write performance but use more memory.
   int32 write_buffer_size = 8;
+  // Spark-declared output schema of the writer's child. When the child is an inlined native
+  // subtree, the native planner casts the child's actual output to this schema before
+  // serializing to shuffle blocks, since there is no FFI boundary or ScanExec between them
+  // to absorb DataFusion-vs-Spark type drift. Empty when the child is a placeholder Scan;
+  // that path already has a cast point upstream.
+  repeated SparkStructField expected_output_schema = 9;
 }
 
 message ParquetWriter {
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
index 21395c58fd..9fe70f16b5 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala
@@ -34,10 +34,12 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning, SinglePartition}
 import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, PlanDataInjector}
 import org.apache.spark.sql.execution.metric.SQLMetric
+import org.apache.spark.sql.types.StructField
 
 import org.apache.comet.{CometConf, CometExecIterator}
 import org.apache.comet.serde.{OperatorOuterClass, PartitioningOuterClass, QueryPlanSerde}
 import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, Operator}
+import org.apache.comet.serde.operator.schema2Proto
 
 /**
  * Drives the native shuffle write in a single [[CometExecIterator]] per partition. The plan is
@@ -329,6 +331,13 @@ class CometNativeShuffleWriter[K, V](
 
     shuffleWriterBuilder.setTracingEnabled(CometConf.COMET_TRACING_ENABLED.get())
 
+    // Used by the native planner to cast the inlined child's output when DataFusion's
+    // declared return type drifts from Spark catalyst (see comet#4515).
+    val expectedFields = outputAttributes
+      .map(a => StructField(a.name, a.dataType, a.nullable, a.metadata))
+      .toArray
+    schema2Proto(expectedFields).foreach(shuffleWriterBuilder.addExpectedOutputSchema)
+
     OperatorOuterClass.Operator
       .newBuilder()
       .setShuffleWriter(shuffleWriterBuilder)

From c76a26399357576a3f699dde6eb767c58ea7018f Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 14:08:04 -0400
Subject: [PATCH 05/13] stash

---
 native/core/src/execution/planner.rs          |  30 ++++-
 native/proto/src/proto/operator.proto         |   6 +
 .../org/apache/comet/CometBatchIterator.java  |  16 +++
 .../org/apache/comet/CometExecIterator.scala  |  21 ++++
 .../apache/comet/rules/CometExecRule.scala    |  24 ++++
 .../apache/comet/rules/CometScanRule.scala    |  11 ++
 .../comet/serde/operator/CometSink.scala      |  74 ++++++++++++
 .../org/apache/comet/vector/NativeUtil.scala  |  22 ++++
 .../shuffle/CometShuffleExchangeExec.scala    |  33 ++++++
 .../shuffle/CometShuffledRowRDD.scala         |  29 ++++-
 .../apache/spark/sql/comet/operators.scala    |  48 +++++++-
 .../comet/exec/CometAggregateSuite.scala      |  93 +++++++++++++++
 .../apache/comet/exec/CometExecSuite.scala    | 111 ++++++++++++++++++
 13 files changed, 515 insertions(+), 3 deletions(-)

diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index dff8275f84..bf90eb20cc 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -1034,6 +1034,14 @@ impl PhysicalPlanner {
                 ))
             }
             OpStruct::HashAgg(agg) => {
+                // [#4515 instrumentation] Every HashAgg construction with proto sizes.
+                log::warn!(
+                    "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} result_exprs.len={} mode={}",
+                    agg.grouping_exprs.len(),
+                    agg.agg_exprs.len(),
+                    agg.result_exprs.len(),
+                    agg.mode
+                );
                 assert_eq!(children.len(), 1);
                 let (scans, shuffle_scans, child) =
                     self.create_plan(&children[0], inputs, partition_count)?;
@@ -1173,7 +1181,17 @@ impl PhysicalPlanner {
                     })
                     .collect();
 
-                if agg.result_exprs.is_empty() {
+                if !agg.apply_result_projection {
+                    // [#4515 instrumentation] Confirm whether a native HashAggregate is being
+                    // built with empty result_exprs (catalyst-pruned EXISTS / count(*) subquery)
+                    // and what its natural schema would be.
+                    log::warn!(
+                        "[#4515] HashAgg apply_result_projection=false: emitting aggregate natural schema={:?} grouping_exprs.len={} agg_exprs.len={} result_exprs.len={}",
+                        aggregate.schema(),
+                        agg.grouping_exprs.len(),
+                        agg.agg_exprs.len(),
+                        agg.result_exprs.len()
+                    );
                     Ok((
                         scans,
                         shuffle_scans,
@@ -1443,6 +1461,16 @@ impl PhysicalPlanner {
             OpStruct::Scan(scan) => {
                 let data_types = scan.fields.iter().map(to_arrow_datatype).collect_vec();
 
+                // [#4515 instrumentation] Log every JVM-bridge Scan's declared column count.
+                // A 0-column scan paired with a JVM iterator producing batches with columns is
+                // the AIOOBE-on-exportBatch shape; a non-empty list confirms the inverse.
+                log::info!(
+                    "[#4515] ScanExec source='{}' declared {} cols: {:?}",
+                    scan.source,
+                    data_types.len(),
+                    data_types
+                );
+
                 // If it is not test execution context for unit test, we should have at least one
                 // input source
                 if self.exec_context_id != TEST_EXEC_CONTEXT_ID && inputs.is_empty() {
diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto
index da498087b9..921e5bbf35 100644
--- a/native/proto/src/proto/operator.proto
+++ b/native/proto/src/proto/operator.proto
@@ -303,6 +303,12 @@ message HashAggregate {
   // Offset in the child's output where aggregate buffer attributes start.
   // Used by PartialMerge to locate state fields in the input.
   int32 initial_input_buffer_offset = 7;
+  // Whether the native planner should wrap the aggregate in a ProjectionExec using
+  // `result_exprs`. Disambiguates `result_exprs` empty-because-absent (no projection
+  // needed) from empty-because-catalyst-pruned-to-zero-cols (project to nothing).
+  // Without this bit, both wire as `repeated` length 0 and were collapsed to "no
+  // projection," leaking grouping keys when Spark intent was 0 columns.
+  bool apply_result_projection = 8;
 }
 
 message Limit {
diff --git a/spark/src/main/java/org/apache/comet/CometBatchIterator.java b/spark/src/main/java/org/apache/comet/CometBatchIterator.java
index 9b48a47c57..1392f028c0 100644
--- a/spark/src/main/java/org/apache/comet/CometBatchIterator.java
+++ b/spark/src/main/java/org/apache/comet/CometBatchIterator.java
@@ -38,6 +38,8 @@ public class CometBatchIterator {
   private final NativeUtil nativeUtil;
   private ColumnarBatch previousBatch = null;
   private ColumnarBatch currentBatch = null;
+  // [#4515 instrumentation] gate first-batch log per instance
+  private boolean loggedFirstBatch = false;
 
   CometBatchIterator(Iterator<ColumnarBatch> input, NativeUtil nativeUtil) {
     this.input = input;
@@ -79,6 +81,20 @@ public int next(long[] arrayAddrs, long[] schemaAddrs) {
       return -1;
     }
 
+    // [#4515 instrumentation] Log first-batch shape per CometBatchIterator instance.
+    if (!loggedFirstBatch) {
+      loggedFirstBatch = true;
+      org.slf4j.LoggerFactory.getLogger("[#4515]")
+          .warn(
+              "CometBatchIterator.next first batch: numCols={} numRows={} arrayAddrs.length={} schemaAddrs.length={} inputCls={} this={}",
+              currentBatch.numCols(),
+              currentBatch.numRows(),
+              arrayAddrs.length,
+              schemaAddrs.length,
+              input.getClass().getName(),
+              System.identityHashCode(this));
+    }
+
     // export the batch using the Arrow C Data Interface
     int numRows = nativeUtil.exportBatch(arrayAddrs, schemaAddrs, currentBatch);
 
diff --git a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala
index 6140eca553..98a9bc0684 100644
--- a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala
+++ b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala
@@ -92,6 +92,27 @@ class CometExecIterator(
     val conf = SparkEnv.get.conf
     val localDiskDirs = SparkEnv.get.blockManager.getLocalDiskDirs
 
+    // [#4515 instrumentation] Dump proto plan + per-input iterator class so we can correlate
+    // a 0-col Scan in the proto with the JVM iterator that will feed it.
+    {
+      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+      val parsed =
+        scala.util
+          .Try(
+            org.apache.comet.serde.OperatorOuterClass.Operator
+              .parseFrom(protobufQueryPlan)
+              .toString)
+          .toOption
+          .getOrElse("<failed to parse>")
+      val itersDesc = inputIterators.zipWithIndex
+        .map { case (it, idx) => s"  inputIterators[$idx] cls=${it.getClass.getName}" }
+        .mkString("\n")
+      log.warn(
+        s"CometExecIterator constructing plan id=$id partition=$partitionIndex " +
+          s"numParts=$numParts numOutputCols=$numOutputCols\n$itersDesc\n" +
+          s"  proto plan:\n$parsed")
+    }
+
     // serialize Comet related Spark configs in protobuf format
     val protobufSparkConfigs = CometExecIterator.serializeCometSQLConfs()
 
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
index aeb7db40ad..6eb314bbb6 100644
--- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
@@ -166,6 +166,16 @@ case class CometExecRule(session: SparkSession)
             val reverted =
               s.originalPlan.withNewChildren(Seq(s.child)).asInstanceOf[ShuffleExchangeExec]
             reverted.setTagValue(CometExecRule.SKIP_COMET_SHUFFLE_TAG, ())
+            // [#4515 instrumentation] Log every revert: this is a primary place where
+            // vanilla ShuffleExchangeExec instances enter the plan post-Comet-rewrite.
+            org.slf4j.LoggerFactory
+              .getLogger("[#4515]")
+              .warn(
+                s"revertRedundantColumnarShuffle revert: parentAgg=${op.getClass.getSimpleName} " +
+                  s"(output=${op.output}) cometShuffle.output=${s.output} " +
+                  s"reverted.output=${reverted.output} reverted.identityHash=${System
+                      .identityHashCode(reverted)}\n" +
+                  s"  reverted tree:\n${reverted.treeString(verbose = true, addSuffix = false)}")
             logInfo(
               "Reverting Comet columnar shuffle to Spark shuffle between " +
                 s"${op.getClass.getSimpleName} and ${s.child.getClass.getSimpleName} " +
@@ -546,6 +556,20 @@ case class CometExecRule(session: SparkSession)
            |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")}
            |""".stripMargin)
     }
+    // [#4515 instrumentation] Dump the plan in/out of CometExecRule to correlate with the
+    // 0-col Scan synthesized later. Logs the operator-class diff so we can see what
+    // CometExecRule did or didn't replace, especially around ShuffleExchangeExec wrappers
+    // inside subqueries.
+    if (!newPlan.fastEquals(plan)) {
+      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+      log.warn(
+        "CometExecRule rewrote plan:\n  IN classes: " +
+          plan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
+          "\n  OUT classes: " +
+          newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
+          s"\n  IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" +
+          s"\n  OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}")
+    }
     newPlan
   }
 
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
index 7601fa1c6b..1f7f509db0 100644
--- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -68,6 +68,17 @@ case class CometScanRule(session: SparkSession)
            |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")}
            |""".stripMargin)
     }
+    // [#4515 instrumentation]
+    if (!newPlan.fastEquals(plan)) {
+      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+      log.warn(
+        "CometScanRule rewrote plan:\n  IN classes: " +
+          plan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
+          "\n  OUT classes: " +
+          newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
+          s"\n  IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" +
+          s"\n  OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}")
+    }
     newPlan
   }
 
diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala
index 845803d133..8a1c576a2d 100644
--- a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala
@@ -49,6 +49,12 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] {
       op: T,
       builder: Operator.Builder,
       childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = {
+    // [#4515 instrumentation]
+    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+    log.warn(
+      s"CometSink[${this.getClass.getSimpleName}].convert op=${op.getClass.getName} " +
+        s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " +
+        s"output.size=${op.output.size}")
     val supportedTypes =
       op.output.forall(a => supportedDataType(a.dataType, allowComplex = true))
 
@@ -72,6 +78,53 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] {
     }
 
     if (scanTypes.length == op.output.length) {
+      // [#4515 instrumentation] Log when we synthesize a Scan with zero declared columns.
+      // The runtime JVM iterator may still produce columns (subquery output shrunk by
+      // catalyst before serialization while the underlying RDD reflects the pre-shrink shape),
+      // tripping the column-count guard in NativeUtil.exportBatch.
+      if (scanTypes.isEmpty) {
+        val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+        // scalastyle:off line.size.limit
+        val childInfo = op.children.zipWithIndex
+          .map { case (c, i) =>
+            val canonOut = scala.util
+              .Try(c.canonicalized.output)
+              .toOption
+              .map(_.toString)
+              .getOrElse("<canonicalize failed>")
+            s"  child[$i] cls=${c.getClass.getName} simpleString='${c.simpleString(
+                80)}' output=${c.output} outputSize=${c.output.size} identityHash=${System
+                .identityHashCode(c)} canonicalized.output=$canonOut"
+          }
+          .mkString("\n")
+        val opCanonOut = scala.util
+          .Try(op.canonicalized.output)
+          .toOption
+          .map(_.toString)
+          .getOrElse("<canonicalize failed>")
+        val subqueryInfo = scala.util
+          .Try(op.subqueries.map(s => s"${s.getClass.getName}(output=${s.output}, prepared=?)"))
+          .toOption
+          .getOrElse(Nil)
+          .mkString("[", ", ", "]")
+        val callerStack =
+          new RuntimeException("[#4515] CometSink 0-col Scan caller").getStackTrace
+            .take(20)
+            .map(f => s"    at ${f}")
+            .mkString("\n")
+        log.warn(s"CometSink synthesizing 0-col Scan for op=${op.getClass.getName}\n" +
+          s"  simpleString='${op.simpleStringWithNodeId()}'\n" +
+          s"  op.output=${op.output} op.outputSet=${op.outputSet} op.references=${op.references}\n" +
+          s"  op.canonicalized.output=$opCanonOut\n" +
+          s"  op.subqueries=$subqueryInfo\n" +
+          s"  op identityHash=${System.identityHashCode(op)}\n" +
+          s"  children classes=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}\n" +
+          childInfo + "\n" +
+          s"  caller stack:\n$callerStack\n" +
+          s"  op tree:\n${op.treeString(verbose = true, addSuffix = false)}")
+        // scalastyle:on line.size.limit
+      }
+
       scanBuilder.addAllFields(scanTypes.asJava)
 
       // Sink operators don't have children
@@ -94,6 +147,27 @@ object CometExchangeSink extends CometSink[SparkPlan] {
       op: SparkPlan,
       builder: Operator.Builder,
       childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = {
+    // [#4515 instrumentation]
+    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+    val isVanillaSparkExchange =
+      op.getClass.getName == "org.apache.spark.sql.execution.exchange.ShuffleExchangeExec"
+    log.warn(
+      s"CometExchangeSink.convert op=${op.getClass.getName} " +
+        s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " +
+        s"useShuffleScan=${shouldUseShuffleScan(op)} " +
+        s"children=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}")
+    if (isVanillaSparkExchange) {
+      val callerStack =
+        new RuntimeException("[#4515] vanilla ShuffleExchangeExec caller").getStackTrace
+          .take(20)
+          .map(f => s"    at ${f}")
+          .mkString("\n")
+      log.warn(
+        "  vanilla ShuffleExchangeExec being processed by CometExchangeSink:\n" +
+          s"  output=${op.output}\n" +
+          s"  caller stack:\n$callerStack\n" +
+          s"  op tree:\n${op.treeString(verbose = true, addSuffix = false)}")
+    }
     if (shouldUseShuffleScan(op)) {
       convertToShuffleScan(op, builder)
     } else {
diff --git a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala
index 4f027cd9e7..388f5114e6 100644
--- a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala
+++ b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala
@@ -114,6 +114,28 @@ class NativeUtil {
       batch: ColumnarBatch): Int = {
     val numRows = mutable.ArrayBuffer.empty[Int]
 
+    if (arrayAddrs.length != batch.numCols() || schemaAddrs.length != batch.numCols()) {
+      val schemaSummary = (0 until batch.numCols())
+        .map { i =>
+          val v = batch.column(i) match {
+            case cv: CometVector => cv.getValueVector
+            case _ => null
+          }
+          if (v != null) s"col[$i]: ${v.getField}"
+          else s"col[$i]: ${batch.column(i).getClass.getName}"
+        }
+        .mkString("; ")
+      val taskAttempt = Option(org.apache.spark.TaskContext.get())
+        .map(c => s"stage=${c.stageId} task=${c.taskAttemptId} partition=${c.partitionId}")
+        .getOrElse("no-task")
+      throw new SparkException(
+        "CometBatchIterator column-count mismatch [#4515 instrumentation]: " +
+          s"native expected arrayAddrs=${arrayAddrs.length}, schemaAddrs=${schemaAddrs.length}; " +
+          s"JVM iterator produced batch.numCols=${batch.numCols()} ($taskAttempt). " +
+          s"Batch schema: $schemaSummary",
+        new RuntimeException("placeholder for exportBatch column-count mismatch"))
+    }
+
     (0 until batch.numCols()).foreach { index =>
       batch.column(index) match {
         case a: CometVector =>
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
index 1470d637d9..ba152c2f13 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
@@ -243,6 +243,23 @@ case class CometShuffleExchangeExec(
    * Comet returns RDD[ColumnarBatch] for columnar execution.
    */
   protected override def doExecuteColumnar(): RDD[ColumnarBatch] = {
+    // [#4515 instrumentation] Track every doExecuteColumnar call: which CometShuffleExchange
+    // instance, its output, and the call site. Helps confirm whether this PR's changes
+    // cause an extra EnsureRequirements-inserted vanilla Exchange to wrap us, and what
+    // RDD is being plumbed where.
+    {
+      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+      val callerStack = new RuntimeException(
+        "[#4515] CometShuffleExchangeExec.doExecuteColumnar caller").getStackTrace
+        .take(15)
+        .map(f => s"    at ${f}")
+        .mkString("\n")
+      log.warn(
+        s"CometShuffleExchangeExec.doExecuteColumnar this=${System.identityHashCode(this)} " +
+          s"shuffleType=$shuffleType outputPartitioning=$outputPartitioning " +
+          s"output=$output\n" +
+          s"  caller stack:\n$callerStack")
+    }
     // Returns the same CometShuffledBatchRDD if this plan is used by multiple plans.
     if (cachedShuffleRDD == null) {
       cachedShuffleRDD = new CometShuffledBatchRDD(shuffleDependency, readMetrics)
@@ -687,6 +704,22 @@ object CometShuffleExchangeExec
       serializer: Serializer,
       metrics: Map[String, SQLMetric]): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = {
 
+    // [#4515 instrumentation] Track every placeholder-Scan ShuffleDependency we build, with
+    // outputAttributes (drives Scan declared schema) and call site.
+    {
+      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+      val callerStack =
+        new RuntimeException("[#4515] prepareShuffleDependency caller").getStackTrace
+          .take(15)
+          .map(f => s"    at ${f}")
+          .mkString("\n")
+      log.warn(
+        s"prepareShuffleDependency outputAttributes=$outputAttributes " +
+          s"outputAttributes.size=${outputAttributes.size} " +
+          s"outputPartitioning=$outputPartitioning rdd.numPartitions=${rdd.getNumPartitions}\n" +
+          s"  caller stack:\n$callerStack")
+    }
+
     val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput")
     val scanTypes = outputAttributes.flatten { attr =>
       QueryPlanSerde.serializeDataType(attr.dataType)
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala
index 7604910b06..e0e5c45b2a 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala
@@ -162,7 +162,34 @@ class CometShuffledBatchRDD(
   override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
     val reader = createReader(split, context)
     // TODO: Reads IPC by native code
-    reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2)
+    val raw = reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2)
+    // [#4515 instrumentation] Peek the first decoded batch to confirm wire schema vs caller
+    // expectations. Wraps so we don't consume the iterator.
+    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+    new Iterator[ColumnarBatch] {
+      private var logged = false
+      override def hasNext: Boolean = raw.hasNext
+      override def next(): ColumnarBatch = {
+        val b = raw.next()
+        if (!logged) {
+          logged = true
+          val schemaSummary = (0 until b.numCols())
+            .map { i =>
+              val v = b.column(i) match {
+                case cv: org.apache.comet.vector.CometVector => cv.getValueVector
+                case _ => null
+              }
+              if (v != null) s"col[$i]: ${v.getField}"
+              else s"col[$i]: ${b.column(i).getClass.getName}"
+            }
+            .mkString("; ")
+          log.warn(s"CometShuffledBatchRDD.compute first decoded batch: numCols=${b.numCols()} " +
+            s"numRows=${b.numRows()} stage=${context.stageId()} task=${context.taskAttemptId()} " +
+            s"partition=${split.index} schema=[$schemaSummary]")
+        }
+        b
+      }
+    }
   }
 
   override def clearDependencies(): Unit = {
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index fa7aa6f1b2..310767489d 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -422,7 +422,15 @@ abstract class CometNativeExec extends CometExec {
   /** The Comet native operator */
   def nativeOp: Operator
 
-  override protected def doPrepare(): Unit = prepareSubqueries(this)
+  override protected def doPrepare(): Unit = {
+    // [#4515 instrumentation] Track when subqueries are prepared for this CometNativeExec.
+    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+    log.warn(
+      s"CometNativeExec.doPrepare this=${System.identityHashCode(this)} " +
+        s"cls=${this.getClass.getName} " +
+        s"originalPlan.cls=${originalPlan.getClass.getName}")
+    prepareSubqueries(this)
+  }
 
   override lazy val metrics: Map[String, SQLMetric] =
     CometMetricNode.baselineMetrics(sparkContext)
@@ -563,6 +571,14 @@ abstract class CometNativeExec extends CometExec {
     // same partition number. But for Comet, we need to zip them so we need to adjust the
     // partition number of Broadcast RDDs to make sure they have the same partition number.
     sparkPlans.zipWithIndex.foreach { case (plan, idx) =>
+      // [#4515 instrumentation] Log every JVM-side input plan we wire to native, so we can
+      // correlate the Scan's declared schema with the runtime plan whose RDD feeds it.
+      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
+      log.warn(
+        s"buildNativeContext binding input[$idx] cls=${plan.getClass.getName} " +
+          s"simpleString='${plan.simpleStringWithNodeId()}' output=${plan.output} " +
+          s"output.size=${plan.output.size} identityHash=${System.identityHashCode(plan)}\n" +
+          s"  subtree:\n${plan.treeString(verbose = true, addSuffix = false)}")
       plan match {
         case c: CometBroadcastExchangeExec =>
           inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
@@ -1516,6 +1532,25 @@ trait CometBaseAggregate {
     // If the aggregateExpressions is empty, we only want to build groupingExpressions,
     // and skip processing of aggregateExpressions.
     if (aggregateExpressions.isEmpty) {
+      // [#4515 instrumentation] Track HashAgg serializations with empty resultExpressions.
+      // Catalyst prunes resultExpressions for EXISTS / row-existence-only subqueries; the
+      // native side currently interprets empty result_exprs as "use aggregate natural
+      // schema", which leaks grouping keys into the output.
+      org.slf4j.LoggerFactory
+        .getLogger("[#4515]")
+        .warn(
+          "HashAgg empty-aggExprs branch: " +
+            s"groupingExprs=${groupingExpressions} " +
+            s"resultExpressions=${resultExpressions} " +
+            s"resultExpressions.size=${resultExpressions.size} " +
+            s"aggregate.output=${aggregate.output} " +
+            s"aggregate.output.size=${aggregate.output.size} " +
+            s"modes(from aggExprs)=${modes} " +
+            s"sparkFinalMode=$sparkFinalMode " +
+            s"requiredChildDistribution=${aggregate.requiredChildDistribution} " +
+            s"isProjectionToEmpty=${resultExpressions.isEmpty && aggregate.output.isEmpty} " +
+            s"naturalEqualsIntent=${resultExpressions.map(_.toAttribute) == groupingExpressions
+                .map(_.toAttribute)}")
       val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder()
       hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava)
       val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes
@@ -1528,6 +1563,14 @@ trait CometBaseAggregate {
         return None
       }
       hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava)
+      // Force native to apply the projection when Spark's expected output (which may
+      // be empty for catalyst-pruned EXISTS / row-existence-only subqueries) differs
+      // from the aggregate's natural grouping output. Without this, an empty proto
+      // result_exprs is indistinguishable from "no projection needed" and the natural
+      // grouping keys leak through.
+      val naturalEqualsIntent =
+        resultExpressions.map(_.toAttribute) == groupingExpressions.map(_.toAttribute)
+      hashAggBuilder.setApplyResultProjection(!naturalEqualsIntent)
       Some(builder.setHashAgg(hashAggBuilder).build())
     } else {
       // Validate mode combinations. We support:
@@ -1608,6 +1651,9 @@ trait CometBaseAggregate {
             return None
           }
           hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava)
+          // Final aggs always need the projection (matches the existing condition under
+          // which we serialize result_exprs at all). See comet#4515.
+          hashAggBuilder.setApplyResultProjection(true)
         }
         hashAggBuilder.setModeValue(mode.getNumber)
 
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
index cd0beb56cc..f098040878 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
@@ -2109,4 +2109,97 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  // Regression for comet#4515: a HashAggregateExec whose `resultExpressions` (and therefore
+  // `output`) catalyst has pruned to empty must still produce 0-col batches at runtime.
+  // Catalyst prunes `resultExpressions=[]` for plans where the aggregate's column values are
+  // unused downstream - classically EXISTS subqueries that get rewritten into a literal-`1`
+  // wrapper. Before the fix, the native HashAggregate emitted its natural output (the
+  // grouping keys) regardless of the pruned JVM `output`, so any boundary that derived a
+  // schema from `output` (e.g. a wrapping vanilla Spark `ShuffleExchangeExec.output =
+  // child.output = []`, or a JVM-bridge Scan synthesized from the same `output`) declared
+  // 0 columns while the runtime RDD produced 1. The mismatch tripped
+  // `NativeUtil.exportBatch` with an ArrayIndexOutOfBoundsException on a length-0
+  // schemaAddrs[].
+  //
+  // The bug needs a specific catalyst optimizer state: `HashAggregateExec.
+  // resultExpressions.isEmpty`. Whether the optimizer reaches that state from a given SQL
+  // depends on Spark version, scan source (LocalTableScan vs Parquet/native), AQE state,
+  // and which Comet rules already fired - we observed it in the SQL-tests harness running
+  // `subquery/exists-subquery/exists-orderby-limit.sql` on Spark 4.0.2 with parquet-backed
+  // temp views, but not via `Dataset.collect` over `LocalTableScan`-backed temp views
+  // under `CometTestBase`. So the test below writes parquet (matching the harness's scan
+  // shape), tries several known triggers, runs whichever (if any) produces the bug shape
+  // under `checkSparkAnswer`, and skips cleanly otherwise. The upstream SQL-tests run
+  // remains the primary safety net for the harness-only path.
+  test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output (#4515)") {
+    withTempDir { dir =>
+      withTempView("emp", "dept") {
+        // Write parquet so Comet's native scan path (vs LocalTableScan) is the source -
+        // matches the SQL-tests harness setup that surfaced the bug.
+        val empPath = new java.io.File(dir, "emp").getAbsolutePath
+        val deptPath = new java.io.File(dir, "dept").getAbsolutePath
+
+        spark
+          .sql("""SELECT * FROM VALUES
+                 |  (100, 'emp 1', 100.0D, 10),
+                 |  (200, 'emp 2', 200.0D, 10),
+                 |  (300, 'emp 3', 300.0D, 20),
+                 |  (400, 'emp 4', 400.0D, 30),
+                 |  (500, 'emp 5', 400.0D, NULL),
+                 |  (700, 'emp 7', 400.0D, 100),
+                 |  (800, 'emp 8', 150.0D, 70)
+                 |AS t(id, emp_name, salary, dept_id)""".stripMargin)
+          .write
+          .parquet(empPath)
+        spark
+          .sql("""SELECT * FROM VALUES
+                 |  (10, 'CA'), (20, 'NY'), (30, 'TX'),
+                 |  (40, 'OR'), (50, 'NJ'), (70, 'FL')
+                 |AS t(dept_id, state)""".stripMargin)
+          .write
+          .parquet(deptPath)
+
+        spark.read.parquet(empPath).createOrReplaceTempView("emp")
+        spark.read.parquet(deptPath).createOrReplaceTempView("dept")
+
+        val candidates = Seq(
+          // The original failing SQL from the harness - EXISTS with grouped agg + LIMIT/OFFSET.
+          """SELECT * FROM emp
+            |WHERE EXISTS (
+            |  SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin,
+          // Inline view + outer constant: ColumnPruning may strip the inner agg's output.
+          """SELECT 1 FROM (
+            |  SELECT max(dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin,
+          // Scalar subquery returning a constant.
+          """SELECT (SELECT 1 FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin,
+          // count(*) over a derived table: outer doesn't reference inner cols.
+          """SELECT count(*) FROM (
+            |  SELECT max(dept_id) AS m FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin)
+
+        // Find a candidate whose plan has a HashAggregateExec (Spark or Comet) with empty
+        // resultExpressions. collectWithSubqueries traverses Subquery nodes too.
+        val triggering = candidates.find { sql =>
+          val plan = spark.sql(sql).queryExecution.executedPlan
+          collectWithSubqueries(plan) {
+            case a: org.apache.spark.sql.execution.aggregate.HashAggregateExec
+                if a.resultExpressions.isEmpty =>
+              a
+            case a: CometHashAggregateExec if a.resultExpressions.isEmpty => a
+          }.nonEmpty
+        }
+
+        triggering match {
+          case Some(sql) => checkSparkAnswer(sql)
+          case None =>
+            cancel(
+              "No candidate query produced a HashAggregateExec with empty resultExpressions " +
+                "in this environment. The catalyst-pruned shape that exercises #4515 only " +
+                "appears under specific optimizer/AQE state we couldn't reproduce here. The " +
+                "upstream SQL-tests run (subquery/exists-subquery/exists-orderby-limit.sql) " +
+                "covers this path.")
+        }
+      }
+    }
+  }
+
 }
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala
index 16601d056b..fded42d050 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala
@@ -3959,6 +3959,117 @@ class CometExecSuite extends CometTestBase {
     }
   }
 
+  // Repro for the Spark 3.5 SQL-tests failure on subquery/exists-subquery/exists-orderby-limit.sql
+  // (query #19) on branch opt_native_shuffle. Crashes with NativeUtil.exportBatch:132 AIOOBE
+  // on this branch but passes on main. The SQL file declares three CONFIG_DIM1 combos for
+  // codegen; the failure trace doesn't say which one fires it, so sweep all three.
+  test("EXISTS subquery with GROUP BY + LIMIT + OFFSET") {
+    withTempView("emp", "dept") {
+      spark
+        .sql("""SELECT * FROM VALUES
+               |  (100, 'emp 1', date '2005-01-01', 100.0D, 10),
+               |  (200, 'emp 2', date '2003-01-01', 200.0D, 10),
+               |  (300, 'emp 3', date '2002-01-01', 300.0D, 20),
+               |  (400, 'emp 4', date '2005-01-01', 400.0D, 30),
+               |  (500, 'emp 5', date '2001-01-01', 400.0D, NULL),
+               |  (700, 'emp 7', date '2010-01-01', 400.0D, 100),
+               |  (800, 'emp 8', date '2016-01-01', 150.0D, 70)
+               |AS t(id, emp_name, hiredate, salary, dept_id)""".stripMargin)
+        .createOrReplaceTempView("emp")
+      spark
+        .sql("""SELECT * FROM VALUES
+               |  (10, 'dept 1', 'CA'),
+               |  (20, 'dept 2', 'NY'),
+               |  (30, 'dept 3', 'TX'),
+               |  (40, 'dept 4 - unassigned', 'OR'),
+               |  (50, 'dept 5 - unassigned', 'NJ'),
+               |  (70, 'dept 7', 'FL')
+               |AS t(dept_id, dept_name, state)""".stripMargin)
+        .createOrReplaceTempView("dept")
+
+      val configDims = Seq(
+        Map("spark.sql.codegen.wholeStage" -> "true"),
+        Map(
+          "spark.sql.codegen.wholeStage" -> "false",
+          "spark.sql.codegen.factoryMode" -> "CODEGEN_ONLY"),
+        Map(
+          "spark.sql.codegen.wholeStage" -> "false",
+          "spark.sql.codegen.factoryMode" -> "NO_CODEGEN"))
+
+      // Mirror the SQL test order: queries #1 through #16 from the file run before #17 (the
+      // one whose CI failure we're chasing). Query #11's subquery is identical except for
+      // the OFFSET; #13 and #15 share similar shapes. Subquery materialization / AQE plan
+      // cache state from running them first may alter query #17's executed shape.
+      val priorQueries = Seq(
+        // #1
+        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""",
+        // #2
+        """SELECT id, hiredate FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate DESC""",
+        // #3
+        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""",
+        // #4
+        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""",
+        // #5
+        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""",
+        // #6
+        """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state)""",
+        // #7
+        """SELECT count(*) FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept_id ORDER BY dept_id)""",
+        // #8
+        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""",
+        // #9
+        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""",
+        // #10
+        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1)""",
+        // #11 - same subquery as #17 minus the OFFSET
+        """SELECT * FROM emp WHERE EXISTS (SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1)""",
+        // #12
+        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 100 LIMIT 1)""",
+        // #13
+        """SELECT * FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) FROM dept WHERE dept.dept_id > 100 GROUP BY state LIMIT 1)""",
+        // #14
+        """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state LIMIT 2 OFFSET 1)""",
+        // #15
+        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1 OFFSET 2)""",
+        // #16
+        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > emp.dept_id LIMIT 1)""")
+
+      for (dim <- configDims) {
+        // SQL-tests harness (dev/diffs/3.5.8.diff, 4.1.1.diff) sets only:
+        //   spark.comet.enabled, spark.comet.exec.enabled, spark.comet.exec.shuffle.enabled,
+        //   spark.comet.parquet.respectFilterPushdown, spark.shuffle.manager,
+        //   spark.comet.memoryOverhead.
+        // It does NOT enable spark.comet.sparkToColumnar.enabled, but CometTestBase does.
+        // Force the harness shape so the reproducer matches the failing config.
+        val harnessConf = dim ++ Map(CometConf.COMET_SPARK_TO_ARROW_ENABLED.key -> "false")
+        withSQLConf(harnessConf.toSeq: _*) {
+          // scalastyle:off println
+          println(s"\n===== config dim: $harnessConf =====")
+          for ((q, idx) <- priorQueries.zipWithIndex) {
+            spark.sql(q).collect()
+            println(s"--- ran prior query #${idx + 1} ---")
+          }
+          val sql = """SELECT *
+                      |FROM emp
+                      |WHERE EXISTS (SELECT max(dept.dept_id)
+                      |              FROM dept
+                      |              GROUP BY state
+                      |              LIMIT 1
+                      |              OFFSET 2)""".stripMargin
+          val df = spark.sql(sql)
+          println("--- query #17 initial executedPlan ---")
+          println(df.queryExecution.executedPlan)
+          val rows = df.collect()
+          println("--- query #17 final (post-AQE) executedPlan ---")
+          println(df.queryExecution.executedPlan)
+          println(s"--- ${rows.length} rows ---")
+          // scalastyle:on println
+          checkSparkAnswer(sql)
+        }
+      }
+    }
+  }
+
 }
 
 case class BucketedTableTestSpec(

From 1b55b97132f627b4e5257f44ca144f8d70c40494 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 14:21:14 -0400
Subject: [PATCH 06/13] refactor to handle on JVM side.

---
 native/core/src/execution/planner.rs          |  63 ++--------
 native/proto/src/proto/operator.proto         |  14 +--
 .../apache/spark/sql/comet/operators.scala    | 113 ++++++++++++------
 3 files changed, 99 insertions(+), 91 deletions(-)

diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index bf90eb20cc..8abc19c1f5 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -1036,10 +1036,9 @@ impl PhysicalPlanner {
             OpStruct::HashAgg(agg) => {
                 // [#4515 instrumentation] Every HashAgg construction with proto sizes.
                 log::warn!(
-                    "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} result_exprs.len={} mode={}",
+                    "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} mode={}",
                     agg.grouping_exprs.len(),
                     agg.agg_exprs.len(),
-                    agg.result_exprs.len(),
                     agg.mode
                 );
                 assert_eq!(children.len(), 1);
@@ -1171,55 +1170,19 @@ impl PhysicalPlanner {
                         Arc::clone(&schema),
                     )?,
                 );
-                let result_exprs: PhyExprResult = agg
-                    .result_exprs
-                    .iter()
-                    .enumerate()
-                    .map(|(idx, expr)| {
-                        self.create_expr(expr, aggregate.schema())
-                            .map(|r| (r, format!("col_{idx}")))
-                    })
-                    .collect();
 
-                if !agg.apply_result_projection {
-                    // [#4515 instrumentation] Confirm whether a native HashAggregate is being
-                    // built with empty result_exprs (catalyst-pruned EXISTS / count(*) subquery)
-                    // and what its natural schema would be.
-                    log::warn!(
-                        "[#4515] HashAgg apply_result_projection=false: emitting aggregate natural schema={:?} grouping_exprs.len={} agg_exprs.len={} result_exprs.len={}",
-                        aggregate.schema(),
-                        agg.grouping_exprs.len(),
-                        agg.agg_exprs.len(),
-                        agg.result_exprs.len()
-                    );
-                    Ok((
-                        scans,
-                        shuffle_scans,
-                        Arc::new(SparkPlan::new(spark_plan.plan_id, aggregate, vec![child])),
-                    ))
-                } else {
-                    // For final aggregation, DF's hash aggregate exec doesn't support Spark's
-                    // aggregate result expressions like `COUNT(col) + 1`, but instead relying
-                    // on additional `ProjectionExec` to handle the case. Therefore, here we'll
-                    // add a projection node on top of the aggregate node.
-                    //
-                    // Note that `result_exprs` should only be set for final aggregation on the
-                    // Spark side.
-                    let projection = Arc::new(ProjectionExec::try_new(
-                        result_exprs?,
-                        Arc::clone(&aggregate),
-                    )?);
-                    Ok((
-                        scans,
-                        shuffle_scans,
-                        Arc::new(SparkPlan::new_with_additional(
-                            spark_plan.plan_id,
-                            projection,
-                            vec![child],
-                            vec![aggregate],
-                        )),
-                    ))
-                }
+                // The native HashAggregate emits its natural shape (group keys + agg
+                // results / state). Any post-aggregate projection Spark catalyst declares
+                // (`COUNT(col) + 1`, EXISTS-pruned-to-empty output, alias renames, etc.) is
+                // expressed as an explicit `OpStruct::Projection` op above the aggregate
+                // by the JVM serializer (see `CometBaseAggregate.doConvert`). Keeping that
+                // logic on the JVM side means only one place decides plan shape, and the
+                // native side stays a faithful executor. See comet#4515.
+                Ok((
+                    scans,
+                    shuffle_scans,
+                    Arc::new(SparkPlan::new(spark_plan.plan_id, aggregate, vec![child])),
+                ))
             }
             OpStruct::Limit(limit) => {
                 assert_eq!(children.len(), 1);
diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto
index 921e5bbf35..aaff3cc4de 100644
--- a/native/proto/src/proto/operator.proto
+++ b/native/proto/src/proto/operator.proto
@@ -294,7 +294,13 @@ message Sort {
 message HashAggregate {
   repeated spark.spark_expression.Expr grouping_exprs = 1;
   repeated spark.spark_expression.AggExpr agg_exprs = 2;
-  repeated spark.spark_expression.Expr result_exprs = 3;
+  // Field 3 (`result_exprs`) and field 8 (`apply_result_projection`) were used to apply a
+  // post-aggregate projection inside the HashAggregate operator. The same effect is now
+  // expressed by emitting an explicit `Projection` op above the `HashAggregate` from the
+  // JVM serializer when needed (see `CometBaseAggregate.doConvert`). Reserved to avoid
+  // accidental reuse at incompatible semantics.
+  reserved 3, 8;
+  reserved "result_exprs", "apply_result_projection";
   AggregateMode mode = 5;
   // Per-expression modes for mixed-mode aggregates (e.g., PartialMerge + Partial).
   // When set, each entry corresponds to agg_exprs at the same index.
@@ -303,12 +309,6 @@ message HashAggregate {
   // Offset in the child's output where aggregate buffer attributes start.
   // Used by PartialMerge to locate state fields in the input.
   int32 initial_input_buffer_offset = 7;
-  // Whether the native planner should wrap the aggregate in a ProjectionExec using
-  // `result_exprs`. Disambiguates `result_exprs` empty-because-absent (no projection
-  // needed) from empty-because-catalyst-pruned-to-zero-cols (project to nothing).
-  // Without this bit, both wire as `repeated` length 0 and were collapsed to "no
-  // projection," leaking grouping keys when Spark intent was 0 columns.
-  bool apply_result_projection = 8;
 }
 
 message Limit {
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index 310767489d..3240adcc01 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -1553,25 +1553,48 @@ trait CometBaseAggregate {
                 .map(_.toAttribute)}")
       val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder()
       hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava)
-      val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes
-      val resultExprs = resultExpressions.map(exprToProto(_, attributes))
-      if (resultExprs.exists(_.isEmpty)) {
-        withInfo(
-          aggregate,
-          s"Unsupported result expressions found in: $resultExpressions",
-          resultExpressions: _*)
-        return None
+      // The native HashAggregate emits its natural shape (the grouping keys, since there
+      // are no aggregate functions). When Spark catalyst declares a different output -
+      // either column-renaming via aliases, or an entirely empty output for catalyst-pruned
+      // EXISTS / row-existence-only subqueries - we wrap the HashAggregate in an explicit
+      // Projection op so the native side reshapes accordingly. See comet#4515: an empty
+      // declared output paired with the natural grouping-key output crashed downstream
+      // boundaries that derived their schema from the declared output.
+      val naturalOutput = groupingExpressions.map(_.toAttribute)
+      val needsProjection = resultExpressions.map(_.toAttribute) != naturalOutput
+      if (needsProjection) {
+        val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes
+        val resultExprs = resultExpressions.map(exprToProto(_, attributes))
+        if (resultExprs.exists(_.isEmpty)) {
+          withInfo(
+            aggregate,
+            s"Unsupported result expressions found in: $resultExpressions",
+            resultExpressions: _*)
+          return None
+        }
+        // Build the inner HashAgg op carrying the original child operators from `builder`.
+        // Use a fresh outer builder for the Projection so it gets a single child (the
+        // HashAgg op), not the original children appended on top. Both ops share the same
+        // plan_id so the inner aggregate's native metrics roll up under the same Spark
+        // operator in the metric tree (otherwise they'd orphan against plan_id=0).
+        val hashAggOp = OperatorOuterClass.Operator
+          .newBuilder()
+          .setPlanId(builder.getPlanId)
+          .addAllChildren(builder.getChildrenList)
+          .setHashAgg(hashAggBuilder)
+          .build()
+        val projectionBuilder = OperatorOuterClass.Projection.newBuilder()
+        projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava)
+        Some(
+          OperatorOuterClass.Operator
+            .newBuilder()
+            .setPlanId(builder.getPlanId)
+            .addChildren(hashAggOp)
+            .setProjection(projectionBuilder)
+            .build())
+      } else {
+        Some(builder.setHashAgg(hashAggBuilder).build())
       }
-      hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava)
-      // Force native to apply the projection when Spark's expected output (which may
-      // be empty for catalyst-pruned EXISTS / row-existence-only subqueries) differs
-      // from the aggregate's natural grouping output. Without this, an empty proto
-      // result_exprs is indistinguishable from "no projection needed" and the natural
-      // grouping keys leak through.
-      val naturalEqualsIntent =
-        resultExpressions.map(_.toAttribute) == groupingExpressions.map(_.toAttribute)
-      hashAggBuilder.setApplyResultProjection(!naturalEqualsIntent)
-      Some(builder.setHashAgg(hashAggBuilder).build())
     } else {
       // Validate mode combinations. We support:
       // - All Partial
@@ -1640,21 +1663,6 @@ trait CometBaseAggregate {
         val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder()
         hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava)
         hashAggBuilder.addAllAggExprs(aggExprs.map(_.get).asJava)
-        if (mode == CometAggregateMode.Final) {
-          val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes
-          val resultExprs = resultExpressions.map(exprToProto(_, attributes))
-          if (resultExprs.exists(_.isEmpty)) {
-            withInfo(
-              aggregate,
-              s"Unsupported result expressions found in: $resultExpressions",
-              resultExpressions: _*)
-            return None
-          }
-          hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava)
-          // Final aggs always need the projection (matches the existing condition under
-          // which we serialize result_exprs at all). See comet#4515.
-          hashAggBuilder.setApplyResultProjection(true)
-        }
         hashAggBuilder.setModeValue(mode.getNumber)
 
         // Send per-expression modes and buffer offset for PartialMerge handling
@@ -1673,7 +1681,44 @@ trait CometBaseAggregate {
           hashAggBuilder.setInitialInputBufferOffset(aggregate.initialInputBufferOffset)
         }
 
-        Some(builder.setHashAgg(hashAggBuilder).build())
+        // Final aggregations may carry a result projection (e.g. `COUNT(col) + 1`) that
+        // catalyst encodes via `resultExpressions`. DataFusion's hash aggregate only emits
+        // its natural shape (group keys + agg results), so we wrap the HashAggregate in
+        // an explicit Projection op to apply Spark's result expressions. Partial /
+        // PartialMerge aggregates emit raw state buffers and never need the projection.
+        // See comet#4515.
+        if (mode == CometAggregateMode.Final) {
+          val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes
+          val resultExprs = resultExpressions.map(exprToProto(_, attributes))
+          if (resultExprs.exists(_.isEmpty)) {
+            withInfo(
+              aggregate,
+              s"Unsupported result expressions found in: $resultExpressions",
+              resultExpressions: _*)
+            return None
+          }
+          // Inner HashAgg keeps the original input children from `builder`. Outer
+          // Projection uses a fresh builder so it has a single child (the HashAgg op).
+          // Both ops share the same plan_id so the aggregate's native metrics aggregate
+          // under the same Spark operator (else they'd orphan against plan_id=0).
+          val hashAggOp = OperatorOuterClass.Operator
+            .newBuilder()
+            .setPlanId(builder.getPlanId)
+            .addAllChildren(builder.getChildrenList)
+            .setHashAgg(hashAggBuilder)
+            .build()
+          val projectionBuilder = OperatorOuterClass.Projection.newBuilder()
+          projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava)
+          Some(
+            OperatorOuterClass.Operator
+              .newBuilder()
+              .setPlanId(builder.getPlanId)
+              .addChildren(hashAggOp)
+              .setProjection(projectionBuilder)
+              .build())
+        } else {
+          Some(builder.setHashAgg(hashAggBuilder).build())
+        }
       } else {
         val allChildren: Seq[Expression] =
           groupingExpressions ++ aggregateExpressions ++ aggregateAttributes

From d736dd5b7194c4363fb8a62d17e37b56efb2d8a3 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 14:36:21 -0400
Subject: [PATCH 07/13] remove instrumentation.

---
 native/core/src/execution/planner.rs          |  17 ---
 .../org/apache/comet/CometBatchIterator.java  |  16 ---
 .../org/apache/comet/CometExecIterator.scala  |  21 ----
 .../apache/comet/rules/CometExecRule.scala    |  24 ----
 .../apache/comet/rules/CometScanRule.scala    |  11 --
 .../comet/serde/operator/CometSink.scala      |  74 ------------
 .../org/apache/comet/vector/NativeUtil.scala  |  22 ----
 .../shuffle/CometShuffleExchangeExec.scala    |  33 ------
 .../shuffle/CometShuffledRowRDD.scala         |  29 +----
 .../apache/spark/sql/comet/operators.scala    |  37 +-----
 .../apache/comet/exec/CometExecSuite.scala    | 111 ------------------
 11 files changed, 2 insertions(+), 393 deletions(-)

diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index 8abc19c1f5..18c3acf2dd 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -1034,13 +1034,6 @@ impl PhysicalPlanner {
                 ))
             }
             OpStruct::HashAgg(agg) => {
-                // [#4515 instrumentation] Every HashAgg construction with proto sizes.
-                log::warn!(
-                    "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} mode={}",
-                    agg.grouping_exprs.len(),
-                    agg.agg_exprs.len(),
-                    agg.mode
-                );
                 assert_eq!(children.len(), 1);
                 let (scans, shuffle_scans, child) =
                     self.create_plan(&children[0], inputs, partition_count)?;
@@ -1424,16 +1417,6 @@ impl PhysicalPlanner {
             OpStruct::Scan(scan) => {
                 let data_types = scan.fields.iter().map(to_arrow_datatype).collect_vec();
 
-                // [#4515 instrumentation] Log every JVM-bridge Scan's declared column count.
-                // A 0-column scan paired with a JVM iterator producing batches with columns is
-                // the AIOOBE-on-exportBatch shape; a non-empty list confirms the inverse.
-                log::info!(
-                    "[#4515] ScanExec source='{}' declared {} cols: {:?}",
-                    scan.source,
-                    data_types.len(),
-                    data_types
-                );
-
                 // If it is not test execution context for unit test, we should have at least one
                 // input source
                 if self.exec_context_id != TEST_EXEC_CONTEXT_ID && inputs.is_empty() {
diff --git a/spark/src/main/java/org/apache/comet/CometBatchIterator.java b/spark/src/main/java/org/apache/comet/CometBatchIterator.java
index 1392f028c0..9b48a47c57 100644
--- a/spark/src/main/java/org/apache/comet/CometBatchIterator.java
+++ b/spark/src/main/java/org/apache/comet/CometBatchIterator.java
@@ -38,8 +38,6 @@ public class CometBatchIterator {
   private final NativeUtil nativeUtil;
   private ColumnarBatch previousBatch = null;
   private ColumnarBatch currentBatch = null;
-  // [#4515 instrumentation] gate first-batch log per instance
-  private boolean loggedFirstBatch = false;
 
   CometBatchIterator(Iterator<ColumnarBatch> input, NativeUtil nativeUtil) {
     this.input = input;
@@ -81,20 +79,6 @@ public int next(long[] arrayAddrs, long[] schemaAddrs) {
       return -1;
     }
 
-    // [#4515 instrumentation] Log first-batch shape per CometBatchIterator instance.
-    if (!loggedFirstBatch) {
-      loggedFirstBatch = true;
-      org.slf4j.LoggerFactory.getLogger("[#4515]")
-          .warn(
-              "CometBatchIterator.next first batch: numCols={} numRows={} arrayAddrs.length={} schemaAddrs.length={} inputCls={} this={}",
-              currentBatch.numCols(),
-              currentBatch.numRows(),
-              arrayAddrs.length,
-              schemaAddrs.length,
-              input.getClass().getName(),
-              System.identityHashCode(this));
-    }
-
     // export the batch using the Arrow C Data Interface
     int numRows = nativeUtil.exportBatch(arrayAddrs, schemaAddrs, currentBatch);
 
diff --git a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala
index 98a9bc0684..6140eca553 100644
--- a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala
+++ b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala
@@ -92,27 +92,6 @@ class CometExecIterator(
     val conf = SparkEnv.get.conf
     val localDiskDirs = SparkEnv.get.blockManager.getLocalDiskDirs
 
-    // [#4515 instrumentation] Dump proto plan + per-input iterator class so we can correlate
-    // a 0-col Scan in the proto with the JVM iterator that will feed it.
-    {
-      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-      val parsed =
-        scala.util
-          .Try(
-            org.apache.comet.serde.OperatorOuterClass.Operator
-              .parseFrom(protobufQueryPlan)
-              .toString)
-          .toOption
-          .getOrElse("<failed to parse>")
-      val itersDesc = inputIterators.zipWithIndex
-        .map { case (it, idx) => s"  inputIterators[$idx] cls=${it.getClass.getName}" }
-        .mkString("\n")
-      log.warn(
-        s"CometExecIterator constructing plan id=$id partition=$partitionIndex " +
-          s"numParts=$numParts numOutputCols=$numOutputCols\n$itersDesc\n" +
-          s"  proto plan:\n$parsed")
-    }
-
     // serialize Comet related Spark configs in protobuf format
     val protobufSparkConfigs = CometExecIterator.serializeCometSQLConfs()
 
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
index 6eb314bbb6..aeb7db40ad 100644
--- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala
@@ -166,16 +166,6 @@ case class CometExecRule(session: SparkSession)
             val reverted =
               s.originalPlan.withNewChildren(Seq(s.child)).asInstanceOf[ShuffleExchangeExec]
             reverted.setTagValue(CometExecRule.SKIP_COMET_SHUFFLE_TAG, ())
-            // [#4515 instrumentation] Log every revert: this is a primary place where
-            // vanilla ShuffleExchangeExec instances enter the plan post-Comet-rewrite.
-            org.slf4j.LoggerFactory
-              .getLogger("[#4515]")
-              .warn(
-                s"revertRedundantColumnarShuffle revert: parentAgg=${op.getClass.getSimpleName} " +
-                  s"(output=${op.output}) cometShuffle.output=${s.output} " +
-                  s"reverted.output=${reverted.output} reverted.identityHash=${System
-                      .identityHashCode(reverted)}\n" +
-                  s"  reverted tree:\n${reverted.treeString(verbose = true, addSuffix = false)}")
             logInfo(
               "Reverting Comet columnar shuffle to Spark shuffle between " +
                 s"${op.getClass.getSimpleName} and ${s.child.getClass.getSimpleName} " +
@@ -556,20 +546,6 @@ case class CometExecRule(session: SparkSession)
            |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")}
            |""".stripMargin)
     }
-    // [#4515 instrumentation] Dump the plan in/out of CometExecRule to correlate with the
-    // 0-col Scan synthesized later. Logs the operator-class diff so we can see what
-    // CometExecRule did or didn't replace, especially around ShuffleExchangeExec wrappers
-    // inside subqueries.
-    if (!newPlan.fastEquals(plan)) {
-      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-      log.warn(
-        "CometExecRule rewrote plan:\n  IN classes: " +
-          plan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
-          "\n  OUT classes: " +
-          newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
-          s"\n  IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" +
-          s"\n  OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}")
-    }
     newPlan
   }
 
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
index 1f7f509db0..7601fa1c6b 100644
--- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -68,17 +68,6 @@ case class CometScanRule(session: SparkSession)
            |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")}
            |""".stripMargin)
     }
-    // [#4515 instrumentation]
-    if (!newPlan.fastEquals(plan)) {
-      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-      log.warn(
-        "CometScanRule rewrote plan:\n  IN classes: " +
-          plan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
-          "\n  OUT classes: " +
-          newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") +
-          s"\n  IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" +
-          s"\n  OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}")
-    }
     newPlan
   }
 
diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala
index 8a1c576a2d..845803d133 100644
--- a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala
@@ -49,12 +49,6 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] {
       op: T,
       builder: Operator.Builder,
       childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = {
-    // [#4515 instrumentation]
-    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-    log.warn(
-      s"CometSink[${this.getClass.getSimpleName}].convert op=${op.getClass.getName} " +
-        s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " +
-        s"output.size=${op.output.size}")
     val supportedTypes =
       op.output.forall(a => supportedDataType(a.dataType, allowComplex = true))
 
@@ -78,53 +72,6 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] {
     }
 
     if (scanTypes.length == op.output.length) {
-      // [#4515 instrumentation] Log when we synthesize a Scan with zero declared columns.
-      // The runtime JVM iterator may still produce columns (subquery output shrunk by
-      // catalyst before serialization while the underlying RDD reflects the pre-shrink shape),
-      // tripping the column-count guard in NativeUtil.exportBatch.
-      if (scanTypes.isEmpty) {
-        val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-        // scalastyle:off line.size.limit
-        val childInfo = op.children.zipWithIndex
-          .map { case (c, i) =>
-            val canonOut = scala.util
-              .Try(c.canonicalized.output)
-              .toOption
-              .map(_.toString)
-              .getOrElse("<canonicalize failed>")
-            s"  child[$i] cls=${c.getClass.getName} simpleString='${c.simpleString(
-                80)}' output=${c.output} outputSize=${c.output.size} identityHash=${System
-                .identityHashCode(c)} canonicalized.output=$canonOut"
-          }
-          .mkString("\n")
-        val opCanonOut = scala.util
-          .Try(op.canonicalized.output)
-          .toOption
-          .map(_.toString)
-          .getOrElse("<canonicalize failed>")
-        val subqueryInfo = scala.util
-          .Try(op.subqueries.map(s => s"${s.getClass.getName}(output=${s.output}, prepared=?)"))
-          .toOption
-          .getOrElse(Nil)
-          .mkString("[", ", ", "]")
-        val callerStack =
-          new RuntimeException("[#4515] CometSink 0-col Scan caller").getStackTrace
-            .take(20)
-            .map(f => s"    at ${f}")
-            .mkString("\n")
-        log.warn(s"CometSink synthesizing 0-col Scan for op=${op.getClass.getName}\n" +
-          s"  simpleString='${op.simpleStringWithNodeId()}'\n" +
-          s"  op.output=${op.output} op.outputSet=${op.outputSet} op.references=${op.references}\n" +
-          s"  op.canonicalized.output=$opCanonOut\n" +
-          s"  op.subqueries=$subqueryInfo\n" +
-          s"  op identityHash=${System.identityHashCode(op)}\n" +
-          s"  children classes=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}\n" +
-          childInfo + "\n" +
-          s"  caller stack:\n$callerStack\n" +
-          s"  op tree:\n${op.treeString(verbose = true, addSuffix = false)}")
-        // scalastyle:on line.size.limit
-      }
-
       scanBuilder.addAllFields(scanTypes.asJava)
 
       // Sink operators don't have children
@@ -147,27 +94,6 @@ object CometExchangeSink extends CometSink[SparkPlan] {
       op: SparkPlan,
       builder: Operator.Builder,
       childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = {
-    // [#4515 instrumentation]
-    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-    val isVanillaSparkExchange =
-      op.getClass.getName == "org.apache.spark.sql.execution.exchange.ShuffleExchangeExec"
-    log.warn(
-      s"CometExchangeSink.convert op=${op.getClass.getName} " +
-        s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " +
-        s"useShuffleScan=${shouldUseShuffleScan(op)} " +
-        s"children=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}")
-    if (isVanillaSparkExchange) {
-      val callerStack =
-        new RuntimeException("[#4515] vanilla ShuffleExchangeExec caller").getStackTrace
-          .take(20)
-          .map(f => s"    at ${f}")
-          .mkString("\n")
-      log.warn(
-        "  vanilla ShuffleExchangeExec being processed by CometExchangeSink:\n" +
-          s"  output=${op.output}\n" +
-          s"  caller stack:\n$callerStack\n" +
-          s"  op tree:\n${op.treeString(verbose = true, addSuffix = false)}")
-    }
     if (shouldUseShuffleScan(op)) {
       convertToShuffleScan(op, builder)
     } else {
diff --git a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala
index 388f5114e6..4f027cd9e7 100644
--- a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala
+++ b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala
@@ -114,28 +114,6 @@ class NativeUtil {
       batch: ColumnarBatch): Int = {
     val numRows = mutable.ArrayBuffer.empty[Int]
 
-    if (arrayAddrs.length != batch.numCols() || schemaAddrs.length != batch.numCols()) {
-      val schemaSummary = (0 until batch.numCols())
-        .map { i =>
-          val v = batch.column(i) match {
-            case cv: CometVector => cv.getValueVector
-            case _ => null
-          }
-          if (v != null) s"col[$i]: ${v.getField}"
-          else s"col[$i]: ${batch.column(i).getClass.getName}"
-        }
-        .mkString("; ")
-      val taskAttempt = Option(org.apache.spark.TaskContext.get())
-        .map(c => s"stage=${c.stageId} task=${c.taskAttemptId} partition=${c.partitionId}")
-        .getOrElse("no-task")
-      throw new SparkException(
-        "CometBatchIterator column-count mismatch [#4515 instrumentation]: " +
-          s"native expected arrayAddrs=${arrayAddrs.length}, schemaAddrs=${schemaAddrs.length}; " +
-          s"JVM iterator produced batch.numCols=${batch.numCols()} ($taskAttempt). " +
-          s"Batch schema: $schemaSummary",
-        new RuntimeException("placeholder for exportBatch column-count mismatch"))
-    }
-
     (0 until batch.numCols()).foreach { index =>
       batch.column(index) match {
         case a: CometVector =>
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
index ba152c2f13..1470d637d9 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
@@ -243,23 +243,6 @@ case class CometShuffleExchangeExec(
    * Comet returns RDD[ColumnarBatch] for columnar execution.
    */
   protected override def doExecuteColumnar(): RDD[ColumnarBatch] = {
-    // [#4515 instrumentation] Track every doExecuteColumnar call: which CometShuffleExchange
-    // instance, its output, and the call site. Helps confirm whether this PR's changes
-    // cause an extra EnsureRequirements-inserted vanilla Exchange to wrap us, and what
-    // RDD is being plumbed where.
-    {
-      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-      val callerStack = new RuntimeException(
-        "[#4515] CometShuffleExchangeExec.doExecuteColumnar caller").getStackTrace
-        .take(15)
-        .map(f => s"    at ${f}")
-        .mkString("\n")
-      log.warn(
-        s"CometShuffleExchangeExec.doExecuteColumnar this=${System.identityHashCode(this)} " +
-          s"shuffleType=$shuffleType outputPartitioning=$outputPartitioning " +
-          s"output=$output\n" +
-          s"  caller stack:\n$callerStack")
-    }
     // Returns the same CometShuffledBatchRDD if this plan is used by multiple plans.
     if (cachedShuffleRDD == null) {
       cachedShuffleRDD = new CometShuffledBatchRDD(shuffleDependency, readMetrics)
@@ -704,22 +687,6 @@ object CometShuffleExchangeExec
       serializer: Serializer,
       metrics: Map[String, SQLMetric]): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = {
 
-    // [#4515 instrumentation] Track every placeholder-Scan ShuffleDependency we build, with
-    // outputAttributes (drives Scan declared schema) and call site.
-    {
-      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-      val callerStack =
-        new RuntimeException("[#4515] prepareShuffleDependency caller").getStackTrace
-          .take(15)
-          .map(f => s"    at ${f}")
-          .mkString("\n")
-      log.warn(
-        s"prepareShuffleDependency outputAttributes=$outputAttributes " +
-          s"outputAttributes.size=${outputAttributes.size} " +
-          s"outputPartitioning=$outputPartitioning rdd.numPartitions=${rdd.getNumPartitions}\n" +
-          s"  caller stack:\n$callerStack")
-    }
-
     val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput")
     val scanTypes = outputAttributes.flatten { attr =>
       QueryPlanSerde.serializeDataType(attr.dataType)
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala
index e0e5c45b2a..7604910b06 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala
@@ -162,34 +162,7 @@ class CometShuffledBatchRDD(
   override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
     val reader = createReader(split, context)
     // TODO: Reads IPC by native code
-    val raw = reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2)
-    // [#4515 instrumentation] Peek the first decoded batch to confirm wire schema vs caller
-    // expectations. Wraps so we don't consume the iterator.
-    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-    new Iterator[ColumnarBatch] {
-      private var logged = false
-      override def hasNext: Boolean = raw.hasNext
-      override def next(): ColumnarBatch = {
-        val b = raw.next()
-        if (!logged) {
-          logged = true
-          val schemaSummary = (0 until b.numCols())
-            .map { i =>
-              val v = b.column(i) match {
-                case cv: org.apache.comet.vector.CometVector => cv.getValueVector
-                case _ => null
-              }
-              if (v != null) s"col[$i]: ${v.getField}"
-              else s"col[$i]: ${b.column(i).getClass.getName}"
-            }
-            .mkString("; ")
-          log.warn(s"CometShuffledBatchRDD.compute first decoded batch: numCols=${b.numCols()} " +
-            s"numRows=${b.numRows()} stage=${context.stageId()} task=${context.taskAttemptId()} " +
-            s"partition=${split.index} schema=[$schemaSummary]")
-        }
-        b
-      }
-    }
+    reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2)
   }
 
   override def clearDependencies(): Unit = {
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index 3240adcc01..7ec8a2e898 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -422,15 +422,7 @@ abstract class CometNativeExec extends CometExec {
   /** The Comet native operator */
   def nativeOp: Operator
 
-  override protected def doPrepare(): Unit = {
-    // [#4515 instrumentation] Track when subqueries are prepared for this CometNativeExec.
-    val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-    log.warn(
-      s"CometNativeExec.doPrepare this=${System.identityHashCode(this)} " +
-        s"cls=${this.getClass.getName} " +
-        s"originalPlan.cls=${originalPlan.getClass.getName}")
-    prepareSubqueries(this)
-  }
+  override protected def doPrepare(): Unit = prepareSubqueries(this)
 
   override lazy val metrics: Map[String, SQLMetric] =
     CometMetricNode.baselineMetrics(sparkContext)
@@ -571,14 +563,6 @@ abstract class CometNativeExec extends CometExec {
     // same partition number. But for Comet, we need to zip them so we need to adjust the
     // partition number of Broadcast RDDs to make sure they have the same partition number.
     sparkPlans.zipWithIndex.foreach { case (plan, idx) =>
-      // [#4515 instrumentation] Log every JVM-side input plan we wire to native, so we can
-      // correlate the Scan's declared schema with the runtime plan whose RDD feeds it.
-      val log = org.slf4j.LoggerFactory.getLogger("[#4515]")
-      log.warn(
-        s"buildNativeContext binding input[$idx] cls=${plan.getClass.getName} " +
-          s"simpleString='${plan.simpleStringWithNodeId()}' output=${plan.output} " +
-          s"output.size=${plan.output.size} identityHash=${System.identityHashCode(plan)}\n" +
-          s"  subtree:\n${plan.treeString(verbose = true, addSuffix = false)}")
       plan match {
         case c: CometBroadcastExchangeExec =>
           inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions)
@@ -1532,25 +1516,6 @@ trait CometBaseAggregate {
     // If the aggregateExpressions is empty, we only want to build groupingExpressions,
     // and skip processing of aggregateExpressions.
     if (aggregateExpressions.isEmpty) {
-      // [#4515 instrumentation] Track HashAgg serializations with empty resultExpressions.
-      // Catalyst prunes resultExpressions for EXISTS / row-existence-only subqueries; the
-      // native side currently interprets empty result_exprs as "use aggregate natural
-      // schema", which leaks grouping keys into the output.
-      org.slf4j.LoggerFactory
-        .getLogger("[#4515]")
-        .warn(
-          "HashAgg empty-aggExprs branch: " +
-            s"groupingExprs=${groupingExpressions} " +
-            s"resultExpressions=${resultExpressions} " +
-            s"resultExpressions.size=${resultExpressions.size} " +
-            s"aggregate.output=${aggregate.output} " +
-            s"aggregate.output.size=${aggregate.output.size} " +
-            s"modes(from aggExprs)=${modes} " +
-            s"sparkFinalMode=$sparkFinalMode " +
-            s"requiredChildDistribution=${aggregate.requiredChildDistribution} " +
-            s"isProjectionToEmpty=${resultExpressions.isEmpty && aggregate.output.isEmpty} " +
-            s"naturalEqualsIntent=${resultExpressions.map(_.toAttribute) == groupingExpressions
-                .map(_.toAttribute)}")
       val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder()
       hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava)
       // The native HashAggregate emits its natural shape (the grouping keys, since there
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala
index fded42d050..16601d056b 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala
@@ -3959,117 +3959,6 @@ class CometExecSuite extends CometTestBase {
     }
   }
 
-  // Repro for the Spark 3.5 SQL-tests failure on subquery/exists-subquery/exists-orderby-limit.sql
-  // (query #19) on branch opt_native_shuffle. Crashes with NativeUtil.exportBatch:132 AIOOBE
-  // on this branch but passes on main. The SQL file declares three CONFIG_DIM1 combos for
-  // codegen; the failure trace doesn't say which one fires it, so sweep all three.
-  test("EXISTS subquery with GROUP BY + LIMIT + OFFSET") {
-    withTempView("emp", "dept") {
-      spark
-        .sql("""SELECT * FROM VALUES
-               |  (100, 'emp 1', date '2005-01-01', 100.0D, 10),
-               |  (200, 'emp 2', date '2003-01-01', 200.0D, 10),
-               |  (300, 'emp 3', date '2002-01-01', 300.0D, 20),
-               |  (400, 'emp 4', date '2005-01-01', 400.0D, 30),
-               |  (500, 'emp 5', date '2001-01-01', 400.0D, NULL),
-               |  (700, 'emp 7', date '2010-01-01', 400.0D, 100),
-               |  (800, 'emp 8', date '2016-01-01', 150.0D, 70)
-               |AS t(id, emp_name, hiredate, salary, dept_id)""".stripMargin)
-        .createOrReplaceTempView("emp")
-      spark
-        .sql("""SELECT * FROM VALUES
-               |  (10, 'dept 1', 'CA'),
-               |  (20, 'dept 2', 'NY'),
-               |  (30, 'dept 3', 'TX'),
-               |  (40, 'dept 4 - unassigned', 'OR'),
-               |  (50, 'dept 5 - unassigned', 'NJ'),
-               |  (70, 'dept 7', 'FL')
-               |AS t(dept_id, dept_name, state)""".stripMargin)
-        .createOrReplaceTempView("dept")
-
-      val configDims = Seq(
-        Map("spark.sql.codegen.wholeStage" -> "true"),
-        Map(
-          "spark.sql.codegen.wholeStage" -> "false",
-          "spark.sql.codegen.factoryMode" -> "CODEGEN_ONLY"),
-        Map(
-          "spark.sql.codegen.wholeStage" -> "false",
-          "spark.sql.codegen.factoryMode" -> "NO_CODEGEN"))
-
-      // Mirror the SQL test order: queries #1 through #16 from the file run before #17 (the
-      // one whose CI failure we're chasing). Query #11's subquery is identical except for
-      // the OFFSET; #13 and #15 share similar shapes. Subquery materialization / AQE plan
-      // cache state from running them first may alter query #17's executed shape.
-      val priorQueries = Seq(
-        // #1
-        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""",
-        // #2
-        """SELECT id, hiredate FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate DESC""",
-        // #3
-        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""",
-        // #4
-        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""",
-        // #5
-        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""",
-        // #6
-        """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state)""",
-        // #7
-        """SELECT count(*) FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept_id ORDER BY dept_id)""",
-        // #8
-        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""",
-        // #9
-        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""",
-        // #10
-        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1)""",
-        // #11 - same subquery as #17 minus the OFFSET
-        """SELECT * FROM emp WHERE EXISTS (SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1)""",
-        // #12
-        """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 100 LIMIT 1)""",
-        // #13
-        """SELECT * FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) FROM dept WHERE dept.dept_id > 100 GROUP BY state LIMIT 1)""",
-        // #14
-        """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state LIMIT 2 OFFSET 1)""",
-        // #15
-        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1 OFFSET 2)""",
-        // #16
-        """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > emp.dept_id LIMIT 1)""")
-
-      for (dim <- configDims) {
-        // SQL-tests harness (dev/diffs/3.5.8.diff, 4.1.1.diff) sets only:
-        //   spark.comet.enabled, spark.comet.exec.enabled, spark.comet.exec.shuffle.enabled,
-        //   spark.comet.parquet.respectFilterPushdown, spark.shuffle.manager,
-        //   spark.comet.memoryOverhead.
-        // It does NOT enable spark.comet.sparkToColumnar.enabled, but CometTestBase does.
-        // Force the harness shape so the reproducer matches the failing config.
-        val harnessConf = dim ++ Map(CometConf.COMET_SPARK_TO_ARROW_ENABLED.key -> "false")
-        withSQLConf(harnessConf.toSeq: _*) {
-          // scalastyle:off println
-          println(s"\n===== config dim: $harnessConf =====")
-          for ((q, idx) <- priorQueries.zipWithIndex) {
-            spark.sql(q).collect()
-            println(s"--- ran prior query #${idx + 1} ---")
-          }
-          val sql = """SELECT *
-                      |FROM emp
-                      |WHERE EXISTS (SELECT max(dept.dept_id)
-                      |              FROM dept
-                      |              GROUP BY state
-                      |              LIMIT 1
-                      |              OFFSET 2)""".stripMargin
-          val df = spark.sql(sql)
-          println("--- query #17 initial executedPlan ---")
-          println(df.queryExecution.executedPlan)
-          val rows = df.collect()
-          println("--- query #17 final (post-AQE) executedPlan ---")
-          println(df.queryExecution.executedPlan)
-          println(s"--- ${rows.length} rows ---")
-          // scalastyle:on println
-          checkSparkAnswer(sql)
-        }
-      }
-    }
-  }
-
 }
 
 case class BucketedTableTestSpec(

From 214a75be2f7def766bc0c98158651c7ff01b5db6 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 15:04:28 -0400
Subject: [PATCH 08/13] cleanup.

---
 native/core/src/execution/planner.rs          |  10 +-
 native/proto/src/proto/operator.proto         |   7 +-
 .../apache/spark/sql/comet/operators.scala    | 134 ++++++++----------
 .../comet/exec/CometAggregateSuite.scala      | 118 +++++----------
 4 files changed, 97 insertions(+), 172 deletions(-)

diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index 18c3acf2dd..328ca65760 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -1164,13 +1164,9 @@ impl PhysicalPlanner {
                     )?,
                 );
 
-                // The native HashAggregate emits its natural shape (group keys + agg
-                // results / state). Any post-aggregate projection Spark catalyst declares
-                // (`COUNT(col) + 1`, EXISTS-pruned-to-empty output, alias renames, etc.) is
-                // expressed as an explicit `OpStruct::Projection` op above the aggregate
-                // by the JVM serializer (see `CometBaseAggregate.doConvert`). Keeping that
-                // logic on the JVM side means only one place decides plan shape, and the
-                // native side stays a faithful executor. See comet#4515.
+                // HashAggregate emits its natural shape (group keys + agg results); any
+                // post-aggregate projection is serialized as an explicit `OpStruct::Projection`
+                // op above by the JVM serializer (see `CometBaseAggregate.doConvert`)
                 Ok((
                     scans,
                     shuffle_scans,
diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto
index aaff3cc4de..cc56331d74 100644
--- a/native/proto/src/proto/operator.proto
+++ b/native/proto/src/proto/operator.proto
@@ -294,11 +294,8 @@ message Sort {
 message HashAggregate {
   repeated spark.spark_expression.Expr grouping_exprs = 1;
   repeated spark.spark_expression.AggExpr agg_exprs = 2;
-  // Field 3 (`result_exprs`) and field 8 (`apply_result_projection`) were used to apply a
-  // post-aggregate projection inside the HashAggregate operator. The same effect is now
-  // expressed by emitting an explicit `Projection` op above the `HashAggregate` from the
-  // JVM serializer when needed (see `CometBaseAggregate.doConvert`). Reserved to avoid
-  // accidental reuse at incompatible semantics.
+  // Was result_exprs / apply_result_projection; now expressed as an explicit Projection
+  // op above HashAggregate (see CometBaseAggregate.doConvert, comet#4515).
   reserved 3, 8;
   reserved "result_exprs", "apply_result_projection";
   AggregateMode mode = 5;
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index 7ec8a2e898..cbd9dc4cbb 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -1518,48 +1518,12 @@ trait CometBaseAggregate {
     if (aggregateExpressions.isEmpty) {
       val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder()
       hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava)
-      // The native HashAggregate emits its natural shape (the grouping keys, since there
-      // are no aggregate functions). When Spark catalyst declares a different output -
-      // either column-renaming via aliases, or an entirely empty output for catalyst-pruned
-      // EXISTS / row-existence-only subqueries - we wrap the HashAggregate in an explicit
-      // Projection op so the native side reshapes accordingly. See comet#4515: an empty
-      // declared output paired with the natural grouping-key output crashed downstream
-      // boundaries that derived their schema from the declared output.
-      val naturalOutput = groupingExpressions.map(_.toAttribute)
-      val needsProjection = resultExpressions.map(_.toAttribute) != naturalOutput
-      if (needsProjection) {
-        val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes
-        val resultExprs = resultExpressions.map(exprToProto(_, attributes))
-        if (resultExprs.exists(_.isEmpty)) {
-          withInfo(
-            aggregate,
-            s"Unsupported result expressions found in: $resultExpressions",
-            resultExpressions: _*)
-          return None
-        }
-        // Build the inner HashAgg op carrying the original child operators from `builder`.
-        // Use a fresh outer builder for the Projection so it gets a single child (the
-        // HashAgg op), not the original children appended on top. Both ops share the same
-        // plan_id so the inner aggregate's native metrics roll up under the same Spark
-        // operator in the metric tree (otherwise they'd orphan against plan_id=0).
-        val hashAggOp = OperatorOuterClass.Operator
-          .newBuilder()
-          .setPlanId(builder.getPlanId)
-          .addAllChildren(builder.getChildrenList)
-          .setHashAgg(hashAggBuilder)
-          .build()
-        val projectionBuilder = OperatorOuterClass.Projection.newBuilder()
-        projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava)
-        Some(
-          OperatorOuterClass.Operator
-            .newBuilder()
-            .setPlanId(builder.getPlanId)
-            .addChildren(hashAggOp)
-            .setProjection(projectionBuilder)
-            .build())
-      } else {
-        Some(builder.setHashAgg(hashAggBuilder).build())
-      }
+      buildAggOp(
+        builder,
+        hashAggBuilder,
+        groupingExpressions.map(_.toAttribute),
+        resultExpressions,
+        aggregate)
     } else {
       // Validate mode combinations. We support:
       // - All Partial
@@ -1647,40 +1611,15 @@ trait CometBaseAggregate {
         }
 
         // Final aggregations may carry a result projection (e.g. `COUNT(col) + 1`) that
-        // catalyst encodes via `resultExpressions`. DataFusion's hash aggregate only emits
-        // its natural shape (group keys + agg results), so we wrap the HashAggregate in
-        // an explicit Projection op to apply Spark's result expressions. Partial /
-        // PartialMerge aggregates emit raw state buffers and never need the projection.
-        // See comet#4515.
+        // catalyst encodes via `resultExpressions`. Partial / PartialMerge aggregates emit
+        // raw state buffers and never need it. See comet#4515.
         if (mode == CometAggregateMode.Final) {
-          val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes
-          val resultExprs = resultExpressions.map(exprToProto(_, attributes))
-          if (resultExprs.exists(_.isEmpty)) {
-            withInfo(
-              aggregate,
-              s"Unsupported result expressions found in: $resultExpressions",
-              resultExpressions: _*)
-            return None
-          }
-          // Inner HashAgg keeps the original input children from `builder`. Outer
-          // Projection uses a fresh builder so it has a single child (the HashAgg op).
-          // Both ops share the same plan_id so the aggregate's native metrics aggregate
-          // under the same Spark operator (else they'd orphan against plan_id=0).
-          val hashAggOp = OperatorOuterClass.Operator
-            .newBuilder()
-            .setPlanId(builder.getPlanId)
-            .addAllChildren(builder.getChildrenList)
-            .setHashAgg(hashAggBuilder)
-            .build()
-          val projectionBuilder = OperatorOuterClass.Projection.newBuilder()
-          projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava)
-          Some(
-            OperatorOuterClass.Operator
-              .newBuilder()
-              .setPlanId(builder.getPlanId)
-              .addChildren(hashAggOp)
-              .setProjection(projectionBuilder)
-              .build())
+          buildAggOp(
+            builder,
+            hashAggBuilder,
+            groupingExpressions.map(_.toAttribute) ++ aggregateAttributes,
+            resultExpressions,
+            aggregate)
         } else {
           Some(builder.setHashAgg(hashAggBuilder).build())
         }
@@ -1694,6 +1633,51 @@ trait CometBaseAggregate {
 
   }
 
+  /**
+   * Serialize a HashAggregate, wrapping it in an explicit `Projection` op when Spark's declared
+   * output (`resultExpressions`) differs from the aggregate's natural output. DataFusion's hash
+   * aggregate emits only its natural shape (group keys + agg results), so any reshape catalyst
+   * declared - alias renames, `COUNT(col) + 1`, or empty output for catalyst-pruned EXISTS /
+   * row-existence-only subqueries - is expressed as a separate Projection above the HashAgg. Both
+   * ops share the caller's `plan_id` so the aggregate's native metrics roll up under the same
+   * Spark operator. See comet#4515.
+   */
+  private def buildAggOp(
+      builder: Operator.Builder,
+      hashAggBuilder: OperatorOuterClass.HashAggregate.Builder,
+      naturalOutput: Seq[Attribute],
+      resultExpressions: Seq[NamedExpression],
+      aggregate: BaseAggregateExec): Option[Operator] = {
+    if (resultExpressions.map(_.toAttribute) == naturalOutput) {
+      return Some(builder.setHashAgg(hashAggBuilder).build())
+    }
+    val resultExprs = resultExpressions.map(exprToProto(_, naturalOutput))
+    if (resultExprs.exists(_.isEmpty)) {
+      withInfo(
+        aggregate,
+        s"Unsupported result expressions found in: $resultExpressions",
+        resultExpressions: _*)
+      return None
+    }
+    val planId = builder.getPlanId
+    val hashAggOp = OperatorOuterClass.Operator
+      .newBuilder()
+      .setPlanId(planId)
+      .addAllChildren(builder.getChildrenList)
+      .setHashAgg(hashAggBuilder)
+      .build()
+    val projection = OperatorOuterClass.Projection
+      .newBuilder()
+      .addAllProjectList(resultExprs.map(_.get).asJava)
+    Some(
+      OperatorOuterClass.Operator
+        .newBuilder()
+        .setPlanId(planId)
+        .addChildren(hashAggOp)
+        .setProjection(projection)
+        .build())
+  }
+
   /**
    * Find the first Comet partial aggregate in the plan. If it reaches a Spark HashAggregate with
    * partial or partial-merge mode, it will return None.
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
index f098040878..3e883d7a32 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
@@ -2109,95 +2109,43 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
-  // Regression for comet#4515: a HashAggregateExec whose `resultExpressions` (and therefore
-  // `output`) catalyst has pruned to empty must still produce 0-col batches at runtime.
-  // Catalyst prunes `resultExpressions=[]` for plans where the aggregate's column values are
-  // unused downstream - classically EXISTS subqueries that get rewritten into a literal-`1`
-  // wrapper. Before the fix, the native HashAggregate emitted its natural output (the
-  // grouping keys) regardless of the pruned JVM `output`, so any boundary that derived a
-  // schema from `output` (e.g. a wrapping vanilla Spark `ShuffleExchangeExec.output =
-  // child.output = []`, or a JVM-bridge Scan synthesized from the same `output`) declared
-  // 0 columns while the runtime RDD produced 1. The mismatch tripped
-  // `NativeUtil.exportBatch` with an ArrayIndexOutOfBoundsException on a length-0
-  // schemaAddrs[].
+  // Regression for comet#4515: catalyst prunes `HashAggregateExec.resultExpressions` to
+  // empty for EXISTS / row-existence-only subqueries. The native HashAggregate's natural
+  // output (the grouping keys) then disagrees with the pruned JVM `output`, leaking through
+  // any boundary that derived its schema from `output`. The fix wraps the aggregate in an
+  // explicit Projection op when natural != declared.
   //
-  // The bug needs a specific catalyst optimizer state: `HashAggregateExec.
-  // resultExpressions.isEmpty`. Whether the optimizer reaches that state from a given SQL
-  // depends on Spark version, scan source (LocalTableScan vs Parquet/native), AQE state,
-  // and which Comet rules already fired - we observed it in the SQL-tests harness running
-  // `subquery/exists-subquery/exists-orderby-limit.sql` on Spark 4.0.2 with parquet-backed
-  // temp views, but not via `Dataset.collect` over `LocalTableScan`-backed temp views
-  // under `CometTestBase`. So the test below writes parquet (matching the harness's scan
-  // shape), tries several known triggers, runs whichever (if any) produces the bug shape
-  // under `checkSparkAnswer`, and skips cleanly otherwise. The upstream SQL-tests run
-  // remains the primary safety net for the harness-only path.
+  // Surfaced upstream in `subquery/exists-subquery/exists-orderby-limit.sql` (query #19,
+  // an EXISTS over `max(...) GROUP BY state LIMIT 1 OFFSET 2`). The exact `EXISTS-in-WHERE`
+  // shape doesn't reproduce under CometTestBase's optimizer state, but `count(*)` over the
+  // same derived aggregate triggers the equivalent ColumnPruning path locally - we assert
+  // the inner HashAgg's resultExpressions actually got pruned, so a future Spark version
+  // that breaks the trigger fails the test loudly rather than passing silently.
   test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output (#4515)") {
     withTempDir { dir =>
-      withTempView("emp", "dept") {
-        // Write parquet so Comet's native scan path (vs LocalTableScan) is the source -
-        // matches the SQL-tests harness setup that surfaced the bug.
-        val empPath = new java.io.File(dir, "emp").getAbsolutePath
-        val deptPath = new java.io.File(dir, "dept").getAbsolutePath
-
-        spark
-          .sql("""SELECT * FROM VALUES
-                 |  (100, 'emp 1', 100.0D, 10),
-                 |  (200, 'emp 2', 200.0D, 10),
-                 |  (300, 'emp 3', 300.0D, 20),
-                 |  (400, 'emp 4', 400.0D, 30),
-                 |  (500, 'emp 5', 400.0D, NULL),
-                 |  (700, 'emp 7', 400.0D, 100),
-                 |  (800, 'emp 8', 150.0D, 70)
-                 |AS t(id, emp_name, salary, dept_id)""".stripMargin)
-          .write
-          .parquet(empPath)
-        spark
-          .sql("""SELECT * FROM VALUES
-                 |  (10, 'CA'), (20, 'NY'), (30, 'TX'),
-                 |  (40, 'OR'), (50, 'NJ'), (70, 'FL')
-                 |AS t(dept_id, state)""".stripMargin)
-          .write
-          .parquet(deptPath)
-
-        spark.read.parquet(empPath).createOrReplaceTempView("emp")
-        spark.read.parquet(deptPath).createOrReplaceTempView("dept")
-
-        val candidates = Seq(
-          // The original failing SQL from the harness - EXISTS with grouped agg + LIMIT/OFFSET.
-          """SELECT * FROM emp
-            |WHERE EXISTS (
-            |  SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin,
-          // Inline view + outer constant: ColumnPruning may strip the inner agg's output.
-          """SELECT 1 FROM (
-            |  SELECT max(dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin,
-          // Scalar subquery returning a constant.
-          """SELECT (SELECT 1 FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin,
-          // count(*) over a derived table: outer doesn't reference inner cols.
+      val deptPath = new Path(dir.toURI.toString, "dept")
+      spark
+        .sql("""SELECT * FROM VALUES
+               |  (10, 'CA'), (20, 'NY'), (30, 'TX'),
+               |  (40, 'OR'), (50, 'NJ'), (70, 'FL')
+               |AS t(dept_id, state)""".stripMargin)
+        .write
+        .parquet(deptPath.toUri.toString)
+      withParquetTable(deptPath.toUri.toString, "dept") {
+        val sql =
           """SELECT count(*) FROM (
-            |  SELECT max(dept_id) AS m FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin)
-
-        // Find a candidate whose plan has a HashAggregateExec (Spark or Comet) with empty
-        // resultExpressions. collectWithSubqueries traverses Subquery nodes too.
-        val triggering = candidates.find { sql =>
-          val plan = spark.sql(sql).queryExecution.executedPlan
-          collectWithSubqueries(plan) {
-            case a: org.apache.spark.sql.execution.aggregate.HashAggregateExec
-                if a.resultExpressions.isEmpty =>
-              a
-            case a: CometHashAggregateExec if a.resultExpressions.isEmpty => a
-          }.nonEmpty
-        }
-
-        triggering match {
-          case Some(sql) => checkSparkAnswer(sql)
-          case None =>
-            cancel(
-              "No candidate query produced a HashAggregateExec with empty resultExpressions " +
-                "in this environment. The catalyst-pruned shape that exercises #4515 only " +
-                "appears under specific optimizer/AQE state we couldn't reproduce here. The " +
-                "upstream SQL-tests run (subquery/exists-subquery/exists-orderby-limit.sql) " +
-                "covers this path.")
-        }
+            |  SELECT max(dept_id) AS m FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin
+        val plan = spark.sql(sql).queryExecution.executedPlan
+        val pruned = collectWithSubqueries(plan) {
+          case a: org.apache.spark.sql.execution.aggregate.HashAggregateExec
+              if a.resultExpressions.isEmpty =>
+            a
+          case a: CometHashAggregateExec if a.resultExpressions.isEmpty => a
+        }
+        assert(
+          pruned.nonEmpty,
+          s"Expected a HashAggregateExec with empty resultExpressions in:\n$plan")
+        checkSparkAnswerAndOperator(sql)
       }
     }
   }

From e8e438f451b52e678ff43f01e259851eef37c232 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 15:12:40 -0400
Subject: [PATCH 09/13] cleanup.

---
 .../src/main/scala/org/apache/spark/sql/comet/operators.scala | 4 ++--
 .../scala/org/apache/comet/exec/CometAggregateSuite.scala     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
index cbd9dc4cbb..f419ca1b28 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -1612,7 +1612,7 @@ trait CometBaseAggregate {
 
         // Final aggregations may carry a result projection (e.g. `COUNT(col) + 1`) that
         // catalyst encodes via `resultExpressions`. Partial / PartialMerge aggregates emit
-        // raw state buffers and never need it. See comet#4515.
+        // raw state buffers and never need it.
         if (mode == CometAggregateMode.Final) {
           buildAggOp(
             builder,
@@ -1640,7 +1640,7 @@ trait CometBaseAggregate {
    * declared - alias renames, `COUNT(col) + 1`, or empty output for catalyst-pruned EXISTS /
    * row-existence-only subqueries - is expressed as a separate Projection above the HashAgg. Both
    * ops share the caller's `plan_id` so the aggregate's native metrics roll up under the same
-   * Spark operator. See comet#4515.
+   * Spark operator.
    */
   private def buildAggOp(
       builder: Operator.Builder,
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
index 3e883d7a32..ae14c68207 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
@@ -2109,7 +2109,7 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
-  // Regression for comet#4515: catalyst prunes `HashAggregateExec.resultExpressions` to
+  // Regression: Catalyst prunes `HashAggregateExec.resultExpressions` to
   // empty for EXISTS / row-existence-only subqueries. The native HashAggregate's natural
   // output (the grouping keys) then disagrees with the pruned JVM `output`, leaking through
   // any boundary that derived its schema from `output`. The fix wraps the aggregate in an
@@ -2121,7 +2121,7 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   // same derived aggregate triggers the equivalent ColumnPruning path locally - we assert
   // the inner HashAgg's resultExpressions actually got pruned, so a future Spark version
   // that breaks the trigger fails the test loudly rather than passing silently.
-  test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output (#4515)") {
+  test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output") {
     withTempDir { dir =>
       val deptPath = new Path(dir.toURI.toString, "dept")
       spark

From 1913208e7b137a95dd4b6002cf24ed7b21c86037 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 17:05:01 -0400
Subject: [PATCH 10/13] fix aggregation wrapping now that we don't have an
 extra CometExecIterator.

---
 native/shuffle/src/shuffle_writer.rs          | 34 +++++++++----------
 .../comet/exec/CometAggregateSuite.scala      | 24 +++++++++++++
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs
index 8502c79624..b96917550f 100644
--- a/native/shuffle/src/shuffle_writer.rs
+++ b/native/shuffle/src/shuffle_writer.rs
@@ -29,7 +29,7 @@ use datafusion::physical_expr::{EquivalenceProperties, Partitioning};
 use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion::physical_plan::EmptyRecordBatchStream;
 use datafusion::{
-    arrow::{datatypes::SchemaRef, error::ArrowError},
+    arrow::datatypes::SchemaRef,
     error::Result,
     execution::context::TaskContext,
     physical_plan::{
@@ -38,7 +38,7 @@ use datafusion::{
         DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream,
     },
 };
-use futures::{StreamExt, TryFutureExt, TryStreamExt};
+use futures::{StreamExt, TryStreamExt};
 use std::{
     any::Any,
     fmt,
@@ -171,23 +171,23 @@ impl ExecutionPlan for ShuffleWriterExec {
         let input = self.input.execute(partition, Arc::clone(&context))?;
         let metrics = ShufflePartitionerMetrics::new(&self.metrics, 0);
 
+        // Propagate DataFusionError unchanged: the JNI bridge only downcasts a single
+        // `DataFusionError::External(SparkError)` layer, so any extra wrap here loses the
+        // typed exception (e.g. SparkArithmeticException on decimal overflow).
         Ok(Box::pin(RecordBatchStreamAdapter::new(
             self.schema(),
-            futures::stream::once(
-                external_shuffle(
-                    input,
-                    partition,
-                    self.output_data_file.clone(),
-                    self.output_index_file.clone(),
-                    self.partitioning.clone(),
-                    metrics,
-                    context,
-                    self.codec.clone(),
-                    self.tracing_enabled,
-                    self.write_buffer_size,
-                )
-                .map_err(|e| ArrowError::ExternalError(Box::new(e))),
-            )
+            futures::stream::once(external_shuffle(
+                input,
+                partition,
+                self.output_data_file.clone(),
+                self.output_index_file.clone(),
+                self.partitioning.clone(),
+                metrics,
+                context,
+                self.codec.clone(),
+                self.tracing_enabled,
+                self.write_buffer_size,
+            ))
             .try_flatten(),
         )))
     }
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
index ae14c68207..3079db842f 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
@@ -1817,6 +1817,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
                 // make sure that the error message throws overflow exception only
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
+                assert(
+                  cometExc.isInstanceOf[ArithmeticException],
+                  "expected ArithmeticException, got " +
+                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ => fail("Exception should be thrown for Long overflow in ANSI mode")
             }
           } else {
@@ -1831,6 +1835,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
+                assert(
+                  cometExc.isInstanceOf[ArithmeticException],
+                  "expected ArithmeticException, got " +
+                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ => fail("Exception should be thrown for Long underflow in ANSI mode")
             }
           } else {
@@ -1870,6 +1878,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
+                assert(
+                  cometExc.isInstanceOf[ArithmeticException],
+                  "expected ArithmeticException, got " +
+                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for decimal overflow in ANSI mode")
             }
@@ -1893,6 +1905,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
+                assert(
+                  cometExc.isInstanceOf[ArithmeticException],
+                  "expected ArithmeticException, got " +
+                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for Long overflow with GROUP BY in ANSI mode")
             }
@@ -1910,6 +1926,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
+                assert(
+                  cometExc.isInstanceOf[ArithmeticException],
+                  "expected ArithmeticException, got " +
+                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for Long underflow with GROUP BY in ANSI mode")
             }
@@ -1951,6 +1971,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
+                assert(
+                  cometExc.isInstanceOf[ArithmeticException],
+                  "expected ArithmeticException, got " +
+                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for decimal overflow with GROUP BY in ANSI mode")
             }

From 327a6535d53ac07c076e1faa1f3610441d4d134e Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 17:44:23 -0400
Subject: [PATCH 11/13] Remove archeology comments.

---
 .../core/src/execution/operators/schema_align.rs | 16 +++++++++-------
 native/core/src/execution/planner.rs             |  3 ---
 .../shuffle/CometShuffleExchangeExec.scala       |  7 +++----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/native/core/src/execution/operators/schema_align.rs b/native/core/src/execution/operators/schema_align.rs
index 3d207e0202..58983d132c 100644
--- a/native/core/src/execution/operators/schema_align.rs
+++ b/native/core/src/execution/operators/schema_align.rs
@@ -16,11 +16,11 @@
 // under the License.
 
 //! `SchemaAlignExec` reshapes its child's output so the per-column Arrow type and field-level
-//! nullability match what Spark catalyst declared. Used between an inlined native subtree and
-//! `ShuffleWriterExec` when the FFI deep-copy + `ScanExec` cast in `build_record_batch` are both
-//! gone, so DataFusion / `datafusion-spark` return-type drift would otherwise be written into
-//! shuffle blocks. See <https://github.com/apache/datafusion-comet/issues/4515> for the running
-//! list of mismatched functions.
+//! nullability match what Spark catalyst declared, casting where necessary. Sits between a native
+//! subtree and `ShuffleWriterExec` so DataFusion / `datafusion-spark` return-type drift is caught
+//! before it reaches shuffle blocks. See
+//! <https://github.com/apache/datafusion-comet/issues/4515> for the running list of mismatched
+//! functions.
 
 use arrow::array::{ArrayRef, RecordBatch, RecordBatchOptions};
 use arrow::compute::{cast_with_options, CastOptions};
@@ -53,6 +53,9 @@ fn warn_dedup() -> &'static Mutex<HashSet<String>> {
     SET.get_or_init(|| Mutex::new(HashSet::new()))
 }
 
+/// Casts each column of `child`'s output to the data_type Spark catalyst declared, widening
+/// nullability to `actual.nullable || expected.nullable`. See
+/// <https://github.com/apache/datafusion-comet/issues/4515>.
 #[derive(Debug)]
 pub struct SchemaAlignExec {
     child: Arc<dyn ExecutionPlan>,
@@ -74,8 +77,7 @@ impl SchemaAlignExec {
     /// Build a SchemaAlignExec that aligns `child`'s output to `expected`. Returns
     /// `Ok(child)` unchanged when no per-column reshape is needed; otherwise wraps `child`
     /// in a SchemaAlignExec whose target schema preserves `expected`'s data_type and metadata
-    /// but widens nullability to `actual.nullable || expected.nullable` (matching the
-    /// reconciliation rule used at the FFI boundary on `main`).
+    /// but widens nullability to `actual.nullable || expected.nullable`.
     pub fn try_new_or_passthrough(
         child: Arc<dyn ExecutionPlan>,
         expected: &SchemaRef,
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index 328ca65760..91d77a35b4 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -1164,9 +1164,6 @@ impl PhysicalPlanner {
                     )?,
                 );
 
-                // HashAggregate emits its natural shape (group keys + agg results); any
-                // post-aggregate projection is serialized as an explicit `OpStruct::Projection`
-                // op above by the JVM serializer (see `CometBaseAggregate.doConvert`)
                 Ok((
                     scans,
                     shuffle_scans,
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
index 861ce6509e..05f3923714 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala
@@ -124,7 +124,7 @@ case class CometShuffleExchangeExec(
           ctx.shuffleScanIndices)
       case None =>
         // Non-native child (e.g. CometSparkToColumnarExec): no subtree to inline. The dep gets
-        // built via the legacy convenience overload below; we just need a real RDD of batches.
+        // built via the convenience overload below; we just need a real RDD of batches.
         child.executeColumnar()
     }
   } else if (shuffleType == CometColumnarShuffle) {
@@ -676,9 +676,8 @@ object CometShuffleExchangeExec
    * Implemented as a thin wrapper around [[prepareNativeShuffleDependency]]: synthesizes a
    * `Scan("ShuffleWriterInput")` as the child native op (so the writer's plan is still
    * `ShuffleWriter -> Scan`, consuming JVM batches via Arrow C Stream), wraps `rdd` as the single
-   * leaf input of a thin scheduling RDD, and supplies a minimal [[NativeExecContext]]. Same wire
-   * shape as before; one writer code path for both this case and the [[CometShuffleExchangeExec]]
-   * case.
+   * leaf input of a thin scheduling RDD, and supplies a minimal [[NativeExecContext]]. Lets the
+   * writer use one code path for both this case and the [[CometShuffleExchangeExec]] case.
    */
   def prepareShuffleDependency(
       rdd: RDD[ColumnarBatch],

From 0b71de43014c683630730d0bf297187d979ba091 Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 18:54:22 -0400
Subject: [PATCH 12/13] Undo stricter tests since they're not happy on Spark
 3.x.

---
 .../comet/exec/CometAggregateSuite.scala      | 33 +++++--------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
index 3079db842f..c441480cec 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
@@ -1805,6 +1805,15 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     (1 to 50).flatMap(_ => Seq((maxDec38_0, 1)))
   }
 
+  // SparkPlan.executeCollect wraps task failures in SparkException("Job aborted..."), so the
+  // typed exception thrown by Comet (SparkArithmeticException, which extends ArithmeticException)
+  // ends up in the cause chain rather than at the top.
+  private def hasArithmeticInChain(t: Throwable): Boolean =
+    Iterator
+      .iterate[Throwable](t)(_.getCause)
+      .takeWhile(_ != null)
+      .exists(_.isInstanceOf[ArithmeticException])
+
   test("ANSI support - SUM function") {
     Seq(true, false).foreach { ansiEnabled =>
       withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) {
@@ -1817,10 +1826,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
                 // make sure that the error message throws overflow exception only
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
-                assert(
-                  cometExc.isInstanceOf[ArithmeticException],
-                  "expected ArithmeticException, got " +
-                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ => fail("Exception should be thrown for Long overflow in ANSI mode")
             }
           } else {
@@ -1835,10 +1840,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
-                assert(
-                  cometExc.isInstanceOf[ArithmeticException],
-                  "expected ArithmeticException, got " +
-                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ => fail("Exception should be thrown for Long underflow in ANSI mode")
             }
           } else {
@@ -1878,10 +1879,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
-                assert(
-                  cometExc.isInstanceOf[ArithmeticException],
-                  "expected ArithmeticException, got " +
-                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for decimal overflow in ANSI mode")
             }
@@ -1905,10 +1902,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
-                assert(
-                  cometExc.isInstanceOf[ArithmeticException],
-                  "expected ArithmeticException, got " +
-                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for Long overflow with GROUP BY in ANSI mode")
             }
@@ -1926,10 +1919,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
-                assert(
-                  cometExc.isInstanceOf[ArithmeticException],
-                  "expected ArithmeticException, got " +
-                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for Long underflow with GROUP BY in ANSI mode")
             }
@@ -1971,10 +1960,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
               case (Some(sparkExc), Some(cometExc)) =>
                 assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
                 assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW"))
-                assert(
-                  cometExc.isInstanceOf[ArithmeticException],
-                  "expected ArithmeticException, got " +
-                    s"${cometExc.getClass.getName}: ${cometExc.getMessage}")
               case _ =>
                 fail("Exception should be thrown for decimal overflow with GROUP BY in ANSI mode")
             }

From 1ecfd8a2001f9a1c5b6313fcca05745f768fdbfb Mon Sep 17 00:00:00 2001
From: Matt Butrovich <mbutrovich@gmail.com>
Date: Fri, 29 May 2026 18:55:19 -0400
Subject: [PATCH 13/13] Remove unintended change.

---
 .../org/apache/comet/exec/CometAggregateSuite.scala      | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
index c441480cec..ae14c68207 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
@@ -1805,15 +1805,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     (1 to 50).flatMap(_ => Seq((maxDec38_0, 1)))
   }
 
-  // SparkPlan.executeCollect wraps task failures in SparkException("Job aborted..."), so the
-  // typed exception thrown by Comet (SparkArithmeticException, which extends ArithmeticException)
-  // ends up in the cause chain rather than at the top.
-  private def hasArithmeticInChain(t: Throwable): Boolean =
-    Iterator
-      .iterate[Throwable](t)(_.getCause)
-      .takeWhile(_ != null)
-      .exists(_.isInstanceOf[ArithmeticException])
-
   test("ANSI support - SUM function") {
     Seq(true, false).foreach { ansiEnabled =>
       withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) {