From 5c4121526a73be75fe3d02bb710bf1029ed1ce5a Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Thu, 28 May 2026 20:21:47 -0400 Subject: [PATCH 01/13] Passes CometFuzzTestSuite, CometNativeShuffleSuite, CometExecSuite. --- .../shuffle/CometNativeShuffleWriter.scala | 341 +++++++++--------- .../shuffle/CometShuffleDependency.scala | 13 +- .../shuffle/CometShuffleExchangeExec.scala | 170 ++++++++- .../shuffle/CometShuffleManager.scala | 3 + .../apache/spark/sql/comet/operators.scala | 320 ++++++++-------- 5 files changed, 528 insertions(+), 319 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index f27d021ac4..68078e8507 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -32,19 +32,34 @@ import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriteMetricsR import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning, SinglePartition} -import org.apache.spark.sql.comet.{CometExec, CometMetricNode} +import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, NativeExecContext, PlanDataInjector} import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.comet.CometConf +import org.apache.comet.{CometConf, CometExecIterator} import org.apache.comet.serde.{OperatorOuterClass, PartitioningOuterClass, QueryPlanSerde} import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, Operator} -import org.apache.comet.serde.QueryPlanSerde.serializeDataType /** - * A [[ShuffleWriter]] that will delegate shuffle write to native shuffle. + * A [[ShuffleWriter]] that drives the native shuffle write in a single [[CometExecIterator]] per + * partition. The unified plan it executes has [[OperatorOuterClass.ShuffleWriter]] at the root + * with `childNativeOp` as its only child. Leaf input iterators come from + * [[CometNativeShuffleInputIterator]] (constructed by [[CometNativeShuffleInputRDD.compute]]). + * + * Two flavors of `childNativeOp` are in use: + * - rich Comet native subtree (e.g. HashAgg / Filter / ShuffleScan), supplied by + * [[CometShuffleExchangeExec]] when its child is a + * [[org.apache.spark.sql.comet.CometNativeExec]]. + * - synthetic `Scan("ShuffleWriterInput")` placeholder, supplied by the convenience overload of + * [[CometShuffleExchangeExec.prepareShuffleDependency]] for callers that already hold an + * `RDD[ColumnarBatch]` of native-driven batches (e.g. + * [[org.apache.spark.sql.comet.CometCollectLimitExec]]). + * + * The writer treats both shapes identically. */ class CometNativeShuffleWriter[K, V]( + childNativeOp: Operator, + childMetricNode: CometMetricNode, + nativeContext: NativeExecContext, outputPartitioning: Partitioning, outputAttributes: Seq[Attribute], metrics: Map[String, SQLMetric], @@ -72,8 +87,22 @@ class CometNativeShuffleWriter[K, V]( val tempDataFilePath = Paths.get(tempDataFilename) val tempIndexFilePath = Paths.get(tempIndexFilename) - // Call native shuffle write - val nativePlan = getNativePlan(tempDataFilename, tempIndexFilename) + // Pull the per-partition leaf iterators and partition index from the iterator handed to us + // by Spark's ShuffleMapTask. CometNativeShuffleInputRDD.compute always returns this exact + // iterator type; no other RDD layers between produce a Product2[Int, ColumnarBatch]. + val shuffleInputIter = inputs.asInstanceOf[CometNativeShuffleInputIterator] + val partitionIdx = shuffleInputIter.partitionIndex + val leafIterators = shuffleInputIter.leafIterators + val shuffleBlockIters = shuffleInputIter.shuffleBlockIterators + + val unifiedPlan = buildUnifiedPlan(tempDataFilename, tempIndexFilename) + val finalNativePlan = if (nativeContext.commonByKey.nonEmpty) { + val partitionDataByKey = + nativeContext.perPartitionByKey.map { case (k, arr) => k -> arr(partitionIdx) } + PlanDataInjector.injectPlanData(unifiedPlan, nativeContext.commonByKey, partitionDataByKey) + } else { + unifiedPlan + } val detailedMetrics = Seq( "elapsed_compute", @@ -82,29 +111,42 @@ class CometNativeShuffleWriter[K, V]( "input_batches", "spill_count", "spilled_bytes") - - // Maps native metrics to SQL metrics val metricsOutputRows = new SQLMetric("outputRows") val metricsWriteTime = new SQLMetric("writeTime") - val nativeSQLMetrics = Map( + val shuffleWriterSQLMetrics = Map( "output_rows" -> metricsOutputRows, "data_size" -> metrics("dataSize"), "write_time" -> metricsWriteTime) ++ metrics.filterKeys(detailedMetrics.contains) - val nativeMetrics = CometMetricNode(nativeSQLMetrics) - // Getting rid of the fake partitionId - val newInputs = inputs.asInstanceOf[Iterator[_ <: Product2[Any, Any]]].map(_._2) + // ShuffleWriter metrics live at the root of the metric tree; the child operator's metric + // tree (rich subtree or empty leaf for the Scan placeholder) is attached underneath so the + // SQL UI sees the same per-node breakdown the split-driver flow produced. + val nativeMetrics = CometMetricNode(shuffleWriterSQLMetrics, Seq(childMetricNode)) - val cometIter = CometExec.getCometIterator( - Seq(newInputs.asInstanceOf[Iterator[ColumnarBatch]]), + val cometIter = new CometExecIterator( + CometExec.newIterId, + leafIterators, outputAttributes.length, - nativePlan, + PlanDataInjector.serializeOperator(finalNativePlan), nativeMetrics, numParts, - context.partitionId(), - broadcastedHadoopConfForEncryption = None, - encryptedFilePaths = Seq.empty) + partitionIdx, + nativeContext.broadcastedHadoopConfForEncryption, + nativeContext.encryptedFilePaths, + shuffleBlockIters) + + // Register subqueries against the iterator id so native callbacks resolve them to values. + nativeContext.subqueries.foreach { sub => + CometScalarSubquery.setSubquery(cometIter.id, sub) + } + Option(context).foreach { taskCtx => + taskCtx.addTaskCompletionListener[Unit] { _ => + nativeContext.subqueries.foreach { sub => + CometScalarSubquery.removeSubquery(cometIter.id, sub) + } + } + } while (cometIter.hasNext) { cometIter.next() @@ -134,7 +176,7 @@ class CometNativeShuffleWriter[K, V]( // Report spill metrics to Spark's task metrics so they appear in // Spark UI task summaries (not just SQL metrics) - val spilledBytes = nativeSQLMetrics.get("spilled_bytes").map(_.value).getOrElse(0L) + val spilledBytes = shuffleWriterSQLMetrics.get("spilled_bytes").map(_.value).getOrElse(0L) if (spilledBytes > 0) { context.taskMetrics().incMemoryBytesSpilled(spilledBytes) context.taskMetrics().incDiskBytesSpilled(spilledBytes) @@ -162,163 +204,138 @@ class CometNativeShuffleWriter[K, V]( case _ => false } - private def getNativePlan(dataFile: String, indexFile: String): Operator = { - val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput") - val opBuilder = OperatorOuterClass.Operator.newBuilder() - - val scanTypes = outputAttributes.flatten { attr => - serializeDataType(attr.dataType) + /** + * Build the unified `ShuffleWriter(child = childNativeOp)` plan with the partitioning serde, + * compression settings, and output file paths. + */ + private def buildUnifiedPlan(dataFile: String, indexFile: String): Operator = { + val shuffleWriterBuilder = OperatorOuterClass.ShuffleWriter.newBuilder() + shuffleWriterBuilder.setOutputDataFile(dataFile) + shuffleWriterBuilder.setOutputIndexFile(indexFile) + + if (SparkEnv.get.conf.getBoolean("spark.shuffle.compress", true)) { + val codec = CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_CODEC.get() match { + case "zstd" => CompressionCodec.Zstd + case "lz4" => CompressionCodec.Lz4 + case "snappy" => CompressionCodec.Snappy + case other => throw new UnsupportedOperationException(s"invalid codec: $other") + } + shuffleWriterBuilder.setCodec(codec) + } else { + shuffleWriterBuilder.setCodec(CompressionCodec.None) } - - if (scanTypes.length == outputAttributes.length) { - scanBuilder.addAllFields(scanTypes.asJava) - - val shuffleWriterBuilder = OperatorOuterClass.ShuffleWriter.newBuilder() - shuffleWriterBuilder.setOutputDataFile(dataFile) - shuffleWriterBuilder.setOutputIndexFile(indexFile) - - if (SparkEnv.get.conf.getBoolean("spark.shuffle.compress", true)) { - val codec = CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_CODEC.get() match { - case "zstd" => CompressionCodec.Zstd - case "lz4" => CompressionCodec.Lz4 - case "snappy" => CompressionCodec.Snappy - case other => throw new UnsupportedOperationException(s"invalid codec: $other") + shuffleWriterBuilder.setCompressionLevel( + CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get) + shuffleWriterBuilder.setWriteBufferSize( + CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().min(Int.MaxValue).toInt) + + outputPartitioning match { + case p if isSinglePartitioning(p) => + val partitioning = PartitioningOuterClass.SinglePartition.newBuilder() + val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() + shuffleWriterBuilder.setPartitioning( + partitioningBuilder.setSinglePartition(partitioning).build()) + case _: HashPartitioning => + val hashPartitioning = outputPartitioning.asInstanceOf[HashPartitioning] + val partitioning = PartitioningOuterClass.HashPartition.newBuilder() + partitioning.setNumPartitions(outputPartitioning.numPartitions) + + val partitionExprs = hashPartitioning.expressions + .flatMap(e => QueryPlanSerde.exprToProto(e, outputAttributes)) + + if (partitionExprs.length != hashPartitioning.expressions.length) { + throw new UnsupportedOperationException( + s"Partitioning $hashPartitioning is not supported.") } - shuffleWriterBuilder.setCodec(codec) - } else { - shuffleWriterBuilder.setCodec(CompressionCodec.None) - } - shuffleWriterBuilder.setCompressionLevel( - CometConf.COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL.get) - shuffleWriterBuilder.setWriteBufferSize( - CometConf.COMET_SHUFFLE_WRITE_BUFFER_SIZE.get().min(Int.MaxValue).toInt) - - outputPartitioning match { - case p if isSinglePartitioning(p) => - val partitioning = PartitioningOuterClass.SinglePartition.newBuilder() - val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() - shuffleWriterBuilder.setPartitioning( - partitioningBuilder.setSinglePartition(partitioning).build()) - case _: HashPartitioning => - val hashPartitioning = outputPartitioning.asInstanceOf[HashPartitioning] - - val partitioning = PartitioningOuterClass.HashPartition.newBuilder() - partitioning.setNumPartitions(outputPartitioning.numPartitions) + partitioning.addAllHashExpression(partitionExprs.asJava) + + val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() + shuffleWriterBuilder.setPartitioning( + partitioningBuilder.setHashPartition(partitioning).build()) + case _: RangePartitioning => + val rangePartitioning = outputPartitioning.asInstanceOf[RangePartitioning] + val partitioning = PartitioningOuterClass.RangePartition.newBuilder() + partitioning.setNumPartitions(outputPartitioning.numPartitions) + + // Detect duplicates by tracking expressions directly, similar to DataFusion's LexOrdering + // DataFusion will deduplicate identical sort expressions in LexOrdering, + // so we need to transform boundary rows to match the deduplicated structure + val seenExprs = mutable.HashSet[Expression]() + val deduplicationMap = mutable.ArrayBuffer[(Int, Boolean)]() + + rangePartitioning.ordering.zipWithIndex.foreach { case (sortOrder, idx) => + if (seenExprs.contains(sortOrder.child)) { + deduplicationMap += (idx -> false) + } else { + seenExprs += sortOrder.child + deduplicationMap += (idx -> true) + } + } - val partitionExprs = hashPartitioning.expressions + { + val orderingExprs = rangePartitioning.ordering .flatMap(e => QueryPlanSerde.exprToProto(e, outputAttributes)) - - if (partitionExprs.length != hashPartitioning.expressions.length) { + if (orderingExprs.length != rangePartitioning.ordering.length) { throw new UnsupportedOperationException( - s"Partitioning $hashPartitioning is not supported.") - } - - partitioning.addAllHashExpression(partitionExprs.asJava) - - val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() - shuffleWriterBuilder.setPartitioning( - partitioningBuilder.setHashPartition(partitioning).build()) - case _: RangePartitioning => - val rangePartitioning = outputPartitioning.asInstanceOf[RangePartitioning] - - val partitioning = PartitioningOuterClass.RangePartition.newBuilder() - partitioning.setNumPartitions(outputPartitioning.numPartitions) - - // Detect duplicates by tracking expressions directly, similar to DataFusion's LexOrdering - // DataFusion will deduplicate identical sort expressions in LexOrdering, - // so we need to transform boundary rows to match the deduplicated structure - val seenExprs = mutable.HashSet[Expression]() - val deduplicationMap = mutable.ArrayBuffer[(Int, Boolean)]() // (originalIndex, isKept) - - rangePartitioning.ordering.zipWithIndex.foreach { case (sortOrder, idx) => - if (seenExprs.contains(sortOrder.child)) { - deduplicationMap += (idx -> false) // Will be deduplicated by DataFusion - } else { - seenExprs += sortOrder.child - deduplicationMap += (idx -> true) // Will be kept by DataFusion - } - } - - { - // Serialize the ordering expressions for comparisons - val orderingExprs = rangePartitioning.ordering - .flatMap(e => QueryPlanSerde.exprToProto(e, outputAttributes)) - if (orderingExprs.length != rangePartitioning.ordering.length) { - throw new UnsupportedOperationException( - s"Partitioning $rangePartitioning is not supported.") - } - partitioning.addAllSortOrders(orderingExprs.asJava) + s"Partitioning $rangePartitioning is not supported.") } + partitioning.addAllSortOrders(orderingExprs.asJava) + } - // Convert Spark's sequence of InternalRows that represent partitioning boundaries to - // sequences of Literals, where each outer entry represents a boundary row, and each - // internal entry is a value in that row. In other words, these are stored in row major - // order, not column major - val boundarySchema = rangePartitioning.ordering.flatMap(e => Some(e.dataType)) - - // Transform boundary rows to match DataFusion's deduplicated structure - val transformedBoundaryExprs: Seq[Seq[Literal]] = - rangePartitionBounds.get.map((row: InternalRow) => { - // For every InternalRow, map its values to Literals - val allLiterals = - row.toSeq(boundarySchema).zip(boundarySchema).map { case (value, valueType) => - Literal(value, valueType) - } - - // Keep only the literals that correspond to non-deduplicated expressions - allLiterals - .zip(deduplicationMap) - .filter(_._2._2) // Keep only where isKept = true - .map(_._1) // Extract the literal + val boundarySchema = rangePartitioning.ordering.flatMap(e => Some(e.dataType)) + + val transformedBoundaryExprs: Seq[Seq[Literal]] = + rangePartitionBounds.get.map((row: InternalRow) => { + val allLiterals = + row.toSeq(boundarySchema).zip(boundarySchema).map { case (value, valueType) => + Literal(value, valueType) + } + allLiterals + .zip(deduplicationMap) + .filter(_._2._2) + .map(_._1) + }) + + { + val boundaryRows: Seq[PartitioningOuterClass.BoundaryRow] = transformedBoundaryExprs + .map((rowLiterals: Seq[Literal]) => { + val rowBuilder = PartitioningOuterClass.BoundaryRow.newBuilder(); + val serializedExprs = + rowLiterals.map(lit_value => + QueryPlanSerde.exprToProto(lit_value, outputAttributes).get) + rowBuilder.addAllPartitionBounds(serializedExprs.asJava) + rowBuilder.build() }) + partitioning.addAllBoundaryRows(boundaryRows.asJava) + } - { - // Convert the sequences of Literals to a collection of serialized BoundaryRows - val boundaryRows: Seq[PartitioningOuterClass.BoundaryRow] = transformedBoundaryExprs - .map((rowLiterals: Seq[Literal]) => { - // Serialize each sequence of Literals as a BoundaryRow - val rowBuilder = PartitioningOuterClass.BoundaryRow.newBuilder(); - val serializedExprs = - rowLiterals.map(lit_value => - QueryPlanSerde.exprToProto(lit_value, outputAttributes).get) - rowBuilder.addAllPartitionBounds(serializedExprs.asJava) - rowBuilder.build() - }) - partitioning.addAllBoundaryRows(boundaryRows.asJava) - } - - val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() - shuffleWriterBuilder.setPartitioning( - partitioningBuilder.setRangePartition(partitioning).build()) + val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() + shuffleWriterBuilder.setPartitioning( + partitioningBuilder.setRangePartition(partitioning).build()) - case _: RoundRobinPartitioning => - val partitioning = PartitioningOuterClass.RoundRobinPartition.newBuilder() - partitioning.setNumPartitions(outputPartitioning.numPartitions) - partitioning.setMaxHashColumns( - CometConf.COMET_EXEC_SHUFFLE_WITH_ROUND_ROBIN_PARTITIONING_MAX_HASH_COLUMNS.get()) + case _: RoundRobinPartitioning => + val partitioning = PartitioningOuterClass.RoundRobinPartition.newBuilder() + partitioning.setNumPartitions(outputPartitioning.numPartitions) + partitioning.setMaxHashColumns( + CometConf.COMET_EXEC_SHUFFLE_WITH_ROUND_ROBIN_PARTITIONING_MAX_HASH_COLUMNS.get()) - val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() - shuffleWriterBuilder.setPartitioning( - partitioningBuilder.setRoundRobinPartition(partitioning).build()) + val partitioningBuilder = PartitioningOuterClass.Partitioning.newBuilder() + shuffleWriterBuilder.setPartitioning( + partitioningBuilder.setRoundRobinPartition(partitioning).build()) - case _ => - throw new UnsupportedOperationException( - s"Partitioning $outputPartitioning is not supported.") - } + case _ => + throw new UnsupportedOperationException( + s"Partitioning $outputPartitioning is not supported.") + } - shuffleWriterBuilder.setTracingEnabled(CometConf.COMET_TRACING_ENABLED.get()) + shuffleWriterBuilder.setTracingEnabled(CometConf.COMET_TRACING_ENABLED.get()) - val shuffleWriterOpBuilder = OperatorOuterClass.Operator.newBuilder() - shuffleWriterOpBuilder - .setShuffleWriter(shuffleWriterBuilder) - .addChildren(opBuilder.setScan(scanBuilder).build()) - .build() - } else { - // There are unsupported scan type - throw new UnsupportedOperationException( - s"$outputAttributes contains unsupported data types for CometShuffleExchangeExec.") - } + OperatorOuterClass.Operator + .newBuilder() + .setShuffleWriter(shuffleWriterBuilder) + .addChildren(childNativeOp) + .build() } override def stop(success: Boolean): Option[MapStatus] = { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala index 2b74e5a168..9ec25c49ec 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala @@ -28,11 +28,19 @@ import org.apache.spark.shuffle.ShuffleWriteProcessor import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning +import org.apache.spark.sql.comet.{CometMetricNode, NativeExecContext} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types.StructType +import org.apache.comet.serde.OperatorOuterClass + /** * A [[ShuffleDependency]] that allows us to identify the shuffle dependency as a Comet shuffle. + * + * On the native-shuffle path, also carries the child plan's per-partition execution context, root + * native operator, and metric node so [[CometNativeShuffleWriter]] can drive the unified + * `ShuffleWriter(child = childNativeOp)` plan in a single [[org.apache.comet.CometExecIterator]] + * per partition. These three fields are populated only when `shuffleType == CometNativeShuffle`. */ class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag]( @transient private val _rdd: RDD[_ <: Product2[K, V]], @@ -49,7 +57,10 @@ class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag]( val outputAttributes: Seq[Attribute] = Seq.empty, val shuffleWriteMetrics: Map[String, SQLMetric] = Map.empty, val numParts: Int = 0, - val rangePartitionBounds: Option[Seq[InternalRow]] = None) + val rangePartitionBounds: Option[Seq[InternalRow]] = None, + val nativeExecContext: Option[NativeExecContext] = None, + val childNativeOp: Option[OperatorOuterClass.Operator] = None, + val childMetricNode: Option[CometMetricNode] = None) extends ShuffleDependency[K, V, C]( _rdd, partitioner, diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala index 493c20f8b7..16bd40d402 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, Exp import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.comet.{CometMetricNode, CometNativeExec, CometPlan, CometSinkPlaceHolder} +import org.apache.spark.sql.comet.{CometMetricNode, CometNativeExec, CometPlan, CometSinkPlaceHolder, NativeExecContext} import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleExchangeExec, ShuffleExchangeLike, ShuffleOrigin} @@ -101,9 +101,38 @@ case class CometShuffleExchangeExec( private lazy val serializer: Serializer = new UnsafeRowSerializer(child.output.size, longMetric("dataSize")) + /** + * Per-partition execution context for the child native subtree, computed once and shared + * between [[inputRDD]] (which uses it to wire DAGScheduler dependencies) and + * [[shuffleDependency]] (which threads it through to [[CometNativeShuffleWriter]] for + * single-iterator native execution). Only populated when `shuffleType == CometNativeShuffle` + * AND the child is a [[CometNativeExec]] subtree we can inline. When the child is a non-native + * Comet plan (e.g. [[org.apache.spark.sql.comet.CometSparkToColumnarExec]]), this stays `None` + * and the shuffle falls back to the legacy `Scan("ShuffleWriterInput") -> ShuffleWriter` plan + * via the convenience overload of `prepareShuffleDependency`. + */ + @transient private lazy val nativeChildContext: Option[NativeExecContext] = child match { + case nativeChild: CometNativeExec if shuffleType == CometNativeShuffle => + Some(nativeChild.buildNativeContext()) + case _ => None + } + @transient lazy val inputRDD: RDD[_] = if (shuffleType == CometNativeShuffle) { - // CometNativeShuffle assumes that the input plan is Comet plan. - child.executeColumnar() + nativeChildContext match { + case Some(ctx) => + // Single-driver path: thin scheduling anchor; CometNativeShuffleWriter drives the + // unified ShuffleWriter + child plan in a single CometExecIterator per partition. + new CometNativeShuffleInputRDD( + sparkContext, + ctx.inputs, + ctx.numPartitions, + ctx.shuffleScanIndices) + case None => + // Child is a Comet plan but not a CometNativeExec subtree (e.g. CometSparkToColumnarExec). + // No native subtree to inline; the writer's plan is `Scan("ShuffleWriterInput") -> + // ShuffleWriter` and JVM batches flow into native through Arrow C Stream Interface. + child.executeColumnar() + } } else if (shuffleType == CometColumnarShuffle) { // CometColumnarShuffle uses Spark's row-based execute() API. For Spark row-based plans, // rows flow directly. For Comet native plans, their doExecute() wraps with ColumnarToRowExec @@ -149,12 +178,36 @@ case class CometShuffleExchangeExec( @transient lazy val shuffleDependency: ShuffleDependency[Int, _, _] = if (shuffleType == CometNativeShuffle) { - val dep = CometShuffleExchangeExec.prepareShuffleDependency( - inputRDD.asInstanceOf[RDD[ColumnarBatch]], - child.output, - outputPartitioning, - serializer, - metrics) + val dep = nativeChildContext match { + case Some(ctx) => + // Single-driver path: child is a CometNativeExec subtree. RangePartitioning needs real + // rows to compute partition bounds; use a regular columnar execution of the child for + // sampling only. The actual shuffle still goes through the single-iterator path. + val nativeChild = child.asInstanceOf[CometNativeExec] + val samplingRDD: Option[RDD[ColumnarBatch]] = outputPartitioning match { + case _: RangePartitioning => Some(child.executeColumnar()) + case _ => None + } + CometShuffleExchangeExec.prepareNativeShuffleDependency( + inputRDD.asInstanceOf[CometNativeShuffleInputRDD], + samplingRDD, + child.output, + outputPartitioning, + serializer, + metrics, + ctx, + nativeChild.nativeOp, + CometMetricNode.fromCometPlan(nativeChild)) + case None => + // Child is a non-native Comet plan; the writer falls back to its Scan-placeholder + // path via the convenience overload of prepareShuffleDependency. + CometShuffleExchangeExec.prepareShuffleDependency( + inputRDD.asInstanceOf[RDD[ColumnarBatch]], + child.output, + outputPartitioning, + serializer, + metrics) + } metrics("numPartitions").set(dep.partitioner.numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates( @@ -623,21 +676,107 @@ object CometShuffleExchangeExec } } + /** + * Build a Comet native shuffle dependency around an existing `RDD[ColumnarBatch]` of real + * batches. Used by [[org.apache.spark.sql.comet.CometCollectLimitExec]] and + * [[org.apache.spark.sql.comet.CometTakeOrderedAndProjectExec]] where the input is the result + * of a local-limit / topK transform and there is no separate child native subtree to inline. + * + * Implemented as a thin wrapper around [[prepareNativeShuffleDependency]]: synthesizes a + * `Scan("ShuffleWriterInput")` as the child native op (so the writer's plan is still + * `ShuffleWriter -> Scan`, consuming JVM batches via Arrow C Stream), wraps `rdd` as the single + * leaf input of a thin scheduling RDD, and supplies a minimal [[NativeExecContext]]. Same wire + * shape as before; one writer code path for both this case and the [[CometShuffleExchangeExec]] + * case. + */ def prepareShuffleDependency( rdd: RDD[ColumnarBatch], outputAttributes: Seq[Attribute], outputPartitioning: Partitioning, serializer: Serializer, metrics: Map[String, SQLMetric]): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { - val numParts = rdd.getNumPartitions + + val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput") + val scanTypes = outputAttributes.flatten { attr => + QueryPlanSerde.serializeDataType(attr.dataType) + } + if (scanTypes.length != outputAttributes.length) { + throw new UnsupportedOperationException( + s"$outputAttributes contains unsupported data types for CometShuffleExchangeExec.") + } + scanBuilder.addAllFields(scanTypes.asJava) + val scanOp = OperatorOuterClass.Operator.newBuilder().setScan(scanBuilder).build() + + val thinRDD = new CometNativeShuffleInputRDD( + rdd.sparkContext, + Seq(rdd), + rdd.getNumPartitions, + shuffleScanIndices = Set.empty) + + val ctx = NativeExecContext( + inputs = Seq(rdd), + numPartitions = rdd.getNumPartitions, + subqueries = Seq.empty, + broadcastedHadoopConfForEncryption = None, + encryptedFilePaths = Seq.empty, + commonByKey = Map.empty, + perPartitionByKey = Map.empty, + shuffleScanIndices = Set.empty, + hasScanInput = false) + + // The Scan placeholder has no per-operator metrics, so the metric tree for the unified plan + // is `shuffleWriterMetrics` at the root with one empty leaf for the Scan child. + prepareNativeShuffleDependency( + thinRDD, + Some(rdd), + outputAttributes, + outputPartitioning, + serializer, + metrics, + ctx, + scanOp, + CometMetricNode(Map.empty)) + } + + /** + * Build a Comet native shuffle dependency for the [[CometShuffleExchangeExec]] case where the + * shuffle is fed by a [[CometNativeExec]] child. The writer drives the unified + * `ShuffleWriter(child = childNativeOp)` plan in a single + * [[org.apache.comet.CometExecIterator]] per partition. The returned dep carries the child's + * per-partition execution context, root native operator, and metric node so + * [[CometNativeShuffleWriter]] can reach them at task time. + * + * @param thinRDD + * scheduling-anchor RDD whose `compute` returns a [[CometNativeShuffleInputIterator]]; + * produces no batches itself. + * @param samplingRDD + * regular columnar execution of the child, only required for [[RangePartitioning]] (sampling + * needs real rows). `None` for hash / single / round-robin. + */ + def prepareNativeShuffleDependency( + thinRDD: CometNativeShuffleInputRDD, + samplingRDD: Option[RDD[ColumnarBatch]], + outputAttributes: Seq[Attribute], + outputPartitioning: Partitioning, + serializer: Serializer, + metrics: Map[String, SQLMetric], + nativeExecContext: NativeExecContext, + childNativeOp: OperatorOuterClass.Operator, + childMetricNode: CometMetricNode): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { + val numParts = thinRDD.getNumPartitions // The code block below is mostly brought over from // ShuffleExchangeExec::prepareShuffleDependency val (partitioner, rangePartitionBounds) = outputPartitioning match { case rangePartitioning: RangePartitioning => + // Sampling needs real rows; use the dedicated samplingRDD (a regular columnar execution + // of the child). The thin RDD itself yields nothing. + val samplingInput = samplingRDD.getOrElse( + throw new IllegalStateException( + "RangePartitioning requires a samplingRDD on the native-shuffle path")) // Extract only fields used for sorting to avoid collecting large fields that does not // affect sorting result when deciding partition bounds in RangePartitioner - val rddForSampling = rdd.mapPartitionsInternal { iter => + val rddForSampling = samplingInput.mapPartitionsInternal { iter => val projection = UnsafeProjection.create(rangePartitioning.ordering.map(_.child), outputAttributes) val mutablePair = new MutablePair[InternalRow, Null]() @@ -683,9 +822,7 @@ object CometShuffleExchangeExec } val dependency = new CometShuffleDependency[Int, ColumnarBatch, ColumnarBatch]( - rdd.map( - (0, _) - ), // adding fake partitionId that is always 0 because ShuffleDependency requires it + thinRDD, serializer = serializer, shuffleWriterProcessor = ShuffleExchangeExec.createShuffleWriteProcessor(metrics), shuffleType = CometNativeShuffle, @@ -695,7 +832,10 @@ object CometShuffleExchangeExec outputAttributes = outputAttributes, shuffleWriteMetrics = metrics, numParts = numParts, - rangePartitionBounds = rangePartitionBounds) + rangePartitionBounds = rangePartitionBounds, + nativeExecContext = Some(nativeExecContext), + childNativeOp = Some(childNativeOp), + childMetricNode = Some(childMetricNode)) dependency } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala index c8f2199d53..16a21bbfd9 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala @@ -231,6 +231,9 @@ class CometShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { case cometShuffleHandle: CometNativeShuffleHandle[K @unchecked, V @unchecked] => val dep = cometShuffleHandle.dependency.asInstanceOf[CometShuffleDependency[_, _, _]] new CometNativeShuffleWriter( + dep.childNativeOp.get, + dep.childMetricNode.get, + dep.nativeExecContext.get, dep.outputPartitioning.get, dep.outputAttributes, dep.shuffleWriteMetrics, diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 7d5398ae62..748362cd65 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -381,6 +381,29 @@ object CometExec { } } +/** + * Per-partition execution context for a native subtree rooted at a [[CometNativeExec]] boundary. + * + * Built once on the driver from the SparkPlan tree and consumed by either: + * - [[CometNativeExec.doExecuteColumnar]] to construct a [[CometExecRDD]], or + * - the native-shuffle path, where [[CometNativeShuffleWriter]] drives the same child plan with + * a `ShuffleWriter` operator as its root. + * + * The fields capture everything that depends on tree-walking the SparkPlan and aligning leaf + * input RDDs (broadcast partition counts, plan-data, subqueries, encryption options) so callers + * do not have to re-derive them. + */ +private[comet] case class NativeExecContext( + inputs: Seq[RDD[ColumnarBatch]], + numPartitions: Int, + subqueries: Seq[ScalarSubquery], + broadcastedHadoopConfForEncryption: Option[Broadcast[SerializableConfiguration]], + encryptedFilePaths: Seq[String], + commonByKey: Map[String, Array[Byte]], + perPartitionByKey: Map[String, Array[Array[Byte]]], + shuffleScanIndices: Set[Int], + hasScanInput: Boolean) + /** * A Comet native physical operator. */ @@ -426,157 +449,30 @@ abstract class CometNativeExec extends CometExec { throw new CometRuntimeException( s"CometNativeExec should not be executed directly without a serialized plan: $this") case Some(serializedPlan) => - val serializedPlanCopy = serializedPlan // TODO: support native metrics for all operators. val nativeMetrics = CometMetricNode.fromCometPlan(this) + val ctx = buildNativeContext() - // Go over all the native scans, in order to see if they need encryption options. - // For each relation in a CometNativeScan generate a hadoopConf, - // for each file path in a relation associate with hadoopConf - // This is done per native plan, so only count scans until a comet input is reached. - val encryptionOptions = - mutable.ArrayBuffer.empty[(Broadcast[SerializableConfiguration], Seq[String])] - foreachUntilCometInput(this) { - case scan: CometNativeScanExec => - // This creates a hadoopConf that brings in any SQLConf "spark.hadoop.*" configs and - // per-relation configs since different tables might have different decryption - // properties. - val hadoopConf = scan.relation.sparkSession.sessionState - .newHadoopConfWithOptions(scan.relation.options) - val encryptionEnabled = CometParquetUtils.encryptionEnabled(hadoopConf) - if (encryptionEnabled) { - // hadoopConf isn't serializable, so we have to do a broadcasted config. - val broadcastedConf = - scan.relation.sparkSession.sparkContext - .broadcast(new SerializableConfiguration(hadoopConf)) - - val optsTuple: (Broadcast[SerializableConfiguration], Seq[String]) = - (broadcastedConf, scan.relation.inputFiles.toSeq) - encryptionOptions += optsTuple - } - case _ => // no-op - } - assert( - encryptionOptions.size <= 1, - "We expect one native scan that requires encryption reading in a Comet plan," + - " since we will broadcast one hadoopConf.") - // If this assumption changes in the future, you can look at the commit history of #2447 - // to see how there used to be a map of relations to broadcasted confs in case multiple - // relations in a single plan. The example that came up was UNION. See discussion at: - // https://github.com/apache/datafusion-comet/pull/2447#discussion_r2406118264 - val (broadcastedHadoopConfForEncryption, encryptedFilePaths) = - encryptionOptions.headOption match { - case Some((conf, paths)) => (Some(conf), paths) - case None => (None, Seq.empty) - } - - // Find planning data within this stage (stops at shuffle boundaries). - val (commonByKey, perPartitionByKey) = findAllPlanData(this) - - // Collect the input ColumnarBatches from the child operators and create a CometExecIterator - // to execute the native plan. - val sparkPlans = ArrayBuffer.empty[SparkPlan] - val inputs = ArrayBuffer.empty[RDD[ColumnarBatch]] - - foreachUntilCometInput(this)(sparkPlans += _) - - // Find the first non broadcast plan - val firstNonBroadcastPlan = sparkPlans.zipWithIndex.find { - case (_: CometBroadcastExchangeExec, _) => false - case (BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _), _) => false - case (BroadcastQueryStageExec(_, _: ReusedExchangeExec, _), _) => false - case (ReusedExchangeExec(_, _: CometBroadcastExchangeExec), _) => false - case _ => true - } - - val containsBroadcastInput = sparkPlans.exists { - case _: CometBroadcastExchangeExec => true - case BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _) => true - case BroadcastQueryStageExec(_, _: ReusedExchangeExec, _) => true - case ReusedExchangeExec(_, _: CometBroadcastExchangeExec) => true - case _ => false - } - - // If the first non broadcast plan is not found, it means all the plans are broadcast plans. - // This is not expected, so throw an exception. - if (containsBroadcastInput && firstNonBroadcastPlan.isEmpty) { - throw new CometRuntimeException(s"Cannot find the first non broadcast plan: $this") - } - - // If the first non broadcast plan is found, we need to adjust the partition number of - // the broadcast plans to make sure they have the same partition number as the first non - // broadcast plan. - val (firstNonBroadcastPlanRDD, firstNonBroadcastPlanNumPartitions) = - firstNonBroadcastPlan.get._1 match { - case plan: CometNativeExec => - (null, plan.outputPartitioning.numPartitions) - case plan => - val rdd = plan.executeColumnar() - (rdd, rdd.getNumPartitions) - } - - // Spark doesn't need to zip Broadcast RDDs, so it doesn't schedule Broadcast RDDs with - // same partition number. But for Comet, we need to zip them so we need to adjust the - // partition number of Broadcast RDDs to make sure they have the same partition number. - sparkPlans.zipWithIndex.foreach { case (plan, idx) => - plan match { - case c: CometBroadcastExchangeExec => - inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) - case BroadcastQueryStageExec(_, c: CometBroadcastExchangeExec, _) => - inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) - case ReusedExchangeExec(_, c: CometBroadcastExchangeExec) => - inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) - case BroadcastQueryStageExec( - _, - ReusedExchangeExec(_, c: CometBroadcastExchangeExec), - _) => - inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) - case _: CometNativeExec => - // no-op - case _ if idx == firstNonBroadcastPlan.get._2 => - inputs += firstNonBroadcastPlanRDD - case _ => - val rdd = plan.executeColumnar() - if (rdd.getNumPartitions != firstNonBroadcastPlanNumPartitions) { - throw new CometRuntimeException( - s"Partition number mismatch: ${rdd.getNumPartitions} != " + - s"$firstNonBroadcastPlanNumPartitions") - } else { - inputs += rdd - } - } - } - - if (inputs.isEmpty && !sparkPlans.forall(_.isInstanceOf[CometNativeExec])) { - throw new CometRuntimeException(s"No input for CometNativeExec:\n $this") - } - - // Detect ShuffleScan indices for direct read in CometExecRDD - val shuffleScanIndices = findShuffleScanIndices(serializedPlanCopy) - - // Unified RDD creation - CometExecRDD handles all cases - val subqueries = collectSubqueries(this) - val hasScanInput = sparkPlans.exists(_.isInstanceOf[CometNativeScanExec]) new CometExecRDD( sparkContext, - inputs.toSeq, - commonByKey, - perPartitionByKey, - serializedPlanCopy, - firstNonBroadcastPlanNumPartitions, + ctx.inputs, + ctx.commonByKey, + ctx.perPartitionByKey, + serializedPlan, + ctx.numPartitions, output.length, nativeMetrics, - subqueries, - broadcastedHadoopConfForEncryption, - encryptedFilePaths, - shuffleScanIndices) { + ctx.subqueries, + ctx.broadcastedHadoopConfForEncryption, + ctx.encryptedFilePaths, + ctx.shuffleScanIndices) { override def compute( split: Partition, context: TaskContext): Iterator[ColumnarBatch] = { val res = super.compute(split, context) // Report scan input metrics only when the native plan contains a scan. - if (hasScanInput) { + if (ctx.hasScanInput) { Option(context).foreach(nativeMetrics.reportScanInputMetrics) } @@ -586,6 +482,149 @@ abstract class CometNativeExec extends CometExec { } } + /** + * Walk this CometNativeExec subtree once and gather everything needed to launch native + * execution: leaf input RDDs (with broadcast partition counts aligned to the first + * non-broadcast plan), per-partition planning data, subqueries, encryption options, and + * shuffle-scan indices. See [[NativeExecContext]] for the full set of fields. + * + * Used by [[doExecuteColumnar]] (CometExecRDD path) and by the native-shuffle path + * (CometShuffleExchangeExec) so both observe the same input alignment. + */ + private[comet] def buildNativeContext(): NativeExecContext = { + // Go over all the native scans, in order to see if they need encryption options. + // For each relation in a CometNativeScan generate a hadoopConf, + // for each file path in a relation associate with hadoopConf + // This is done per native plan, so only count scans until a comet input is reached. + val encryptionOptions = + mutable.ArrayBuffer.empty[(Broadcast[SerializableConfiguration], Seq[String])] + foreachUntilCometInput(this) { + case scan: CometNativeScanExec => + // This creates a hadoopConf that brings in any SQLConf "spark.hadoop.*" configs and + // per-relation configs since different tables might have different decryption + // properties. + val hadoopConf = scan.relation.sparkSession.sessionState + .newHadoopConfWithOptions(scan.relation.options) + val encryptionEnabled = CometParquetUtils.encryptionEnabled(hadoopConf) + if (encryptionEnabled) { + // hadoopConf isn't serializable, so we have to do a broadcasted config. + val broadcastedConf = + scan.relation.sparkSession.sparkContext + .broadcast(new SerializableConfiguration(hadoopConf)) + + val optsTuple: (Broadcast[SerializableConfiguration], Seq[String]) = + (broadcastedConf, scan.relation.inputFiles.toSeq) + encryptionOptions += optsTuple + } + case _ => // no-op + } + assert( + encryptionOptions.size <= 1, + "We expect one native scan that requires encryption reading in a Comet plan," + + " since we will broadcast one hadoopConf.") + // If this assumption changes in the future, you can look at the commit history of #2447 + // to see how there used to be a map of relations to broadcasted confs in case multiple + // relations in a single plan. The example that came up was UNION. See discussion at: + // https://github.com/apache/datafusion-comet/pull/2447#discussion_r2406118264 + val (broadcastedHadoopConfForEncryption, encryptedFilePaths) = + encryptionOptions.headOption match { + case Some((conf, paths)) => (Some(conf), paths) + case None => (None, Seq.empty) + } + + // Find planning data within this stage (stops at shuffle boundaries). + val (commonByKey, perPartitionByKey) = findAllPlanData(this) + + // Collect the input ColumnarBatches from the child operators and create a CometExecIterator + // to execute the native plan. + val sparkPlans = ArrayBuffer.empty[SparkPlan] + val inputs = ArrayBuffer.empty[RDD[ColumnarBatch]] + + foreachUntilCometInput(this)(sparkPlans += _) + + // Find the first non broadcast plan + val firstNonBroadcastPlan = sparkPlans.zipWithIndex.find { + case (_: CometBroadcastExchangeExec, _) => false + case (BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _), _) => false + case (BroadcastQueryStageExec(_, _: ReusedExchangeExec, _), _) => false + case (ReusedExchangeExec(_, _: CometBroadcastExchangeExec), _) => false + case _ => true + } + + val containsBroadcastInput = sparkPlans.exists { + case _: CometBroadcastExchangeExec => true + case BroadcastQueryStageExec(_, _: CometBroadcastExchangeExec, _) => true + case BroadcastQueryStageExec(_, _: ReusedExchangeExec, _) => true + case ReusedExchangeExec(_, _: CometBroadcastExchangeExec) => true + case _ => false + } + + // If the first non broadcast plan is not found, it means all the plans are broadcast plans. + // This is not expected, so throw an exception. + if (containsBroadcastInput && firstNonBroadcastPlan.isEmpty) { + throw new CometRuntimeException(s"Cannot find the first non broadcast plan: $this") + } + + // If the first non broadcast plan is found, we need to adjust the partition number of + // the broadcast plans to make sure they have the same partition number as the first non + // broadcast plan. + val (firstNonBroadcastPlanRDD, firstNonBroadcastPlanNumPartitions) = + firstNonBroadcastPlan.get._1 match { + case plan: CometNativeExec => + (null, plan.outputPartitioning.numPartitions) + case plan => + val rdd = plan.executeColumnar() + (rdd, rdd.getNumPartitions) + } + + // Spark doesn't need to zip Broadcast RDDs, so it doesn't schedule Broadcast RDDs with + // same partition number. But for Comet, we need to zip them so we need to adjust the + // partition number of Broadcast RDDs to make sure they have the same partition number. + sparkPlans.zipWithIndex.foreach { case (plan, idx) => + plan match { + case c: CometBroadcastExchangeExec => + inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) + case BroadcastQueryStageExec(_, c: CometBroadcastExchangeExec, _) => + inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) + case ReusedExchangeExec(_, c: CometBroadcastExchangeExec) => + inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) + case BroadcastQueryStageExec( + _, + ReusedExchangeExec(_, c: CometBroadcastExchangeExec), + _) => + inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) + case _: CometNativeExec => + // no-op + case _ if idx == firstNonBroadcastPlan.get._2 => + inputs += firstNonBroadcastPlanRDD + case _ => + val rdd = plan.executeColumnar() + if (rdd.getNumPartitions != firstNonBroadcastPlanNumPartitions) { + throw new CometRuntimeException( + s"Partition number mismatch: ${rdd.getNumPartitions} != " + + s"$firstNonBroadcastPlanNumPartitions") + } else { + inputs += rdd + } + } + } + + if (inputs.isEmpty && !sparkPlans.forall(_.isInstanceOf[CometNativeExec])) { + throw new CometRuntimeException(s"No input for CometNativeExec:\n $this") + } + + NativeExecContext( + inputs = inputs.toSeq, + numPartitions = firstNonBroadcastPlanNumPartitions, + subqueries = collectSubqueries(this), + broadcastedHadoopConfForEncryption = broadcastedHadoopConfForEncryption, + encryptedFilePaths = encryptedFilePaths, + commonByKey = commonByKey, + perPartitionByKey = perPartitionByKey, + shuffleScanIndices = findShuffleScanIndices(nativeOp), + hasScanInput = sparkPlans.exists(_.isInstanceOf[CometNativeScanExec])) + } + /** * Traverse the tree of Comet physical operators until reaching the input sources operators and * apply the given function to each operator. @@ -623,11 +662,10 @@ abstract class CometNativeExec extends CometExec { } /** - * Walk the serialized protobuf plan depth-first to find which input indices correspond to + * Walk the protobuf operator tree depth-first to find which input indices correspond to * ShuffleScan vs Scan leaf nodes. Each Scan or ShuffleScan leaf consumes one input in order. */ - private def findShuffleScanIndices(planBytes: Array[Byte]): Set[Int] = { - val plan = OperatorOuterClass.Operator.parseFrom(planBytes) + private def findShuffleScanIndices(plan: OperatorOuterClass.Operator): Set[Int] = { var scanIndex = 0 val indices = mutable.Set.empty[Int] def walk(op: OperatorOuterClass.Operator): Unit = { From 8c99fc5a9b4f5301994787debedd5c9c6a77e50f Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Thu, 28 May 2026 20:29:02 -0400 Subject: [PATCH 02/13] Passes CometFuzzTestSuite, CometNativeShuffleSuite, CometExecSuite. --- .../shuffle/CometNativeShuffleInputRDD.scala | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala new file mode 100644 index 0000000000..29b9745366 --- /dev/null +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet.execution.shuffle + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.vectorized.ColumnarBatch + +import org.apache.comet.CometShuffleBlockIterator + +/** + * Thin RDD that anchors Spark scheduling for the native-shuffle path. Native execution itself is + * driven by [[CometNativeShuffleWriter]] using the unified `ShuffleWriter(child = childNativeOp)` + * plan. + * + * The RDD's role here is twofold: + * - declare `OneToOneDependency` on each leaf input RDD so the DAGScheduler walks the lineage + * and triggers prior stages, broadcast materialization, etc. + * - construct the per-partition leaf iterators (and shuffle-block iterators where applicable) + * in `compute`, packaged into a [[CometNativeShuffleInputIterator]] that the writer downcasts + * to extract the inputs it needs to feed the native plan. + * + * The iterator returned by `compute` always reports `hasNext = false`. Spark's `ShuffleMapTask` + * will hand it to `writer.write`; the writer ignores it as an iterator and reads its exposed + * fields directly. + */ +private[shuffle] class CometNativeShuffleInputRDD( + sc: SparkContext, + var inputRDDs: Seq[RDD[ColumnarBatch]], + numPartitionsParam: Int, + shuffleScanIndices: Set[Int]) + extends RDD[Product2[Int, ColumnarBatch]]( + sc, + inputRDDs.map(rdd => new OneToOneDependency(rdd))) { + + override protected def getPartitions: Array[Partition] = + (0 until numPartitionsParam).map { i => + // Resolve leaf-RDD partitions on the driver here (where their @transient fields are still + // populated). Stashing them on the partition lets `compute` avoid touching + // `leafRdd.partitions` on the executor, which would otherwise trigger getPartitions and + // hit the @transient-null trap (e.g. CometExecRDD.perPartitionByKey). + val inputParts = inputRDDs.map(_.partitions(i)).toArray + new CometNativeShuffleInputPartition(i, inputParts) + }.toArray + + override def compute( + split: Partition, + context: TaskContext): Iterator[Product2[Int, ColumnarBatch]] = { + val partition = split.asInstanceOf[CometNativeShuffleInputPartition] + val leafIterators = inputRDDs.zip(partition.inputPartitions).map { case (rdd, part) => + rdd.iterator(part, context) + } + val shuffleBlockIters: Map[Int, CometShuffleBlockIterator] = + shuffleScanIndices.flatMap { si => + inputRDDs(si) match { + case rdd: CometShuffledBatchRDD => + Some(si -> rdd.computeAsShuffleBlockIterator(partition.inputPartitions(si), context)) + case _ => None + } + }.toMap + new CometNativeShuffleInputIterator(partition.index, leafIterators, shuffleBlockIters) + } + + override def getPreferredLocations(split: Partition): Seq[String] = { + if (inputRDDs == null || inputRDDs.isEmpty) return Nil + val partition = split.asInstanceOf[CometNativeShuffleInputPartition] + val prefs = inputRDDs.zip(partition.inputPartitions).map { case (rdd, part) => + rdd.preferredLocations(part) + } + val intersection = prefs.reduce((a, b) => a.intersect(b)) + if (intersection.nonEmpty) intersection else prefs.flatten.distinct + } + + override def clearDependencies(): Unit = { + super.clearDependencies() + inputRDDs = null + } +} + +private[shuffle] class CometNativeShuffleInputPartition( + override val index: Int, + val inputPartitions: Array[Partition]) + extends Partition + +/** + * Iterator handed to [[CometNativeShuffleWriter.write]] via Spark's ShuffleMapTask. Reports no + * elements; the writer downcasts and reads `partitionIndex`, `leafIterators`, and + * `shuffleBlockIterators` directly to drive the unified native plan. + */ +private[shuffle] class CometNativeShuffleInputIterator( + val partitionIndex: Int, + val leafIterators: Seq[Iterator[ColumnarBatch]], + val shuffleBlockIterators: Map[Int, CometShuffleBlockIterator]) + extends Iterator[Product2[Int, ColumnarBatch]] { + + override def hasNext: Boolean = false + + override def next(): Product2[Int, ColumnarBatch] = + throw new NoSuchElementException( + "CometNativeShuffleInputIterator does not produce elements; CometNativeShuffleWriter " + + "drives native execution via the iterator's exposed fields.") +} From 443a1c7866ff4dd80664425fecd06f4cbfa211e3 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Thu, 28 May 2026 20:49:43 -0400 Subject: [PATCH 03/13] Cleanup, update docs. --- .../contributor-guide/native_shuffle.md | 38 ++++-- .../shuffle/CometNativeShuffleInputRDD.scala | 23 +--- .../shuffle/CometNativeShuffleWriter.scala | 72 +++++----- .../shuffle/CometShuffleDependency.scala | 22 ++- .../shuffle/CometShuffleExchangeExec.scala | 62 ++++----- .../shuffle/CometShuffleManager.scala | 4 +- .../apache/spark/sql/comet/operators.scala | 128 ++++++++---------- 7 files changed, 160 insertions(+), 189 deletions(-) diff --git a/docs/source/contributor-guide/native_shuffle.md b/docs/source/contributor-guide/native_shuffle.md index 18e80a90c8..c46b8b45a8 100644 --- a/docs/source/contributor-guide/native_shuffle.md +++ b/docs/source/contributor-guide/native_shuffle.md @@ -69,8 +69,9 @@ Native shuffle (`CometExchange`) is selected when all of the following condition ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ │ CometNativeShuffleWriter │ -│ - Constructs protobuf operator plan │ -│ - Invokes native execution via CometExec.getCometIterator() │ +│ - Builds protobuf operator plan: ShuffleWriter(child = childNativeOp) │ +│ - Reads per-partition leaf iterators from CometNativeShuffleInputIterator │ +│ - Drives one CometExecIterator per partition │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ (JNI) @@ -103,13 +104,14 @@ Native shuffle (`CometExchange`) is selected when all of the following condition ### Scala Side -| Class | Location | Description | -| ------------------------------ | ------------------------------------------------ | --------------------------------------------------------------------------------------------- | -| `CometShuffleExchangeExec` | `.../shuffle/CometShuffleExchangeExec.scala` | Physical plan node. Validates types and partitioning, creates `CometShuffleDependency`. | -| `CometNativeShuffleWriter` | `.../shuffle/CometNativeShuffleWriter.scala` | Implements `ShuffleWriter`. Builds protobuf plan and invokes native execution. | -| `CometShuffleDependency` | `.../shuffle/CometShuffleDependency.scala` | Extends `ShuffleDependency`. Holds shuffle type, schema, and range partition bounds. | -| `CometBlockStoreShuffleReader` | `.../shuffle/CometBlockStoreShuffleReader.scala` | Reads shuffle blocks via `ShuffleBlockFetcherIterator`. Decodes Arrow IPC to `ColumnarBatch`. | -| `NativeBatchDecoderIterator` | `.../shuffle/NativeBatchDecoderIterator.scala` | Reads compressed Arrow IPC from input stream. Calls native decode via JNI. | +| Class | Location | Description | +| ------------------------------ | ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CometShuffleExchangeExec` | `.../shuffle/CometShuffleExchangeExec.scala` | Physical plan node. Validates types and partitioning, creates `CometShuffleDependency`. | +| `CometNativeShuffleWriter` | `.../shuffle/CometNativeShuffleWriter.scala` | Implements `ShuffleWriter`. Builds the unified `ShuffleWriter(child = childNativeOp)` plan and runs it in one `CometExecIterator` per partition. | +| `CometShuffleDependency` | `.../shuffle/CometShuffleDependency.scala` | Extends `ShuffleDependency`. Holds shuffle type, schema, range partition bounds, and (native shuffle only) a `NativeShuffleSpec`. | +| `CometNativeShuffleInputRDD` | `.../shuffle/CometNativeShuffleInputRDD.scala` | Thin scheduling-anchor RDD on the native-shuffle path. `compute` returns a `CometNativeShuffleInputIterator` carrying per-partition leaf iterators. | +| `CometBlockStoreShuffleReader` | `.../shuffle/CometBlockStoreShuffleReader.scala` | Reads shuffle blocks via `ShuffleBlockFetcherIterator`. Decodes Arrow IPC to `ColumnarBatch`. | +| `NativeBatchDecoderIterator` | `.../shuffle/NativeBatchDecoderIterator.scala` | Reads compressed Arrow IPC from input stream. Calls native decode via JNI. | ### Rust Side @@ -123,11 +125,19 @@ Native shuffle (`CometExchange`) is selected when all of the following condition ### Write Path -1. **Plan construction**: `CometNativeShuffleWriter` builds a protobuf operator plan containing: - - A scan operator reading from the input iterator - - A `ShuffleWriter` operator with partitioning config and compression codec - -2. **Native execution**: `CometExec.getCometIterator()` executes the plan in Rust. +1. **Plan construction**: `CometNativeShuffleWriter` builds a protobuf operator tree with a + `ShuffleWriter` operator at the root and `childNativeOp` as its child. `childNativeOp` takes + one of two shapes: + - The child plan's `nativeOp` directly, when `CometShuffleExchangeExec`'s child is a + `CometNativeExec` subtree. The upstream operators run inside the same `CometExecIterator` + as the writer, with no JVM-to-native batch boundary between them. + - A synthetic `Scan("ShuffleWriterInput")` placeholder, when the dep was built via the + convenience `prepareShuffleDependency(rdd, ...)` overload (used by + `CometCollectLimitExec` and `CometTakeOrderedAndProjectExec`, or when the + exchange's child is a non-native `CometPlan` such as `CometSparkToColumnarExec`). Native + code reads `ColumnarBatch`es from the JVM input iterator via Arrow C Stream Interface. + +2. **Native execution**: A single `CometExecIterator` per partition runs the unified plan. 3. **Partitioning**: `ShuffleWriterExec` receives batches and routes to the appropriate partitioner: - `MultiPartitionShuffleRepartitioner`: For hash/range/round-robin partitioning diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala index 29b9745366..811c5bd3be 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleInputRDD.scala @@ -26,20 +26,11 @@ import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.comet.CometShuffleBlockIterator /** - * Thin RDD that anchors Spark scheduling for the native-shuffle path. Native execution itself is - * driven by [[CometNativeShuffleWriter]] using the unified `ShuffleWriter(child = childNativeOp)` - * plan. - * - * The RDD's role here is twofold: - * - declare `OneToOneDependency` on each leaf input RDD so the DAGScheduler walks the lineage - * and triggers prior stages, broadcast materialization, etc. - * - construct the per-partition leaf iterators (and shuffle-block iterators where applicable) - * in `compute`, packaged into a [[CometNativeShuffleInputIterator]] that the writer downcasts - * to extract the inputs it needs to feed the native plan. - * - * The iterator returned by `compute` always reports `hasNext = false`. Spark's `ShuffleMapTask` - * will hand it to `writer.write`; the writer ignores it as an iterator and reads its exposed - * fields directly. + * Thin scheduling-anchor RDD for the native-shuffle path. Declares `OneToOneDependency` on each + * leaf input RDD (so the DAGScheduler triggers prior stages, broadcasts, etc.) and constructs + * per-partition leaf iterators in `compute`, packaged into a [[CometNativeShuffleInputIterator]]. + * The iterator reports `hasNext = false`; [[CometNativeShuffleWriter]] downcasts it and reads the + * leaf iterators directly to drive the unified `ShuffleWriter(child = childNativeOp)` plan. */ private[shuffle] class CometNativeShuffleInputRDD( sc: SparkContext, @@ -114,6 +105,6 @@ private[shuffle] class CometNativeShuffleInputIterator( override def next(): Product2[Int, ColumnarBatch] = throw new NoSuchElementException( - "CometNativeShuffleInputIterator does not produce elements; CometNativeShuffleWriter " + - "drives native execution via the iterator's exposed fields.") + "CometNativeShuffleInputIterator should never be drained as an iterator. Reaching this " + + "code means a non-Comet ShuffleWriter is consuming the input, which is a bug.") } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index 68078e8507..21395c58fd 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -32,7 +32,7 @@ import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriteMetricsR import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning, SinglePartition} -import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, NativeExecContext, PlanDataInjector} +import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, PlanDataInjector} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.comet.{CometConf, CometExecIterator} @@ -40,26 +40,16 @@ import org.apache.comet.serde.{OperatorOuterClass, PartitioningOuterClass, Query import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, Operator} /** - * A [[ShuffleWriter]] that drives the native shuffle write in a single [[CometExecIterator]] per - * partition. The unified plan it executes has [[OperatorOuterClass.ShuffleWriter]] at the root - * with `childNativeOp` as its only child. Leaf input iterators come from - * [[CometNativeShuffleInputIterator]] (constructed by [[CometNativeShuffleInputRDD.compute]]). - * - * Two flavors of `childNativeOp` are in use: - * - rich Comet native subtree (e.g. HashAgg / Filter / ShuffleScan), supplied by - * [[CometShuffleExchangeExec]] when its child is a - * [[org.apache.spark.sql.comet.CometNativeExec]]. - * - synthetic `Scan("ShuffleWriterInput")` placeholder, supplied by the convenience overload of - * [[CometShuffleExchangeExec.prepareShuffleDependency]] for callers that already hold an - * `RDD[ColumnarBatch]` of native-driven batches (e.g. - * [[org.apache.spark.sql.comet.CometCollectLimitExec]]). - * - * The writer treats both shapes identically. + * Drives the native shuffle write in a single [[CometExecIterator]] per partition. The plan is + * `ShuffleWriter(child = childNativeOp)`; leaf iterators come from a + * [[CometNativeShuffleInputIterator]]. `childNativeOp` is either a rich Comet native subtree + * (when fed by [[CometShuffleExchangeExec]] with a [[org.apache.spark.sql.comet.CometNativeExec]] + * child) or a synthetic `Scan("ShuffleWriterInput")` placeholder (the + * [[CometShuffleExchangeExec.prepareShuffleDependency]] convenience overload). Same handling + * either way. */ class CometNativeShuffleWriter[K, V]( - childNativeOp: Operator, - childMetricNode: CometMetricNode, - nativeContext: NativeExecContext, + spec: NativeShuffleSpec, outputPartitioning: Partitioning, outputAttributes: Seq[Attribute], metrics: Map[String, SQLMetric], @@ -87,19 +77,28 @@ class CometNativeShuffleWriter[K, V]( val tempDataFilePath = Paths.get(tempDataFilename) val tempIndexFilePath = Paths.get(tempIndexFilename) - // Pull the per-partition leaf iterators and partition index from the iterator handed to us - // by Spark's ShuffleMapTask. CometNativeShuffleInputRDD.compute always returns this exact - // iterator type; no other RDD layers between produce a Product2[Int, ColumnarBatch]. - val shuffleInputIter = inputs.asInstanceOf[CometNativeShuffleInputIterator] + // The dep's _rdd is always a CometNativeShuffleInputRDD on this path. Pattern-match instead + // of asInstanceOf so a future RDD-layering change produces a clear error here rather than a + // bare ClassCastException deeper in the stack. + val shuffleInputIter = inputs match { + case it: CometNativeShuffleInputIterator => it + case other => + throw new IllegalStateException( + "CometNativeShuffleWriter expects its input iterator to be a " + + "CometNativeShuffleInputIterator (produced by CometNativeShuffleInputRDD), got " + + s"${other.getClass.getName}") + } val partitionIdx = shuffleInputIter.partitionIndex val leafIterators = shuffleInputIter.leafIterators val shuffleBlockIters = shuffleInputIter.shuffleBlockIterators val unifiedPlan = buildUnifiedPlan(tempDataFilename, tempIndexFilename) - val finalNativePlan = if (nativeContext.commonByKey.nonEmpty) { - val partitionDataByKey = - nativeContext.perPartitionByKey.map { case (k, arr) => k -> arr(partitionIdx) } - PlanDataInjector.injectPlanData(unifiedPlan, nativeContext.commonByKey, partitionDataByKey) + val ctx = spec.execContext + val finalNativePlan = if (ctx.commonByKey.nonEmpty) { + val partitionDataByKey = ctx.perPartitionByKey.map { case (k, arr) => + k -> arr(partitionIdx) + } + PlanDataInjector.injectPlanData(unifiedPlan, ctx.commonByKey, partitionDataByKey) } else { unifiedPlan } @@ -119,30 +118,29 @@ class CometNativeShuffleWriter[K, V]( "write_time" -> metricsWriteTime) ++ metrics.filterKeys(detailedMetrics.contains) - // ShuffleWriter metrics live at the root of the metric tree; the child operator's metric - // tree (rich subtree or empty leaf for the Scan placeholder) is attached underneath so the - // SQL UI sees the same per-node breakdown the split-driver flow produced. - val nativeMetrics = CometMetricNode(shuffleWriterSQLMetrics, Seq(childMetricNode)) + // ShuffleWriter metrics at the root; child's metric tree underneath so the SQL UI's per-node + // breakdown matches what the split-driver flow showed. + val nativeMetrics = CometMetricNode(shuffleWriterSQLMetrics, Seq(spec.childMetricNode)) val cometIter = new CometExecIterator( CometExec.newIterId, leafIterators, outputAttributes.length, - PlanDataInjector.serializeOperator(finalNativePlan), + CometExec.serializeNativePlan(finalNativePlan), nativeMetrics, numParts, partitionIdx, - nativeContext.broadcastedHadoopConfForEncryption, - nativeContext.encryptedFilePaths, + ctx.broadcastedHadoopConfForEncryption, + ctx.encryptedFilePaths, shuffleBlockIters) // Register subqueries against the iterator id so native callbacks resolve them to values. - nativeContext.subqueries.foreach { sub => + ctx.subqueries.foreach { sub => CometScalarSubquery.setSubquery(cometIter.id, sub) } Option(context).foreach { taskCtx => taskCtx.addTaskCompletionListener[Unit] { _ => - nativeContext.subqueries.foreach { sub => + ctx.subqueries.foreach { sub => CometScalarSubquery.removeSubquery(cometIter.id, sub) } } @@ -334,7 +332,7 @@ class CometNativeShuffleWriter[K, V]( OperatorOuterClass.Operator .newBuilder() .setShuffleWriter(shuffleWriterBuilder) - .addChildren(childNativeOp) + .addChildren(spec.childNativeOp) .build() } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala index 9ec25c49ec..2a05843007 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleDependency.scala @@ -34,13 +34,23 @@ import org.apache.spark.sql.types.StructType import org.apache.comet.serde.OperatorOuterClass +/** + * Bundle of context the native shuffle write path needs at task time. Co-populated for native + * shuffles only; consolidated into a single field on [[CometShuffleDependency]] so it cannot be + * partially set. + */ +case class NativeShuffleSpec( + childNativeOp: OperatorOuterClass.Operator, + childMetricNode: CometMetricNode, + execContext: NativeExecContext) + /** * A [[ShuffleDependency]] that allows us to identify the shuffle dependency as a Comet shuffle. * - * On the native-shuffle path, also carries the child plan's per-partition execution context, root - * native operator, and metric node so [[CometNativeShuffleWriter]] can drive the unified - * `ShuffleWriter(child = childNativeOp)` plan in a single [[org.apache.comet.CometExecIterator]] - * per partition. These three fields are populated only when `shuffleType == CometNativeShuffle`. + * On the native-shuffle path, also carries a [[NativeShuffleSpec]] so + * [[CometNativeShuffleWriter]] can drive the unified `ShuffleWriter(child = childNativeOp)` plan + * in a single [[org.apache.comet.CometExecIterator]] per partition. `nativeShuffleSpec` is + * populated only when `shuffleType == CometNativeShuffle`. */ class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag]( @transient private val _rdd: RDD[_ <: Product2[K, V]], @@ -58,9 +68,7 @@ class CometShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag]( val shuffleWriteMetrics: Map[String, SQLMetric] = Map.empty, val numParts: Int = 0, val rangePartitionBounds: Option[Seq[InternalRow]] = None, - val nativeExecContext: Option[NativeExecContext] = None, - val childNativeOp: Option[OperatorOuterClass.Operator] = None, - val childMetricNode: Option[CometMetricNode] = None) + val nativeShuffleSpec: Option[NativeShuffleSpec] = None) extends ShuffleDependency[K, V, C]( _rdd, partitioner, diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala index 16bd40d402..1470d637d9 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala @@ -102,14 +102,11 @@ case class CometShuffleExchangeExec( new UnsafeRowSerializer(child.output.size, longMetric("dataSize")) /** - * Per-partition execution context for the child native subtree, computed once and shared - * between [[inputRDD]] (which uses it to wire DAGScheduler dependencies) and - * [[shuffleDependency]] (which threads it through to [[CometNativeShuffleWriter]] for - * single-iterator native execution). Only populated when `shuffleType == CometNativeShuffle` - * AND the child is a [[CometNativeExec]] subtree we can inline. When the child is a non-native - * Comet plan (e.g. [[org.apache.spark.sql.comet.CometSparkToColumnarExec]]), this stays `None` - * and the shuffle falls back to the legacy `Scan("ShuffleWriterInput") -> ShuffleWriter` plan - * via the convenience overload of `prepareShuffleDependency`. + * Single-driver native-shuffle context, computed once and shared between [[inputRDD]] and + * [[shuffleDependency]]. `Some` only when `shuffleType == CometNativeShuffle` AND the child is + * a [[CometNativeExec]] subtree. Otherwise the dep is built via the + * [[CometShuffleExchangeExec.prepareShuffleDependency]] convenience overload (synthetic Scan + * placeholder). */ @transient private lazy val nativeChildContext: Option[NativeExecContext] = child match { case nativeChild: CometNativeExec if shuffleType == CometNativeShuffle => @@ -120,23 +117,19 @@ case class CometShuffleExchangeExec( @transient lazy val inputRDD: RDD[_] = if (shuffleType == CometNativeShuffle) { nativeChildContext match { case Some(ctx) => - // Single-driver path: thin scheduling anchor; CometNativeShuffleWriter drives the - // unified ShuffleWriter + child plan in a single CometExecIterator per partition. new CometNativeShuffleInputRDD( sparkContext, ctx.inputs, ctx.numPartitions, ctx.shuffleScanIndices) case None => - // Child is a Comet plan but not a CometNativeExec subtree (e.g. CometSparkToColumnarExec). - // No native subtree to inline; the writer's plan is `Scan("ShuffleWriterInput") -> - // ShuffleWriter` and JVM batches flow into native through Arrow C Stream Interface. + // Non-native child (e.g. CometSparkToColumnarExec): no subtree to inline. The dep gets + // built via the legacy convenience overload below; we just need a real RDD of batches. child.executeColumnar() } } else if (shuffleType == CometColumnarShuffle) { - // CometColumnarShuffle uses Spark's row-based execute() API. For Spark row-based plans, - // rows flow directly. For Comet native plans, their doExecute() wraps with ColumnarToRowExec - // to convert columnar batches to rows. + // Row-based shuffle. CometNativeExec.doExecute wraps columnar output with + // ColumnarToRowExec; non-Comet children flow through directly. child.execute() } else { throw new UnsupportedOperationException( @@ -180,12 +173,11 @@ case class CometShuffleExchangeExec( if (shuffleType == CometNativeShuffle) { val dep = nativeChildContext match { case Some(ctx) => - // Single-driver path: child is a CometNativeExec subtree. RangePartitioning needs real - // rows to compute partition bounds; use a regular columnar execution of the child for - // sampling only. The actual shuffle still goes through the single-iterator path. val nativeChild = child.asInstanceOf[CometNativeExec] + // RangePartitioner needs real rows for sampling. Reuse the precomputed context so we + // don't re-walk the SparkPlan tree or re-broadcast the encryption Hadoop conf. val samplingRDD: Option[RDD[ColumnarBatch]] = outputPartitioning match { - case _: RangePartitioning => Some(child.executeColumnar()) + case _: RangePartitioning => Some(nativeChild.executeColumnarWithContext(ctx)) case _ => None } CometShuffleExchangeExec.prepareNativeShuffleDependency( @@ -195,12 +187,11 @@ case class CometShuffleExchangeExec( outputPartitioning, serializer, metrics, - ctx, - nativeChild.nativeOp, - CometMetricNode.fromCometPlan(nativeChild)) + NativeShuffleSpec( + nativeChild.nativeOp, + CometMetricNode.fromCometPlan(nativeChild), + ctx)) case None => - // Child is a non-native Comet plan; the writer falls back to its Scan-placeholder - // path via the convenience overload of prepareShuffleDependency. CometShuffleExchangeExec.prepareShuffleDependency( inputRDD.asInstanceOf[RDD[ColumnarBatch]], child.output, @@ -733,18 +724,16 @@ object CometShuffleExchangeExec outputPartitioning, serializer, metrics, - ctx, - scanOp, - CometMetricNode(Map.empty)) + NativeShuffleSpec(scanOp, CometMetricNode(Map.empty), ctx)) } /** * Build a Comet native shuffle dependency for the [[CometShuffleExchangeExec]] case where the * shuffle is fed by a [[CometNativeExec]] child. The writer drives the unified * `ShuffleWriter(child = childNativeOp)` plan in a single - * [[org.apache.comet.CometExecIterator]] per partition. The returned dep carries the child's - * per-partition execution context, root native operator, and metric node so - * [[CometNativeShuffleWriter]] can reach them at task time. + * [[org.apache.comet.CometExecIterator]] per partition. The returned dep carries the + * [[NativeShuffleSpec]] so [[CometNativeShuffleWriter]] can reach the child's per-partition + * execution context, root native operator, and metric node at task time. * * @param thinRDD * scheduling-anchor RDD whose `compute` returns a [[CometNativeShuffleInputIterator]]; @@ -760,9 +749,7 @@ object CometShuffleExchangeExec outputPartitioning: Partitioning, serializer: Serializer, metrics: Map[String, SQLMetric], - nativeExecContext: NativeExecContext, - childNativeOp: OperatorOuterClass.Operator, - childMetricNode: CometMetricNode): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { + spec: NativeShuffleSpec): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { val numParts = thinRDD.getNumPartitions // The code block below is mostly brought over from @@ -821,7 +808,7 @@ object CometShuffleExchangeExec None) } - val dependency = new CometShuffleDependency[Int, ColumnarBatch, ColumnarBatch]( + new CometShuffleDependency[Int, ColumnarBatch, ColumnarBatch]( thinRDD, serializer = serializer, shuffleWriterProcessor = ShuffleExchangeExec.createShuffleWriteProcessor(metrics), @@ -833,10 +820,7 @@ object CometShuffleExchangeExec shuffleWriteMetrics = metrics, numParts = numParts, rangePartitionBounds = rangePartitionBounds, - nativeExecContext = Some(nativeExecContext), - childNativeOp = Some(childNativeOp), - childMetricNode = Some(childMetricNode)) - dependency + nativeShuffleSpec = Some(spec)) } /** diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala index 16a21bbfd9..bd69e91898 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleManager.scala @@ -231,9 +231,7 @@ class CometShuffleManager(conf: SparkConf) extends ShuffleManager with Logging { case cometShuffleHandle: CometNativeShuffleHandle[K @unchecked, V @unchecked] => val dep = cometShuffleHandle.dependency.asInstanceOf[CometShuffleDependency[_, _, _]] new CometNativeShuffleWriter( - dep.childNativeOp.get, - dep.childMetricNode.get, - dep.nativeExecContext.get, + dep.nativeShuffleSpec.get, dep.outputPartitioning.get, dep.outputAttributes, dep.shuffleWriteMetrics, diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 748362cd65..fa7aa6f1b2 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -383,15 +383,11 @@ object CometExec { /** * Per-partition execution context for a native subtree rooted at a [[CometNativeExec]] boundary. - * - * Built once on the driver from the SparkPlan tree and consumed by either: - * - [[CometNativeExec.doExecuteColumnar]] to construct a [[CometExecRDD]], or - * - the native-shuffle path, where [[CometNativeShuffleWriter]] drives the same child plan with - * a `ShuffleWriter` operator as its root. - * - * The fields capture everything that depends on tree-walking the SparkPlan and aligning leaf - * input RDDs (broadcast partition counts, plan-data, subqueries, encryption options) so callers - * do not have to re-derive them. + * Built once on the driver from the SparkPlan tree, then consumed by either + * [[CometNativeExec.executeColumnarWithContext]] (to build a [[CometExecRDD]]) or the + * native-shuffle path (to drive [[CometNativeShuffleWriter]]). Captures broadcast partition + * alignment, plan-data, subqueries, and encryption options so each consumer doesn't re-walk the + * tree. */ private[comet] case class NativeExecContext( inputs: Seq[RDD[ColumnarBatch]], @@ -402,7 +398,15 @@ private[comet] case class NativeExecContext( commonByKey: Map[String, Array[Byte]], perPartitionByKey: Map[String, Array[Array[Byte]]], shuffleScanIndices: Set[Int], - hasScanInput: Boolean) + hasScanInput: Boolean) { + // Catch shape divergence (e.g. broadcast scans with different partition counts after DPP + // filtering) at construction so consumers don't trip ArrayIndexOutOfBoundsException at + // partition idx access time. + require( + perPartitionByKey.values.forall(_.length == numPartitions), + s"All per-partition arrays must have length $numPartitions, but found: " + + perPartitionByKey.map { case (key, arr) => s"$key -> ${arr.length}" }.mkString(", ")) +} /** * A Comet native physical operator. @@ -442,90 +446,68 @@ abstract class CometNativeExec extends CometExec { runningSubqueries.clear() } - override def doExecuteColumnar(): RDD[ColumnarBatch] = { - serializedPlanOpt.plan match { - case None => - // This is in the middle of a native execution, it should not be executed directly. - throw new CometRuntimeException( - s"CometNativeExec should not be executed directly without a serialized plan: $this") - case Some(serializedPlan) => - // TODO: support native metrics for all operators. - val nativeMetrics = CometMetricNode.fromCometPlan(this) - val ctx = buildNativeContext() - - new CometExecRDD( - sparkContext, - ctx.inputs, - ctx.commonByKey, - ctx.perPartitionByKey, - serializedPlan, - ctx.numPartitions, - output.length, - nativeMetrics, - ctx.subqueries, - ctx.broadcastedHadoopConfForEncryption, - ctx.encryptedFilePaths, - ctx.shuffleScanIndices) { - override def compute( - split: Partition, - context: TaskContext): Iterator[ColumnarBatch] = { - val res = super.compute(split, context) - - // Report scan input metrics only when the native plan contains a scan. - if (ctx.hasScanInput) { - Option(context).foreach(nativeMetrics.reportScanInputMetrics) - } + override def doExecuteColumnar(): RDD[ColumnarBatch] = + executeColumnarWithContext(buildNativeContext()) - res - } + /** + * Build a [[CometExecRDD]] from a precomputed [[NativeExecContext]]. Public so the native + * shuffle path can sample (RangePartitioning) without re-walking the SparkPlan tree and + * re-broadcasting the encryption Hadoop conf. + */ + private[comet] def executeColumnarWithContext(ctx: NativeExecContext): RDD[ColumnarBatch] = { + val serializedPlan = serializedPlanOpt.plan.getOrElse( + throw new CometRuntimeException( + s"CometNativeExec should not be executed directly without a serialized plan: $this")) + val nativeMetrics = CometMetricNode.fromCometPlan(this) + + new CometExecRDD( + sparkContext, + ctx.inputs, + ctx.commonByKey, + ctx.perPartitionByKey, + serializedPlan, + ctx.numPartitions, + output.length, + nativeMetrics, + ctx.subqueries, + ctx.broadcastedHadoopConfForEncryption, + ctx.encryptedFilePaths, + ctx.shuffleScanIndices) { + override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = { + val res = super.compute(split, context) + if (ctx.hasScanInput) { + Option(context).foreach(nativeMetrics.reportScanInputMetrics) } + res + } } } /** * Walk this CometNativeExec subtree once and gather everything needed to launch native - * execution: leaf input RDDs (with broadcast partition counts aligned to the first - * non-broadcast plan), per-partition planning data, subqueries, encryption options, and - * shuffle-scan indices. See [[NativeExecContext]] for the full set of fields. - * - * Used by [[doExecuteColumnar]] (CometExecRDD path) and by the native-shuffle path - * (CometShuffleExchangeExec) so both observe the same input alignment. + * execution. See [[NativeExecContext]] for the field set. */ private[comet] def buildNativeContext(): NativeExecContext = { - // Go over all the native scans, in order to see if they need encryption options. - // For each relation in a CometNativeScan generate a hadoopConf, - // for each file path in a relation associate with hadoopConf - // This is done per native plan, so only count scans until a comet input is reached. + // Find native scans that need encryption: build a hadoopConf per relation, broadcast it once + // so executors can decrypt on read. Capped at one because we only broadcast one conf per + // CometExecIterator (see #2447 for history of the per-relation map approach). val encryptionOptions = mutable.ArrayBuffer.empty[(Broadcast[SerializableConfiguration], Seq[String])] foreachUntilCometInput(this) { case scan: CometNativeScanExec => - // This creates a hadoopConf that brings in any SQLConf "spark.hadoop.*" configs and - // per-relation configs since different tables might have different decryption - // properties. val hadoopConf = scan.relation.sparkSession.sessionState .newHadoopConfWithOptions(scan.relation.options) - val encryptionEnabled = CometParquetUtils.encryptionEnabled(hadoopConf) - if (encryptionEnabled) { - // hadoopConf isn't serializable, so we have to do a broadcasted config. - val broadcastedConf = - scan.relation.sparkSession.sparkContext - .broadcast(new SerializableConfiguration(hadoopConf)) - - val optsTuple: (Broadcast[SerializableConfiguration], Seq[String]) = - (broadcastedConf, scan.relation.inputFiles.toSeq) - encryptionOptions += optsTuple + if (CometParquetUtils.encryptionEnabled(hadoopConf)) { + val broadcastedConf = scan.relation.sparkSession.sparkContext + .broadcast(new SerializableConfiguration(hadoopConf)) + encryptionOptions += ((broadcastedConf, scan.relation.inputFiles.toSeq)) } - case _ => // no-op + case _ => } assert( encryptionOptions.size <= 1, "We expect one native scan that requires encryption reading in a Comet plan," + " since we will broadcast one hadoopConf.") - // If this assumption changes in the future, you can look at the commit history of #2447 - // to see how there used to be a map of relations to broadcasted confs in case multiple - // relations in a single plan. The example that came up was UNION. See discussion at: - // https://github.com/apache/datafusion-comet/pull/2447#discussion_r2406118264 val (broadcastedHadoopConfForEncryption, encryptedFilePaths) = encryptionOptions.headOption match { case Some((conf, paths)) => (Some(conf), paths) From cc7c5bedf9665f6671038cdeac273c09ca5c7fa9 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 10:09:16 -0400 Subject: [PATCH 04/13] handle arrow type mismatches on child stream in native shuffle. --- native/core/src/execution/operators/mod.rs | 2 + .../src/execution/operators/schema_align.rs | 266 ++++++++++++++++++ native/core/src/execution/planner.rs | 27 +- native/proto/src/proto/operator.proto | 6 + .../shuffle/CometNativeShuffleWriter.scala | 9 + 5 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 native/core/src/execution/operators/schema_align.rs diff --git a/native/core/src/execution/operators/mod.rs b/native/core/src/execution/operators/mod.rs index 4b2c06575d..b679d35b19 100644 --- a/native/core/src/execution/operators/mod.rs +++ b/native/core/src/execution/operators/mod.rs @@ -32,6 +32,8 @@ pub use parquet_writer::ParquetWriterExec; mod csv_scan; pub mod projection; mod scan; +mod schema_align; mod shuffle_scan; pub use csv_scan::init_csv_datasource_exec; +pub use schema_align::SchemaAlignExec; pub use shuffle_scan::ShuffleScanExec; diff --git a/native/core/src/execution/operators/schema_align.rs b/native/core/src/execution/operators/schema_align.rs new file mode 100644 index 0000000000..3d207e0202 --- /dev/null +++ b/native/core/src/execution/operators/schema_align.rs @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! `SchemaAlignExec` reshapes its child's output so the per-column Arrow type and field-level +//! nullability match what Spark catalyst declared. Used between an inlined native subtree and +//! `ShuffleWriterExec` when the FFI deep-copy + `ScanExec` cast in `build_record_batch` are both +//! gone, so DataFusion / `datafusion-spark` return-type drift would otherwise be written into +//! shuffle blocks. See for the running +//! list of mismatched functions. + +use arrow::array::{ArrayRef, RecordBatch, RecordBatchOptions}; +use arrow::compute::{cast_with_options, CastOptions}; +use arrow::datatypes::{Field, Schema, SchemaRef}; +use datafusion::common::DataFusionError; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::{ + execution::TaskContext, + physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + RecordBatchStream, SendableRecordBatchStream, + }, +}; +use futures::{Stream, StreamExt}; +use std::{ + any::Any, + collections::HashSet, + pin::Pin, + sync::{Arc, Mutex, OnceLock}, + task::{Context, Poll}, +}; + +/// Process-wide set of `(column, actual, expected)` signatures we have already warned about. +/// Each schema drift produces the same warning on every partition of every query that runs +/// the offending expression; deduping here keeps logs readable while still surfacing each +/// distinct mismatch once. +fn warn_dedup() -> &'static Mutex> { + static SET: OnceLock>> = OnceLock::new(); + SET.get_or_init(|| Mutex::new(HashSet::new())) +} + +#[derive(Debug)] +pub struct SchemaAlignExec { + child: Arc, + target_schema: SchemaRef, + column_actions: Arc>, + cache: Arc, +} + +#[derive(Debug, Clone)] +enum ColumnAction { + /// Pass the input column through unchanged. Any nullability/metadata difference is + /// absorbed when the batch is re-stamped via `RecordBatch::try_new_with_options`. + Passthrough, + /// Cast the input column to the target data_type. + Cast, +} + +impl SchemaAlignExec { + /// Build a SchemaAlignExec that aligns `child`'s output to `expected`. Returns + /// `Ok(child)` unchanged when no per-column reshape is needed; otherwise wraps `child` + /// in a SchemaAlignExec whose target schema preserves `expected`'s data_type and metadata + /// but widens nullability to `actual.nullable || expected.nullable` (matching the + /// reconciliation rule used at the FFI boundary on `main`). + pub fn try_new_or_passthrough( + child: Arc, + expected: &SchemaRef, + ) -> Result, DataFusionError> { + let actual = child.schema(); + if actual.fields().len() != expected.fields().len() { + return Err(DataFusionError::Plan(format!( + "SchemaAlignExec: expected {} fields, child produces {}", + expected.fields().len(), + actual.fields().len() + ))); + } + let mut needs_alignment = false; + let mut actions = Vec::with_capacity(actual.fields().len()); + let mut target_fields = Vec::with_capacity(actual.fields().len()); + for (idx, (actual_field, expected_field)) in actual + .fields() + .iter() + .zip(expected.fields().iter()) + .enumerate() + { + let action = if actual_field.data_type() == expected_field.data_type() { + ColumnAction::Passthrough + } else { + let signature = format!( + "{}|{:?}|{:?}", + expected_field.name(), + actual_field.data_type(), + expected_field.data_type() + ); + if warn_dedup().lock().unwrap().insert(signature) { + log::warn!( + "ShuffleWriter input schema mismatch on col[{idx}] '{}': child produced \ + {:?}, catalyst declared {:?}. Inserting a cast; please file the upstream \ + function bug at https://github.com/apache/datafusion-comet/issues/4515.", + expected_field.name(), + actual_field.data_type(), + expected_field.data_type() + ); + } + ColumnAction::Cast + }; + let target_nullable = actual_field.is_nullable() || expected_field.is_nullable(); + let field_changed = !matches!(action, ColumnAction::Passthrough) + || target_nullable != actual_field.is_nullable() + || expected_field.metadata() != actual_field.metadata() + || expected_field.name() != actual_field.name(); + if field_changed { + needs_alignment = true; + } + target_fields.push( + Field::new( + expected_field.name(), + expected_field.data_type().clone(), + target_nullable, + ) + .with_metadata(expected_field.metadata().clone()), + ); + actions.push(action); + } + if !needs_alignment { + return Ok(child); + } + let target_schema: SchemaRef = Arc::new(Schema::new(target_fields)); + let cache = Arc::new(PlanProperties::new( + EquivalenceProperties::new(Arc::clone(&target_schema)), + child.output_partitioning().clone(), + EmissionType::Incremental, + Boundedness::Bounded, + )); + Ok(Arc::new(Self { + child, + target_schema, + column_actions: Arc::new(actions), + cache, + })) + } +} + +impl DisplayAs for SchemaAlignExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "CometSchemaAlignExec") + } + DisplayFormatType::TreeRender => unimplemented!(), + } + } +} + +impl ExecutionPlan for SchemaAlignExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.target_schema) + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.child] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::common::Result> { + // Rebuild PlanProperties from the new child since `output_partitioning` may differ. + let new_child = Arc::clone(&children[0]); + let cache = Arc::new(PlanProperties::new( + EquivalenceProperties::new(Arc::clone(&self.target_schema)), + new_child.output_partitioning().clone(), + EmissionType::Incremental, + Boundedness::Bounded, + )); + Ok(Arc::new(Self { + child: new_child, + target_schema: Arc::clone(&self.target_schema), + column_actions: Arc::clone(&self.column_actions), + cache, + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion::common::Result { + let child_stream = self.child.execute(partition, context)?; + Ok(Box::pin(SchemaAlignStream { + child_stream, + target_schema: Arc::clone(&self.target_schema), + column_actions: Arc::clone(&self.column_actions), + })) + } + + fn properties(&self) -> &Arc { + &self.cache + } + + fn name(&self) -> &str { + "CometSchemaAlignExec" + } +} + +struct SchemaAlignStream { + child_stream: SendableRecordBatchStream, + target_schema: SchemaRef, + column_actions: Arc>, +} + +impl SchemaAlignStream { + fn align(&self, batch: RecordBatch) -> Result { + let mut columns: Vec = Vec::with_capacity(batch.num_columns()); + for (idx, action) in self.column_actions.iter().enumerate() { + let column = batch.column(idx); + let aligned = match action { + ColumnAction::Passthrough => Arc::clone(column), + ColumnAction::Cast => cast_with_options( + column, + self.target_schema.field(idx).data_type(), + &CastOptions::default(), + )?, + }; + columns.push(aligned); + } + let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); + RecordBatch::try_new_with_options(Arc::clone(&self.target_schema), columns, &options) + .map_err(DataFusionError::from) + } +} + +impl Stream for SchemaAlignStream { + type Item = datafusion::common::Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.child_stream.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => Poll::Ready(Some(self.align(batch))), + other => other, + } + } +} + +impl RecordBatchStream for SchemaAlignStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.target_schema) + } +} diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index c6160bddd4..dff8275f84 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -26,7 +26,9 @@ use crate::execution::operators::IcebergScanExec; use crate::execution::{ expressions::list_positions::ListPositionsExpr, expressions::subquery::Subquery, - operators::{ExecutionError, ExpandExec, ParquetWriterExec, ScanExec, ShuffleScanExec}, + operators::{ + ExecutionError, ExpandExec, ParquetWriterExec, ScanExec, SchemaAlignExec, ShuffleScanExec, + }, planner::expression_registry::ExpressionRegistry, planner::operator_registry::OperatorRegistry, serde::to_arrow_datatype, @@ -1515,9 +1517,14 @@ impl PhysicalPlanner { let (scans, shuffle_scans, child) = self.create_plan(&children[0], inputs, partition_count)?; + let writer_input = align_shuffle_writer_input( + Arc::clone(&child.native_plan), + &writer.expected_output_schema, + )?; + let partitioning = self.create_partitioning( writer.partitioning.as_ref().unwrap(), - child.native_plan.schema(), + writer_input.schema(), )?; let codec = match writer.codec.try_into() { @@ -1535,7 +1542,7 @@ impl PhysicalPlanner { let write_buffer_size = writer.write_buffer_size as usize; let shuffle_writer = Arc::new(ShuffleWriterExec::try_new( - Arc::clone(&child.native_plan), + writer_input, partitioning, codec, writer.output_data_file.clone(), @@ -3124,6 +3131,20 @@ fn convert_spark_types_to_arrow_schema( arrow_schema } +/// Wrap `child` in a `SchemaAlignExec` when its output drifts from what Spark catalyst +/// declared. See . +fn align_shuffle_writer_input( + child: Arc, + expected_proto: &[spark_operator::SparkStructField], +) -> Result, ExecutionError> { + if expected_proto.is_empty() { + return Ok(child); + } + let expected = convert_spark_types_to_arrow_schema(expected_proto); + SchemaAlignExec::try_new_or_passthrough(child, &expected) + .map_err(|e| ExecutionError::DataFusionError(e.to_string())) +} + /// Converts a protobuf PartitionValue to an iceberg Literal. /// fn partition_value_to_literal( diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 7f50aa928c..da498087b9 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -327,6 +327,12 @@ message ShuffleWriter { // Size of the write buffer in bytes used when writing shuffle data to disk. // Larger values may improve write performance but use more memory. int32 write_buffer_size = 8; + // Spark-declared output schema of the writer's child. When the child is an inlined native + // subtree, the native planner casts the child's actual output to this schema before + // serializing to shuffle blocks, since there is no FFI boundary or ScanExec between them + // to absorb DataFusion-vs-Spark type drift. Empty when the child is a placeholder Scan; + // that path already has a cast point upstream. + repeated SparkStructField expected_output_schema = 9; } message ParquetWriter { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala index 21395c58fd..9fe70f16b5 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometNativeShuffleWriter.scala @@ -34,10 +34,12 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Literal import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning, SinglePartition} import org.apache.spark.sql.comet.{CometExec, CometMetricNode, CometScalarSubquery, PlanDataInjector} import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.types.StructField import org.apache.comet.{CometConf, CometExecIterator} import org.apache.comet.serde.{OperatorOuterClass, PartitioningOuterClass, QueryPlanSerde} import org.apache.comet.serde.OperatorOuterClass.{CompressionCodec, Operator} +import org.apache.comet.serde.operator.schema2Proto /** * Drives the native shuffle write in a single [[CometExecIterator]] per partition. The plan is @@ -329,6 +331,13 @@ class CometNativeShuffleWriter[K, V]( shuffleWriterBuilder.setTracingEnabled(CometConf.COMET_TRACING_ENABLED.get()) + // Used by the native planner to cast the inlined child's output when DataFusion's + // declared return type drifts from Spark catalyst (see comet#4515). + val expectedFields = outputAttributes + .map(a => StructField(a.name, a.dataType, a.nullable, a.metadata)) + .toArray + schema2Proto(expectedFields).foreach(shuffleWriterBuilder.addExpectedOutputSchema) + OperatorOuterClass.Operator .newBuilder() .setShuffleWriter(shuffleWriterBuilder) From c76a26399357576a3f699dde6eb767c58ea7018f Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 14:08:04 -0400 Subject: [PATCH 05/13] stash --- native/core/src/execution/planner.rs | 30 ++++- native/proto/src/proto/operator.proto | 6 + .../org/apache/comet/CometBatchIterator.java | 16 +++ .../org/apache/comet/CometExecIterator.scala | 21 ++++ .../apache/comet/rules/CometExecRule.scala | 24 ++++ .../apache/comet/rules/CometScanRule.scala | 11 ++ .../comet/serde/operator/CometSink.scala | 74 ++++++++++++ .../org/apache/comet/vector/NativeUtil.scala | 22 ++++ .../shuffle/CometShuffleExchangeExec.scala | 33 ++++++ .../shuffle/CometShuffledRowRDD.scala | 29 ++++- .../apache/spark/sql/comet/operators.scala | 48 +++++++- .../comet/exec/CometAggregateSuite.scala | 93 +++++++++++++++ .../apache/comet/exec/CometExecSuite.scala | 111 ++++++++++++++++++ 13 files changed, 515 insertions(+), 3 deletions(-) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index dff8275f84..bf90eb20cc 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1034,6 +1034,14 @@ impl PhysicalPlanner { )) } OpStruct::HashAgg(agg) => { + // [#4515 instrumentation] Every HashAgg construction with proto sizes. + log::warn!( + "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} result_exprs.len={} mode={}", + agg.grouping_exprs.len(), + agg.agg_exprs.len(), + agg.result_exprs.len(), + agg.mode + ); assert_eq!(children.len(), 1); let (scans, shuffle_scans, child) = self.create_plan(&children[0], inputs, partition_count)?; @@ -1173,7 +1181,17 @@ impl PhysicalPlanner { }) .collect(); - if agg.result_exprs.is_empty() { + if !agg.apply_result_projection { + // [#4515 instrumentation] Confirm whether a native HashAggregate is being + // built with empty result_exprs (catalyst-pruned EXISTS / count(*) subquery) + // and what its natural schema would be. + log::warn!( + "[#4515] HashAgg apply_result_projection=false: emitting aggregate natural schema={:?} grouping_exprs.len={} agg_exprs.len={} result_exprs.len={}", + aggregate.schema(), + agg.grouping_exprs.len(), + agg.agg_exprs.len(), + agg.result_exprs.len() + ); Ok(( scans, shuffle_scans, @@ -1443,6 +1461,16 @@ impl PhysicalPlanner { OpStruct::Scan(scan) => { let data_types = scan.fields.iter().map(to_arrow_datatype).collect_vec(); + // [#4515 instrumentation] Log every JVM-bridge Scan's declared column count. + // A 0-column scan paired with a JVM iterator producing batches with columns is + // the AIOOBE-on-exportBatch shape; a non-empty list confirms the inverse. + log::info!( + "[#4515] ScanExec source='{}' declared {} cols: {:?}", + scan.source, + data_types.len(), + data_types + ); + // If it is not test execution context for unit test, we should have at least one // input source if self.exec_context_id != TEST_EXEC_CONTEXT_ID && inputs.is_empty() { diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index da498087b9..921e5bbf35 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -303,6 +303,12 @@ message HashAggregate { // Offset in the child's output where aggregate buffer attributes start. // Used by PartialMerge to locate state fields in the input. int32 initial_input_buffer_offset = 7; + // Whether the native planner should wrap the aggregate in a ProjectionExec using + // `result_exprs`. Disambiguates `result_exprs` empty-because-absent (no projection + // needed) from empty-because-catalyst-pruned-to-zero-cols (project to nothing). + // Without this bit, both wire as `repeated` length 0 and were collapsed to "no + // projection," leaking grouping keys when Spark intent was 0 columns. + bool apply_result_projection = 8; } message Limit { diff --git a/spark/src/main/java/org/apache/comet/CometBatchIterator.java b/spark/src/main/java/org/apache/comet/CometBatchIterator.java index 9b48a47c57..1392f028c0 100644 --- a/spark/src/main/java/org/apache/comet/CometBatchIterator.java +++ b/spark/src/main/java/org/apache/comet/CometBatchIterator.java @@ -38,6 +38,8 @@ public class CometBatchIterator { private final NativeUtil nativeUtil; private ColumnarBatch previousBatch = null; private ColumnarBatch currentBatch = null; + // [#4515 instrumentation] gate first-batch log per instance + private boolean loggedFirstBatch = false; CometBatchIterator(Iterator input, NativeUtil nativeUtil) { this.input = input; @@ -79,6 +81,20 @@ public int next(long[] arrayAddrs, long[] schemaAddrs) { return -1; } + // [#4515 instrumentation] Log first-batch shape per CometBatchIterator instance. + if (!loggedFirstBatch) { + loggedFirstBatch = true; + org.slf4j.LoggerFactory.getLogger("[#4515]") + .warn( + "CometBatchIterator.next first batch: numCols={} numRows={} arrayAddrs.length={} schemaAddrs.length={} inputCls={} this={}", + currentBatch.numCols(), + currentBatch.numRows(), + arrayAddrs.length, + schemaAddrs.length, + input.getClass().getName(), + System.identityHashCode(this)); + } + // export the batch using the Arrow C Data Interface int numRows = nativeUtil.exportBatch(arrayAddrs, schemaAddrs, currentBatch); diff --git a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala index 6140eca553..98a9bc0684 100644 --- a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala +++ b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala @@ -92,6 +92,27 @@ class CometExecIterator( val conf = SparkEnv.get.conf val localDiskDirs = SparkEnv.get.blockManager.getLocalDiskDirs + // [#4515 instrumentation] Dump proto plan + per-input iterator class so we can correlate + // a 0-col Scan in the proto with the JVM iterator that will feed it. + { + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + val parsed = + scala.util + .Try( + org.apache.comet.serde.OperatorOuterClass.Operator + .parseFrom(protobufQueryPlan) + .toString) + .toOption + .getOrElse("") + val itersDesc = inputIterators.zipWithIndex + .map { case (it, idx) => s" inputIterators[$idx] cls=${it.getClass.getName}" } + .mkString("\n") + log.warn( + s"CometExecIterator constructing plan id=$id partition=$partitionIndex " + + s"numParts=$numParts numOutputCols=$numOutputCols\n$itersDesc\n" + + s" proto plan:\n$parsed") + } + // serialize Comet related Spark configs in protobuf format val protobufSparkConfigs = CometExecIterator.serializeCometSQLConfs() diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index aeb7db40ad..6eb314bbb6 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -166,6 +166,16 @@ case class CometExecRule(session: SparkSession) val reverted = s.originalPlan.withNewChildren(Seq(s.child)).asInstanceOf[ShuffleExchangeExec] reverted.setTagValue(CometExecRule.SKIP_COMET_SHUFFLE_TAG, ()) + // [#4515 instrumentation] Log every revert: this is a primary place where + // vanilla ShuffleExchangeExec instances enter the plan post-Comet-rewrite. + org.slf4j.LoggerFactory + .getLogger("[#4515]") + .warn( + s"revertRedundantColumnarShuffle revert: parentAgg=${op.getClass.getSimpleName} " + + s"(output=${op.output}) cometShuffle.output=${s.output} " + + s"reverted.output=${reverted.output} reverted.identityHash=${System + .identityHashCode(reverted)}\n" + + s" reverted tree:\n${reverted.treeString(verbose = true, addSuffix = false)}") logInfo( "Reverting Comet columnar shuffle to Spark shuffle between " + s"${op.getClass.getSimpleName} and ${s.child.getClass.getSimpleName} " + @@ -546,6 +556,20 @@ case class CometExecRule(session: SparkSession) |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")} |""".stripMargin) } + // [#4515 instrumentation] Dump the plan in/out of CometExecRule to correlate with the + // 0-col Scan synthesized later. Logs the operator-class diff so we can see what + // CometExecRule did or didn't replace, especially around ShuffleExchangeExec wrappers + // inside subqueries. + if (!newPlan.fastEquals(plan)) { + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + log.warn( + "CometExecRule rewrote plan:\n IN classes: " + + plan.collect { case p => p.getClass.getSimpleName }.mkString(",") + + "\n OUT classes: " + + newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") + + s"\n IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" + + s"\n OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}") + } newPlan } diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 7601fa1c6b..1f7f509db0 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -68,6 +68,17 @@ case class CometScanRule(session: SparkSession) |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")} |""".stripMargin) } + // [#4515 instrumentation] + if (!newPlan.fastEquals(plan)) { + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + log.warn( + "CometScanRule rewrote plan:\n IN classes: " + + plan.collect { case p => p.getClass.getSimpleName }.mkString(",") + + "\n OUT classes: " + + newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") + + s"\n IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" + + s"\n OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}") + } newPlan } diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala index 845803d133..8a1c576a2d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala +++ b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala @@ -49,6 +49,12 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] { op: T, builder: Operator.Builder, childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = { + // [#4515 instrumentation] + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + log.warn( + s"CometSink[${this.getClass.getSimpleName}].convert op=${op.getClass.getName} " + + s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " + + s"output.size=${op.output.size}") val supportedTypes = op.output.forall(a => supportedDataType(a.dataType, allowComplex = true)) @@ -72,6 +78,53 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] { } if (scanTypes.length == op.output.length) { + // [#4515 instrumentation] Log when we synthesize a Scan with zero declared columns. + // The runtime JVM iterator may still produce columns (subquery output shrunk by + // catalyst before serialization while the underlying RDD reflects the pre-shrink shape), + // tripping the column-count guard in NativeUtil.exportBatch. + if (scanTypes.isEmpty) { + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + // scalastyle:off line.size.limit + val childInfo = op.children.zipWithIndex + .map { case (c, i) => + val canonOut = scala.util + .Try(c.canonicalized.output) + .toOption + .map(_.toString) + .getOrElse("") + s" child[$i] cls=${c.getClass.getName} simpleString='${c.simpleString( + 80)}' output=${c.output} outputSize=${c.output.size} identityHash=${System + .identityHashCode(c)} canonicalized.output=$canonOut" + } + .mkString("\n") + val opCanonOut = scala.util + .Try(op.canonicalized.output) + .toOption + .map(_.toString) + .getOrElse("") + val subqueryInfo = scala.util + .Try(op.subqueries.map(s => s"${s.getClass.getName}(output=${s.output}, prepared=?)")) + .toOption + .getOrElse(Nil) + .mkString("[", ", ", "]") + val callerStack = + new RuntimeException("[#4515] CometSink 0-col Scan caller").getStackTrace + .take(20) + .map(f => s" at ${f}") + .mkString("\n") + log.warn(s"CometSink synthesizing 0-col Scan for op=${op.getClass.getName}\n" + + s" simpleString='${op.simpleStringWithNodeId()}'\n" + + s" op.output=${op.output} op.outputSet=${op.outputSet} op.references=${op.references}\n" + + s" op.canonicalized.output=$opCanonOut\n" + + s" op.subqueries=$subqueryInfo\n" + + s" op identityHash=${System.identityHashCode(op)}\n" + + s" children classes=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}\n" + + childInfo + "\n" + + s" caller stack:\n$callerStack\n" + + s" op tree:\n${op.treeString(verbose = true, addSuffix = false)}") + // scalastyle:on line.size.limit + } + scanBuilder.addAllFields(scanTypes.asJava) // Sink operators don't have children @@ -94,6 +147,27 @@ object CometExchangeSink extends CometSink[SparkPlan] { op: SparkPlan, builder: Operator.Builder, childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = { + // [#4515 instrumentation] + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + val isVanillaSparkExchange = + op.getClass.getName == "org.apache.spark.sql.execution.exchange.ShuffleExchangeExec" + log.warn( + s"CometExchangeSink.convert op=${op.getClass.getName} " + + s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " + + s"useShuffleScan=${shouldUseShuffleScan(op)} " + + s"children=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}") + if (isVanillaSparkExchange) { + val callerStack = + new RuntimeException("[#4515] vanilla ShuffleExchangeExec caller").getStackTrace + .take(20) + .map(f => s" at ${f}") + .mkString("\n") + log.warn( + " vanilla ShuffleExchangeExec being processed by CometExchangeSink:\n" + + s" output=${op.output}\n" + + s" caller stack:\n$callerStack\n" + + s" op tree:\n${op.treeString(verbose = true, addSuffix = false)}") + } if (shouldUseShuffleScan(op)) { convertToShuffleScan(op, builder) } else { diff --git a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala index 4f027cd9e7..388f5114e6 100644 --- a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala +++ b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala @@ -114,6 +114,28 @@ class NativeUtil { batch: ColumnarBatch): Int = { val numRows = mutable.ArrayBuffer.empty[Int] + if (arrayAddrs.length != batch.numCols() || schemaAddrs.length != batch.numCols()) { + val schemaSummary = (0 until batch.numCols()) + .map { i => + val v = batch.column(i) match { + case cv: CometVector => cv.getValueVector + case _ => null + } + if (v != null) s"col[$i]: ${v.getField}" + else s"col[$i]: ${batch.column(i).getClass.getName}" + } + .mkString("; ") + val taskAttempt = Option(org.apache.spark.TaskContext.get()) + .map(c => s"stage=${c.stageId} task=${c.taskAttemptId} partition=${c.partitionId}") + .getOrElse("no-task") + throw new SparkException( + "CometBatchIterator column-count mismatch [#4515 instrumentation]: " + + s"native expected arrayAddrs=${arrayAddrs.length}, schemaAddrs=${schemaAddrs.length}; " + + s"JVM iterator produced batch.numCols=${batch.numCols()} ($taskAttempt). " + + s"Batch schema: $schemaSummary", + new RuntimeException("placeholder for exportBatch column-count mismatch")) + } + (0 until batch.numCols()).foreach { index => batch.column(index) match { case a: CometVector => diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala index 1470d637d9..ba152c2f13 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala @@ -243,6 +243,23 @@ case class CometShuffleExchangeExec( * Comet returns RDD[ColumnarBatch] for columnar execution. */ protected override def doExecuteColumnar(): RDD[ColumnarBatch] = { + // [#4515 instrumentation] Track every doExecuteColumnar call: which CometShuffleExchange + // instance, its output, and the call site. Helps confirm whether this PR's changes + // cause an extra EnsureRequirements-inserted vanilla Exchange to wrap us, and what + // RDD is being plumbed where. + { + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + val callerStack = new RuntimeException( + "[#4515] CometShuffleExchangeExec.doExecuteColumnar caller").getStackTrace + .take(15) + .map(f => s" at ${f}") + .mkString("\n") + log.warn( + s"CometShuffleExchangeExec.doExecuteColumnar this=${System.identityHashCode(this)} " + + s"shuffleType=$shuffleType outputPartitioning=$outputPartitioning " + + s"output=$output\n" + + s" caller stack:\n$callerStack") + } // Returns the same CometShuffledBatchRDD if this plan is used by multiple plans. if (cachedShuffleRDD == null) { cachedShuffleRDD = new CometShuffledBatchRDD(shuffleDependency, readMetrics) @@ -687,6 +704,22 @@ object CometShuffleExchangeExec serializer: Serializer, metrics: Map[String, SQLMetric]): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { + // [#4515 instrumentation] Track every placeholder-Scan ShuffleDependency we build, with + // outputAttributes (drives Scan declared schema) and call site. + { + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + val callerStack = + new RuntimeException("[#4515] prepareShuffleDependency caller").getStackTrace + .take(15) + .map(f => s" at ${f}") + .mkString("\n") + log.warn( + s"prepareShuffleDependency outputAttributes=$outputAttributes " + + s"outputAttributes.size=${outputAttributes.size} " + + s"outputPartitioning=$outputPartitioning rdd.numPartitions=${rdd.getNumPartitions}\n" + + s" caller stack:\n$callerStack") + } + val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput") val scanTypes = outputAttributes.flatten { attr => QueryPlanSerde.serializeDataType(attr.dataType) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala index 7604910b06..e0e5c45b2a 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala @@ -162,7 +162,34 @@ class CometShuffledBatchRDD( override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = { val reader = createReader(split, context) // TODO: Reads IPC by native code - reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2) + val raw = reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2) + // [#4515 instrumentation] Peek the first decoded batch to confirm wire schema vs caller + // expectations. Wraps so we don't consume the iterator. + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + new Iterator[ColumnarBatch] { + private var logged = false + override def hasNext: Boolean = raw.hasNext + override def next(): ColumnarBatch = { + val b = raw.next() + if (!logged) { + logged = true + val schemaSummary = (0 until b.numCols()) + .map { i => + val v = b.column(i) match { + case cv: org.apache.comet.vector.CometVector => cv.getValueVector + case _ => null + } + if (v != null) s"col[$i]: ${v.getField}" + else s"col[$i]: ${b.column(i).getClass.getName}" + } + .mkString("; ") + log.warn(s"CometShuffledBatchRDD.compute first decoded batch: numCols=${b.numCols()} " + + s"numRows=${b.numRows()} stage=${context.stageId()} task=${context.taskAttemptId()} " + + s"partition=${split.index} schema=[$schemaSummary]") + } + b + } + } } override def clearDependencies(): Unit = { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index fa7aa6f1b2..310767489d 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -422,7 +422,15 @@ abstract class CometNativeExec extends CometExec { /** The Comet native operator */ def nativeOp: Operator - override protected def doPrepare(): Unit = prepareSubqueries(this) + override protected def doPrepare(): Unit = { + // [#4515 instrumentation] Track when subqueries are prepared for this CometNativeExec. + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + log.warn( + s"CometNativeExec.doPrepare this=${System.identityHashCode(this)} " + + s"cls=${this.getClass.getName} " + + s"originalPlan.cls=${originalPlan.getClass.getName}") + prepareSubqueries(this) + } override lazy val metrics: Map[String, SQLMetric] = CometMetricNode.baselineMetrics(sparkContext) @@ -563,6 +571,14 @@ abstract class CometNativeExec extends CometExec { // same partition number. But for Comet, we need to zip them so we need to adjust the // partition number of Broadcast RDDs to make sure they have the same partition number. sparkPlans.zipWithIndex.foreach { case (plan, idx) => + // [#4515 instrumentation] Log every JVM-side input plan we wire to native, so we can + // correlate the Scan's declared schema with the runtime plan whose RDD feeds it. + val log = org.slf4j.LoggerFactory.getLogger("[#4515]") + log.warn( + s"buildNativeContext binding input[$idx] cls=${plan.getClass.getName} " + + s"simpleString='${plan.simpleStringWithNodeId()}' output=${plan.output} " + + s"output.size=${plan.output.size} identityHash=${System.identityHashCode(plan)}\n" + + s" subtree:\n${plan.treeString(verbose = true, addSuffix = false)}") plan match { case c: CometBroadcastExchangeExec => inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) @@ -1516,6 +1532,25 @@ trait CometBaseAggregate { // If the aggregateExpressions is empty, we only want to build groupingExpressions, // and skip processing of aggregateExpressions. if (aggregateExpressions.isEmpty) { + // [#4515 instrumentation] Track HashAgg serializations with empty resultExpressions. + // Catalyst prunes resultExpressions for EXISTS / row-existence-only subqueries; the + // native side currently interprets empty result_exprs as "use aggregate natural + // schema", which leaks grouping keys into the output. + org.slf4j.LoggerFactory + .getLogger("[#4515]") + .warn( + "HashAgg empty-aggExprs branch: " + + s"groupingExprs=${groupingExpressions} " + + s"resultExpressions=${resultExpressions} " + + s"resultExpressions.size=${resultExpressions.size} " + + s"aggregate.output=${aggregate.output} " + + s"aggregate.output.size=${aggregate.output.size} " + + s"modes(from aggExprs)=${modes} " + + s"sparkFinalMode=$sparkFinalMode " + + s"requiredChildDistribution=${aggregate.requiredChildDistribution} " + + s"isProjectionToEmpty=${resultExpressions.isEmpty && aggregate.output.isEmpty} " + + s"naturalEqualsIntent=${resultExpressions.map(_.toAttribute) == groupingExpressions + .map(_.toAttribute)}") val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder() hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava) val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes @@ -1528,6 +1563,14 @@ trait CometBaseAggregate { return None } hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava) + // Force native to apply the projection when Spark's expected output (which may + // be empty for catalyst-pruned EXISTS / row-existence-only subqueries) differs + // from the aggregate's natural grouping output. Without this, an empty proto + // result_exprs is indistinguishable from "no projection needed" and the natural + // grouping keys leak through. + val naturalEqualsIntent = + resultExpressions.map(_.toAttribute) == groupingExpressions.map(_.toAttribute) + hashAggBuilder.setApplyResultProjection(!naturalEqualsIntent) Some(builder.setHashAgg(hashAggBuilder).build()) } else { // Validate mode combinations. We support: @@ -1608,6 +1651,9 @@ trait CometBaseAggregate { return None } hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava) + // Final aggs always need the projection (matches the existing condition under + // which we serialize result_exprs at all). See comet#4515. + hashAggBuilder.setApplyResultProjection(true) } hashAggBuilder.setModeValue(mode.getNumber) diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala index cd0beb56cc..f098040878 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala @@ -2109,4 +2109,97 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } + // Regression for comet#4515: a HashAggregateExec whose `resultExpressions` (and therefore + // `output`) catalyst has pruned to empty must still produce 0-col batches at runtime. + // Catalyst prunes `resultExpressions=[]` for plans where the aggregate's column values are + // unused downstream - classically EXISTS subqueries that get rewritten into a literal-`1` + // wrapper. Before the fix, the native HashAggregate emitted its natural output (the + // grouping keys) regardless of the pruned JVM `output`, so any boundary that derived a + // schema from `output` (e.g. a wrapping vanilla Spark `ShuffleExchangeExec.output = + // child.output = []`, or a JVM-bridge Scan synthesized from the same `output`) declared + // 0 columns while the runtime RDD produced 1. The mismatch tripped + // `NativeUtil.exportBatch` with an ArrayIndexOutOfBoundsException on a length-0 + // schemaAddrs[]. + // + // The bug needs a specific catalyst optimizer state: `HashAggregateExec. + // resultExpressions.isEmpty`. Whether the optimizer reaches that state from a given SQL + // depends on Spark version, scan source (LocalTableScan vs Parquet/native), AQE state, + // and which Comet rules already fired - we observed it in the SQL-tests harness running + // `subquery/exists-subquery/exists-orderby-limit.sql` on Spark 4.0.2 with parquet-backed + // temp views, but not via `Dataset.collect` over `LocalTableScan`-backed temp views + // under `CometTestBase`. So the test below writes parquet (matching the harness's scan + // shape), tries several known triggers, runs whichever (if any) produces the bug shape + // under `checkSparkAnswer`, and skips cleanly otherwise. The upstream SQL-tests run + // remains the primary safety net for the harness-only path. + test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output (#4515)") { + withTempDir { dir => + withTempView("emp", "dept") { + // Write parquet so Comet's native scan path (vs LocalTableScan) is the source - + // matches the SQL-tests harness setup that surfaced the bug. + val empPath = new java.io.File(dir, "emp").getAbsolutePath + val deptPath = new java.io.File(dir, "dept").getAbsolutePath + + spark + .sql("""SELECT * FROM VALUES + | (100, 'emp 1', 100.0D, 10), + | (200, 'emp 2', 200.0D, 10), + | (300, 'emp 3', 300.0D, 20), + | (400, 'emp 4', 400.0D, 30), + | (500, 'emp 5', 400.0D, NULL), + | (700, 'emp 7', 400.0D, 100), + | (800, 'emp 8', 150.0D, 70) + |AS t(id, emp_name, salary, dept_id)""".stripMargin) + .write + .parquet(empPath) + spark + .sql("""SELECT * FROM VALUES + | (10, 'CA'), (20, 'NY'), (30, 'TX'), + | (40, 'OR'), (50, 'NJ'), (70, 'FL') + |AS t(dept_id, state)""".stripMargin) + .write + .parquet(deptPath) + + spark.read.parquet(empPath).createOrReplaceTempView("emp") + spark.read.parquet(deptPath).createOrReplaceTempView("dept") + + val candidates = Seq( + // The original failing SQL from the harness - EXISTS with grouped agg + LIMIT/OFFSET. + """SELECT * FROM emp + |WHERE EXISTS ( + | SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin, + // Inline view + outer constant: ColumnPruning may strip the inner agg's output. + """SELECT 1 FROM ( + | SELECT max(dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin, + // Scalar subquery returning a constant. + """SELECT (SELECT 1 FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin, + // count(*) over a derived table: outer doesn't reference inner cols. + """SELECT count(*) FROM ( + | SELECT max(dept_id) AS m FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin) + + // Find a candidate whose plan has a HashAggregateExec (Spark or Comet) with empty + // resultExpressions. collectWithSubqueries traverses Subquery nodes too. + val triggering = candidates.find { sql => + val plan = spark.sql(sql).queryExecution.executedPlan + collectWithSubqueries(plan) { + case a: org.apache.spark.sql.execution.aggregate.HashAggregateExec + if a.resultExpressions.isEmpty => + a + case a: CometHashAggregateExec if a.resultExpressions.isEmpty => a + }.nonEmpty + } + + triggering match { + case Some(sql) => checkSparkAnswer(sql) + case None => + cancel( + "No candidate query produced a HashAggregateExec with empty resultExpressions " + + "in this environment. The catalyst-pruned shape that exercises #4515 only " + + "appears under specific optimizer/AQE state we couldn't reproduce here. The " + + "upstream SQL-tests run (subquery/exists-subquery/exists-orderby-limit.sql) " + + "covers this path.") + } + } + } + } + } diff --git a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala index 16601d056b..fded42d050 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala @@ -3959,6 +3959,117 @@ class CometExecSuite extends CometTestBase { } } + // Repro for the Spark 3.5 SQL-tests failure on subquery/exists-subquery/exists-orderby-limit.sql + // (query #19) on branch opt_native_shuffle. Crashes with NativeUtil.exportBatch:132 AIOOBE + // on this branch but passes on main. The SQL file declares three CONFIG_DIM1 combos for + // codegen; the failure trace doesn't say which one fires it, so sweep all three. + test("EXISTS subquery with GROUP BY + LIMIT + OFFSET") { + withTempView("emp", "dept") { + spark + .sql("""SELECT * FROM VALUES + | (100, 'emp 1', date '2005-01-01', 100.0D, 10), + | (200, 'emp 2', date '2003-01-01', 200.0D, 10), + | (300, 'emp 3', date '2002-01-01', 300.0D, 20), + | (400, 'emp 4', date '2005-01-01', 400.0D, 30), + | (500, 'emp 5', date '2001-01-01', 400.0D, NULL), + | (700, 'emp 7', date '2010-01-01', 400.0D, 100), + | (800, 'emp 8', date '2016-01-01', 150.0D, 70) + |AS t(id, emp_name, hiredate, salary, dept_id)""".stripMargin) + .createOrReplaceTempView("emp") + spark + .sql("""SELECT * FROM VALUES + | (10, 'dept 1', 'CA'), + | (20, 'dept 2', 'NY'), + | (30, 'dept 3', 'TX'), + | (40, 'dept 4 - unassigned', 'OR'), + | (50, 'dept 5 - unassigned', 'NJ'), + | (70, 'dept 7', 'FL') + |AS t(dept_id, dept_name, state)""".stripMargin) + .createOrReplaceTempView("dept") + + val configDims = Seq( + Map("spark.sql.codegen.wholeStage" -> "true"), + Map( + "spark.sql.codegen.wholeStage" -> "false", + "spark.sql.codegen.factoryMode" -> "CODEGEN_ONLY"), + Map( + "spark.sql.codegen.wholeStage" -> "false", + "spark.sql.codegen.factoryMode" -> "NO_CODEGEN")) + + // Mirror the SQL test order: queries #1 through #16 from the file run before #17 (the + // one whose CI failure we're chasing). Query #11's subquery is identical except for + // the OFFSET; #13 and #15 share similar shapes. Subquery materialization / AQE plan + // cache state from running them first may alter query #17's executed shape. + val priorQueries = Seq( + // #1 + """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""", + // #2 + """SELECT id, hiredate FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate DESC""", + // #3 + """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""", + // #4 + """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""", + // #5 + """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""", + // #6 + """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state)""", + // #7 + """SELECT count(*) FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept_id ORDER BY dept_id)""", + // #8 + """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""", + // #9 + """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""", + // #10 + """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1)""", + // #11 - same subquery as #17 minus the OFFSET + """SELECT * FROM emp WHERE EXISTS (SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1)""", + // #12 + """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 100 LIMIT 1)""", + // #13 + """SELECT * FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) FROM dept WHERE dept.dept_id > 100 GROUP BY state LIMIT 1)""", + // #14 + """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state LIMIT 2 OFFSET 1)""", + // #15 + """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1 OFFSET 2)""", + // #16 + """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > emp.dept_id LIMIT 1)""") + + for (dim <- configDims) { + // SQL-tests harness (dev/diffs/3.5.8.diff, 4.1.1.diff) sets only: + // spark.comet.enabled, spark.comet.exec.enabled, spark.comet.exec.shuffle.enabled, + // spark.comet.parquet.respectFilterPushdown, spark.shuffle.manager, + // spark.comet.memoryOverhead. + // It does NOT enable spark.comet.sparkToColumnar.enabled, but CometTestBase does. + // Force the harness shape so the reproducer matches the failing config. + val harnessConf = dim ++ Map(CometConf.COMET_SPARK_TO_ARROW_ENABLED.key -> "false") + withSQLConf(harnessConf.toSeq: _*) { + // scalastyle:off println + println(s"\n===== config dim: $harnessConf =====") + for ((q, idx) <- priorQueries.zipWithIndex) { + spark.sql(q).collect() + println(s"--- ran prior query #${idx + 1} ---") + } + val sql = """SELECT * + |FROM emp + |WHERE EXISTS (SELECT max(dept.dept_id) + | FROM dept + | GROUP BY state + | LIMIT 1 + | OFFSET 2)""".stripMargin + val df = spark.sql(sql) + println("--- query #17 initial executedPlan ---") + println(df.queryExecution.executedPlan) + val rows = df.collect() + println("--- query #17 final (post-AQE) executedPlan ---") + println(df.queryExecution.executedPlan) + println(s"--- ${rows.length} rows ---") + // scalastyle:on println + checkSparkAnswer(sql) + } + } + } + } + } case class BucketedTableTestSpec( From 1b55b97132f627b4e5257f44ca144f8d70c40494 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 14:21:14 -0400 Subject: [PATCH 06/13] refactor to handle on JVM side. --- native/core/src/execution/planner.rs | 63 ++-------- native/proto/src/proto/operator.proto | 14 +-- .../apache/spark/sql/comet/operators.scala | 113 ++++++++++++------ 3 files changed, 99 insertions(+), 91 deletions(-) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index bf90eb20cc..8abc19c1f5 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1036,10 +1036,9 @@ impl PhysicalPlanner { OpStruct::HashAgg(agg) => { // [#4515 instrumentation] Every HashAgg construction with proto sizes. log::warn!( - "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} result_exprs.len={} mode={}", + "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} mode={}", agg.grouping_exprs.len(), agg.agg_exprs.len(), - agg.result_exprs.len(), agg.mode ); assert_eq!(children.len(), 1); @@ -1171,55 +1170,19 @@ impl PhysicalPlanner { Arc::clone(&schema), )?, ); - let result_exprs: PhyExprResult = agg - .result_exprs - .iter() - .enumerate() - .map(|(idx, expr)| { - self.create_expr(expr, aggregate.schema()) - .map(|r| (r, format!("col_{idx}"))) - }) - .collect(); - if !agg.apply_result_projection { - // [#4515 instrumentation] Confirm whether a native HashAggregate is being - // built with empty result_exprs (catalyst-pruned EXISTS / count(*) subquery) - // and what its natural schema would be. - log::warn!( - "[#4515] HashAgg apply_result_projection=false: emitting aggregate natural schema={:?} grouping_exprs.len={} agg_exprs.len={} result_exprs.len={}", - aggregate.schema(), - agg.grouping_exprs.len(), - agg.agg_exprs.len(), - agg.result_exprs.len() - ); - Ok(( - scans, - shuffle_scans, - Arc::new(SparkPlan::new(spark_plan.plan_id, aggregate, vec![child])), - )) - } else { - // For final aggregation, DF's hash aggregate exec doesn't support Spark's - // aggregate result expressions like `COUNT(col) + 1`, but instead relying - // on additional `ProjectionExec` to handle the case. Therefore, here we'll - // add a projection node on top of the aggregate node. - // - // Note that `result_exprs` should only be set for final aggregation on the - // Spark side. - let projection = Arc::new(ProjectionExec::try_new( - result_exprs?, - Arc::clone(&aggregate), - )?); - Ok(( - scans, - shuffle_scans, - Arc::new(SparkPlan::new_with_additional( - spark_plan.plan_id, - projection, - vec![child], - vec![aggregate], - )), - )) - } + // The native HashAggregate emits its natural shape (group keys + agg + // results / state). Any post-aggregate projection Spark catalyst declares + // (`COUNT(col) + 1`, EXISTS-pruned-to-empty output, alias renames, etc.) is + // expressed as an explicit `OpStruct::Projection` op above the aggregate + // by the JVM serializer (see `CometBaseAggregate.doConvert`). Keeping that + // logic on the JVM side means only one place decides plan shape, and the + // native side stays a faithful executor. See comet#4515. + Ok(( + scans, + shuffle_scans, + Arc::new(SparkPlan::new(spark_plan.plan_id, aggregate, vec![child])), + )) } OpStruct::Limit(limit) => { assert_eq!(children.len(), 1); diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 921e5bbf35..aaff3cc4de 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -294,7 +294,13 @@ message Sort { message HashAggregate { repeated spark.spark_expression.Expr grouping_exprs = 1; repeated spark.spark_expression.AggExpr agg_exprs = 2; - repeated spark.spark_expression.Expr result_exprs = 3; + // Field 3 (`result_exprs`) and field 8 (`apply_result_projection`) were used to apply a + // post-aggregate projection inside the HashAggregate operator. The same effect is now + // expressed by emitting an explicit `Projection` op above the `HashAggregate` from the + // JVM serializer when needed (see `CometBaseAggregate.doConvert`). Reserved to avoid + // accidental reuse at incompatible semantics. + reserved 3, 8; + reserved "result_exprs", "apply_result_projection"; AggregateMode mode = 5; // Per-expression modes for mixed-mode aggregates (e.g., PartialMerge + Partial). // When set, each entry corresponds to agg_exprs at the same index. @@ -303,12 +309,6 @@ message HashAggregate { // Offset in the child's output where aggregate buffer attributes start. // Used by PartialMerge to locate state fields in the input. int32 initial_input_buffer_offset = 7; - // Whether the native planner should wrap the aggregate in a ProjectionExec using - // `result_exprs`. Disambiguates `result_exprs` empty-because-absent (no projection - // needed) from empty-because-catalyst-pruned-to-zero-cols (project to nothing). - // Without this bit, both wire as `repeated` length 0 and were collapsed to "no - // projection," leaking grouping keys when Spark intent was 0 columns. - bool apply_result_projection = 8; } message Limit { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 310767489d..3240adcc01 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -1553,25 +1553,48 @@ trait CometBaseAggregate { .map(_.toAttribute)}") val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder() hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava) - val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes - val resultExprs = resultExpressions.map(exprToProto(_, attributes)) - if (resultExprs.exists(_.isEmpty)) { - withInfo( - aggregate, - s"Unsupported result expressions found in: $resultExpressions", - resultExpressions: _*) - return None + // The native HashAggregate emits its natural shape (the grouping keys, since there + // are no aggregate functions). When Spark catalyst declares a different output - + // either column-renaming via aliases, or an entirely empty output for catalyst-pruned + // EXISTS / row-existence-only subqueries - we wrap the HashAggregate in an explicit + // Projection op so the native side reshapes accordingly. See comet#4515: an empty + // declared output paired with the natural grouping-key output crashed downstream + // boundaries that derived their schema from the declared output. + val naturalOutput = groupingExpressions.map(_.toAttribute) + val needsProjection = resultExpressions.map(_.toAttribute) != naturalOutput + if (needsProjection) { + val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes + val resultExprs = resultExpressions.map(exprToProto(_, attributes)) + if (resultExprs.exists(_.isEmpty)) { + withInfo( + aggregate, + s"Unsupported result expressions found in: $resultExpressions", + resultExpressions: _*) + return None + } + // Build the inner HashAgg op carrying the original child operators from `builder`. + // Use a fresh outer builder for the Projection so it gets a single child (the + // HashAgg op), not the original children appended on top. Both ops share the same + // plan_id so the inner aggregate's native metrics roll up under the same Spark + // operator in the metric tree (otherwise they'd orphan against plan_id=0). + val hashAggOp = OperatorOuterClass.Operator + .newBuilder() + .setPlanId(builder.getPlanId) + .addAllChildren(builder.getChildrenList) + .setHashAgg(hashAggBuilder) + .build() + val projectionBuilder = OperatorOuterClass.Projection.newBuilder() + projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava) + Some( + OperatorOuterClass.Operator + .newBuilder() + .setPlanId(builder.getPlanId) + .addChildren(hashAggOp) + .setProjection(projectionBuilder) + .build()) + } else { + Some(builder.setHashAgg(hashAggBuilder).build()) } - hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava) - // Force native to apply the projection when Spark's expected output (which may - // be empty for catalyst-pruned EXISTS / row-existence-only subqueries) differs - // from the aggregate's natural grouping output. Without this, an empty proto - // result_exprs is indistinguishable from "no projection needed" and the natural - // grouping keys leak through. - val naturalEqualsIntent = - resultExpressions.map(_.toAttribute) == groupingExpressions.map(_.toAttribute) - hashAggBuilder.setApplyResultProjection(!naturalEqualsIntent) - Some(builder.setHashAgg(hashAggBuilder).build()) } else { // Validate mode combinations. We support: // - All Partial @@ -1640,21 +1663,6 @@ trait CometBaseAggregate { val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder() hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava) hashAggBuilder.addAllAggExprs(aggExprs.map(_.get).asJava) - if (mode == CometAggregateMode.Final) { - val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes - val resultExprs = resultExpressions.map(exprToProto(_, attributes)) - if (resultExprs.exists(_.isEmpty)) { - withInfo( - aggregate, - s"Unsupported result expressions found in: $resultExpressions", - resultExpressions: _*) - return None - } - hashAggBuilder.addAllResultExprs(resultExprs.map(_.get).asJava) - // Final aggs always need the projection (matches the existing condition under - // which we serialize result_exprs at all). See comet#4515. - hashAggBuilder.setApplyResultProjection(true) - } hashAggBuilder.setModeValue(mode.getNumber) // Send per-expression modes and buffer offset for PartialMerge handling @@ -1673,7 +1681,44 @@ trait CometBaseAggregate { hashAggBuilder.setInitialInputBufferOffset(aggregate.initialInputBufferOffset) } - Some(builder.setHashAgg(hashAggBuilder).build()) + // Final aggregations may carry a result projection (e.g. `COUNT(col) + 1`) that + // catalyst encodes via `resultExpressions`. DataFusion's hash aggregate only emits + // its natural shape (group keys + agg results), so we wrap the HashAggregate in + // an explicit Projection op to apply Spark's result expressions. Partial / + // PartialMerge aggregates emit raw state buffers and never need the projection. + // See comet#4515. + if (mode == CometAggregateMode.Final) { + val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes + val resultExprs = resultExpressions.map(exprToProto(_, attributes)) + if (resultExprs.exists(_.isEmpty)) { + withInfo( + aggregate, + s"Unsupported result expressions found in: $resultExpressions", + resultExpressions: _*) + return None + } + // Inner HashAgg keeps the original input children from `builder`. Outer + // Projection uses a fresh builder so it has a single child (the HashAgg op). + // Both ops share the same plan_id so the aggregate's native metrics aggregate + // under the same Spark operator (else they'd orphan against plan_id=0). + val hashAggOp = OperatorOuterClass.Operator + .newBuilder() + .setPlanId(builder.getPlanId) + .addAllChildren(builder.getChildrenList) + .setHashAgg(hashAggBuilder) + .build() + val projectionBuilder = OperatorOuterClass.Projection.newBuilder() + projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava) + Some( + OperatorOuterClass.Operator + .newBuilder() + .setPlanId(builder.getPlanId) + .addChildren(hashAggOp) + .setProjection(projectionBuilder) + .build()) + } else { + Some(builder.setHashAgg(hashAggBuilder).build()) + } } else { val allChildren: Seq[Expression] = groupingExpressions ++ aggregateExpressions ++ aggregateAttributes From d736dd5b7194c4363fb8a62d17e37b56efb2d8a3 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 14:36:21 -0400 Subject: [PATCH 07/13] remove instrumentation. --- native/core/src/execution/planner.rs | 17 --- .../org/apache/comet/CometBatchIterator.java | 16 --- .../org/apache/comet/CometExecIterator.scala | 21 ---- .../apache/comet/rules/CometExecRule.scala | 24 ---- .../apache/comet/rules/CometScanRule.scala | 11 -- .../comet/serde/operator/CometSink.scala | 74 ------------ .../org/apache/comet/vector/NativeUtil.scala | 22 ---- .../shuffle/CometShuffleExchangeExec.scala | 33 ------ .../shuffle/CometShuffledRowRDD.scala | 29 +---- .../apache/spark/sql/comet/operators.scala | 37 +----- .../apache/comet/exec/CometExecSuite.scala | 111 ------------------ 11 files changed, 2 insertions(+), 393 deletions(-) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 8abc19c1f5..18c3acf2dd 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1034,13 +1034,6 @@ impl PhysicalPlanner { )) } OpStruct::HashAgg(agg) => { - // [#4515 instrumentation] Every HashAgg construction with proto sizes. - log::warn!( - "[#4515] OpStruct::HashAgg grouping_exprs.len={} agg_exprs.len={} mode={}", - agg.grouping_exprs.len(), - agg.agg_exprs.len(), - agg.mode - ); assert_eq!(children.len(), 1); let (scans, shuffle_scans, child) = self.create_plan(&children[0], inputs, partition_count)?; @@ -1424,16 +1417,6 @@ impl PhysicalPlanner { OpStruct::Scan(scan) => { let data_types = scan.fields.iter().map(to_arrow_datatype).collect_vec(); - // [#4515 instrumentation] Log every JVM-bridge Scan's declared column count. - // A 0-column scan paired with a JVM iterator producing batches with columns is - // the AIOOBE-on-exportBatch shape; a non-empty list confirms the inverse. - log::info!( - "[#4515] ScanExec source='{}' declared {} cols: {:?}", - scan.source, - data_types.len(), - data_types - ); - // If it is not test execution context for unit test, we should have at least one // input source if self.exec_context_id != TEST_EXEC_CONTEXT_ID && inputs.is_empty() { diff --git a/spark/src/main/java/org/apache/comet/CometBatchIterator.java b/spark/src/main/java/org/apache/comet/CometBatchIterator.java index 1392f028c0..9b48a47c57 100644 --- a/spark/src/main/java/org/apache/comet/CometBatchIterator.java +++ b/spark/src/main/java/org/apache/comet/CometBatchIterator.java @@ -38,8 +38,6 @@ public class CometBatchIterator { private final NativeUtil nativeUtil; private ColumnarBatch previousBatch = null; private ColumnarBatch currentBatch = null; - // [#4515 instrumentation] gate first-batch log per instance - private boolean loggedFirstBatch = false; CometBatchIterator(Iterator input, NativeUtil nativeUtil) { this.input = input; @@ -81,20 +79,6 @@ public int next(long[] arrayAddrs, long[] schemaAddrs) { return -1; } - // [#4515 instrumentation] Log first-batch shape per CometBatchIterator instance. - if (!loggedFirstBatch) { - loggedFirstBatch = true; - org.slf4j.LoggerFactory.getLogger("[#4515]") - .warn( - "CometBatchIterator.next first batch: numCols={} numRows={} arrayAddrs.length={} schemaAddrs.length={} inputCls={} this={}", - currentBatch.numCols(), - currentBatch.numRows(), - arrayAddrs.length, - schemaAddrs.length, - input.getClass().getName(), - System.identityHashCode(this)); - } - // export the batch using the Arrow C Data Interface int numRows = nativeUtil.exportBatch(arrayAddrs, schemaAddrs, currentBatch); diff --git a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala index 98a9bc0684..6140eca553 100644 --- a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala +++ b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala @@ -92,27 +92,6 @@ class CometExecIterator( val conf = SparkEnv.get.conf val localDiskDirs = SparkEnv.get.blockManager.getLocalDiskDirs - // [#4515 instrumentation] Dump proto plan + per-input iterator class so we can correlate - // a 0-col Scan in the proto with the JVM iterator that will feed it. - { - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - val parsed = - scala.util - .Try( - org.apache.comet.serde.OperatorOuterClass.Operator - .parseFrom(protobufQueryPlan) - .toString) - .toOption - .getOrElse("") - val itersDesc = inputIterators.zipWithIndex - .map { case (it, idx) => s" inputIterators[$idx] cls=${it.getClass.getName}" } - .mkString("\n") - log.warn( - s"CometExecIterator constructing plan id=$id partition=$partitionIndex " + - s"numParts=$numParts numOutputCols=$numOutputCols\n$itersDesc\n" + - s" proto plan:\n$parsed") - } - // serialize Comet related Spark configs in protobuf format val protobufSparkConfigs = CometExecIterator.serializeCometSQLConfs() diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index 6eb314bbb6..aeb7db40ad 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -166,16 +166,6 @@ case class CometExecRule(session: SparkSession) val reverted = s.originalPlan.withNewChildren(Seq(s.child)).asInstanceOf[ShuffleExchangeExec] reverted.setTagValue(CometExecRule.SKIP_COMET_SHUFFLE_TAG, ()) - // [#4515 instrumentation] Log every revert: this is a primary place where - // vanilla ShuffleExchangeExec instances enter the plan post-Comet-rewrite. - org.slf4j.LoggerFactory - .getLogger("[#4515]") - .warn( - s"revertRedundantColumnarShuffle revert: parentAgg=${op.getClass.getSimpleName} " + - s"(output=${op.output}) cometShuffle.output=${s.output} " + - s"reverted.output=${reverted.output} reverted.identityHash=${System - .identityHashCode(reverted)}\n" + - s" reverted tree:\n${reverted.treeString(verbose = true, addSuffix = false)}") logInfo( "Reverting Comet columnar shuffle to Spark shuffle between " + s"${op.getClass.getSimpleName} and ${s.child.getClass.getSimpleName} " + @@ -556,20 +546,6 @@ case class CometExecRule(session: SparkSession) |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")} |""".stripMargin) } - // [#4515 instrumentation] Dump the plan in/out of CometExecRule to correlate with the - // 0-col Scan synthesized later. Logs the operator-class diff so we can see what - // CometExecRule did or didn't replace, especially around ShuffleExchangeExec wrappers - // inside subqueries. - if (!newPlan.fastEquals(plan)) { - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - log.warn( - "CometExecRule rewrote plan:\n IN classes: " + - plan.collect { case p => p.getClass.getSimpleName }.mkString(",") + - "\n OUT classes: " + - newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") + - s"\n IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" + - s"\n OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}") - } newPlan } diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 1f7f509db0..7601fa1c6b 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -68,17 +68,6 @@ case class CometScanRule(session: SparkSession) |${sideBySide(plan.treeString, newPlan.treeString).mkString("\n")} |""".stripMargin) } - // [#4515 instrumentation] - if (!newPlan.fastEquals(plan)) { - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - log.warn( - "CometScanRule rewrote plan:\n IN classes: " + - plan.collect { case p => p.getClass.getSimpleName }.mkString(",") + - "\n OUT classes: " + - newPlan.collect { case p => p.getClass.getSimpleName }.mkString(",") + - s"\n IN tree:\n${plan.treeString(verbose = true, addSuffix = false)}" + - s"\n OUT tree:\n${newPlan.treeString(verbose = true, addSuffix = false)}") - } newPlan } diff --git a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala index 8a1c576a2d..845803d133 100644 --- a/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala +++ b/spark/src/main/scala/org/apache/comet/serde/operator/CometSink.scala @@ -49,12 +49,6 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] { op: T, builder: Operator.Builder, childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = { - // [#4515 instrumentation] - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - log.warn( - s"CometSink[${this.getClass.getSimpleName}].convert op=${op.getClass.getName} " + - s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " + - s"output.size=${op.output.size}") val supportedTypes = op.output.forall(a => supportedDataType(a.dataType, allowComplex = true)) @@ -78,53 +72,6 @@ abstract class CometSink[T <: SparkPlan] extends CometOperatorSerde[T] { } if (scanTypes.length == op.output.length) { - // [#4515 instrumentation] Log when we synthesize a Scan with zero declared columns. - // The runtime JVM iterator may still produce columns (subquery output shrunk by - // catalyst before serialization while the underlying RDD reflects the pre-shrink shape), - // tripping the column-count guard in NativeUtil.exportBatch. - if (scanTypes.isEmpty) { - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - // scalastyle:off line.size.limit - val childInfo = op.children.zipWithIndex - .map { case (c, i) => - val canonOut = scala.util - .Try(c.canonicalized.output) - .toOption - .map(_.toString) - .getOrElse("") - s" child[$i] cls=${c.getClass.getName} simpleString='${c.simpleString( - 80)}' output=${c.output} outputSize=${c.output.size} identityHash=${System - .identityHashCode(c)} canonicalized.output=$canonOut" - } - .mkString("\n") - val opCanonOut = scala.util - .Try(op.canonicalized.output) - .toOption - .map(_.toString) - .getOrElse("") - val subqueryInfo = scala.util - .Try(op.subqueries.map(s => s"${s.getClass.getName}(output=${s.output}, prepared=?)")) - .toOption - .getOrElse(Nil) - .mkString("[", ", ", "]") - val callerStack = - new RuntimeException("[#4515] CometSink 0-col Scan caller").getStackTrace - .take(20) - .map(f => s" at ${f}") - .mkString("\n") - log.warn(s"CometSink synthesizing 0-col Scan for op=${op.getClass.getName}\n" + - s" simpleString='${op.simpleStringWithNodeId()}'\n" + - s" op.output=${op.output} op.outputSet=${op.outputSet} op.references=${op.references}\n" + - s" op.canonicalized.output=$opCanonOut\n" + - s" op.subqueries=$subqueryInfo\n" + - s" op identityHash=${System.identityHashCode(op)}\n" + - s" children classes=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}\n" + - childInfo + "\n" + - s" caller stack:\n$callerStack\n" + - s" op tree:\n${op.treeString(verbose = true, addSuffix = false)}") - // scalastyle:on line.size.limit - } - scanBuilder.addAllFields(scanTypes.asJava) // Sink operators don't have children @@ -147,27 +94,6 @@ object CometExchangeSink extends CometSink[SparkPlan] { op: SparkPlan, builder: Operator.Builder, childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = { - // [#4515 instrumentation] - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - val isVanillaSparkExchange = - op.getClass.getName == "org.apache.spark.sql.execution.exchange.ShuffleExchangeExec" - log.warn( - s"CometExchangeSink.convert op=${op.getClass.getName} " + - s"simpleString='${op.simpleStringWithNodeId()}' output=${op.output} " + - s"useShuffleScan=${shouldUseShuffleScan(op)} " + - s"children=${op.children.map(_.getClass.getSimpleName).mkString("[", ",", "]")}") - if (isVanillaSparkExchange) { - val callerStack = - new RuntimeException("[#4515] vanilla ShuffleExchangeExec caller").getStackTrace - .take(20) - .map(f => s" at ${f}") - .mkString("\n") - log.warn( - " vanilla ShuffleExchangeExec being processed by CometExchangeSink:\n" + - s" output=${op.output}\n" + - s" caller stack:\n$callerStack\n" + - s" op tree:\n${op.treeString(verbose = true, addSuffix = false)}") - } if (shouldUseShuffleScan(op)) { convertToShuffleScan(op, builder) } else { diff --git a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala index 388f5114e6..4f027cd9e7 100644 --- a/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala +++ b/spark/src/main/scala/org/apache/comet/vector/NativeUtil.scala @@ -114,28 +114,6 @@ class NativeUtil { batch: ColumnarBatch): Int = { val numRows = mutable.ArrayBuffer.empty[Int] - if (arrayAddrs.length != batch.numCols() || schemaAddrs.length != batch.numCols()) { - val schemaSummary = (0 until batch.numCols()) - .map { i => - val v = batch.column(i) match { - case cv: CometVector => cv.getValueVector - case _ => null - } - if (v != null) s"col[$i]: ${v.getField}" - else s"col[$i]: ${batch.column(i).getClass.getName}" - } - .mkString("; ") - val taskAttempt = Option(org.apache.spark.TaskContext.get()) - .map(c => s"stage=${c.stageId} task=${c.taskAttemptId} partition=${c.partitionId}") - .getOrElse("no-task") - throw new SparkException( - "CometBatchIterator column-count mismatch [#4515 instrumentation]: " + - s"native expected arrayAddrs=${arrayAddrs.length}, schemaAddrs=${schemaAddrs.length}; " + - s"JVM iterator produced batch.numCols=${batch.numCols()} ($taskAttempt). " + - s"Batch schema: $schemaSummary", - new RuntimeException("placeholder for exportBatch column-count mismatch")) - } - (0 until batch.numCols()).foreach { index => batch.column(index) match { case a: CometVector => diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala index ba152c2f13..1470d637d9 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala @@ -243,23 +243,6 @@ case class CometShuffleExchangeExec( * Comet returns RDD[ColumnarBatch] for columnar execution. */ protected override def doExecuteColumnar(): RDD[ColumnarBatch] = { - // [#4515 instrumentation] Track every doExecuteColumnar call: which CometShuffleExchange - // instance, its output, and the call site. Helps confirm whether this PR's changes - // cause an extra EnsureRequirements-inserted vanilla Exchange to wrap us, and what - // RDD is being plumbed where. - { - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - val callerStack = new RuntimeException( - "[#4515] CometShuffleExchangeExec.doExecuteColumnar caller").getStackTrace - .take(15) - .map(f => s" at ${f}") - .mkString("\n") - log.warn( - s"CometShuffleExchangeExec.doExecuteColumnar this=${System.identityHashCode(this)} " + - s"shuffleType=$shuffleType outputPartitioning=$outputPartitioning " + - s"output=$output\n" + - s" caller stack:\n$callerStack") - } // Returns the same CometShuffledBatchRDD if this plan is used by multiple plans. if (cachedShuffleRDD == null) { cachedShuffleRDD = new CometShuffledBatchRDD(shuffleDependency, readMetrics) @@ -704,22 +687,6 @@ object CometShuffleExchangeExec serializer: Serializer, metrics: Map[String, SQLMetric]): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { - // [#4515 instrumentation] Track every placeholder-Scan ShuffleDependency we build, with - // outputAttributes (drives Scan declared schema) and call site. - { - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - val callerStack = - new RuntimeException("[#4515] prepareShuffleDependency caller").getStackTrace - .take(15) - .map(f => s" at ${f}") - .mkString("\n") - log.warn( - s"prepareShuffleDependency outputAttributes=$outputAttributes " + - s"outputAttributes.size=${outputAttributes.size} " + - s"outputPartitioning=$outputPartitioning rdd.numPartitions=${rdd.getNumPartitions}\n" + - s" caller stack:\n$callerStack") - } - val scanBuilder = OperatorOuterClass.Scan.newBuilder().setSource("ShuffleWriterInput") val scanTypes = outputAttributes.flatten { attr => QueryPlanSerde.serializeDataType(attr.dataType) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala index e0e5c45b2a..7604910b06 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala @@ -162,34 +162,7 @@ class CometShuffledBatchRDD( override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = { val reader = createReader(split, context) // TODO: Reads IPC by native code - val raw = reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2) - // [#4515 instrumentation] Peek the first decoded batch to confirm wire schema vs caller - // expectations. Wraps so we don't consume the iterator. - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - new Iterator[ColumnarBatch] { - private var logged = false - override def hasNext: Boolean = raw.hasNext - override def next(): ColumnarBatch = { - val b = raw.next() - if (!logged) { - logged = true - val schemaSummary = (0 until b.numCols()) - .map { i => - val v = b.column(i) match { - case cv: org.apache.comet.vector.CometVector => cv.getValueVector - case _ => null - } - if (v != null) s"col[$i]: ${v.getField}" - else s"col[$i]: ${b.column(i).getClass.getName}" - } - .mkString("; ") - log.warn(s"CometShuffledBatchRDD.compute first decoded batch: numCols=${b.numCols()} " + - s"numRows=${b.numRows()} stage=${context.stageId()} task=${context.taskAttemptId()} " + - s"partition=${split.index} schema=[$schemaSummary]") - } - b - } - } + reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2) } override def clearDependencies(): Unit = { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 3240adcc01..7ec8a2e898 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -422,15 +422,7 @@ abstract class CometNativeExec extends CometExec { /** The Comet native operator */ def nativeOp: Operator - override protected def doPrepare(): Unit = { - // [#4515 instrumentation] Track when subqueries are prepared for this CometNativeExec. - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - log.warn( - s"CometNativeExec.doPrepare this=${System.identityHashCode(this)} " + - s"cls=${this.getClass.getName} " + - s"originalPlan.cls=${originalPlan.getClass.getName}") - prepareSubqueries(this) - } + override protected def doPrepare(): Unit = prepareSubqueries(this) override lazy val metrics: Map[String, SQLMetric] = CometMetricNode.baselineMetrics(sparkContext) @@ -571,14 +563,6 @@ abstract class CometNativeExec extends CometExec { // same partition number. But for Comet, we need to zip them so we need to adjust the // partition number of Broadcast RDDs to make sure they have the same partition number. sparkPlans.zipWithIndex.foreach { case (plan, idx) => - // [#4515 instrumentation] Log every JVM-side input plan we wire to native, so we can - // correlate the Scan's declared schema with the runtime plan whose RDD feeds it. - val log = org.slf4j.LoggerFactory.getLogger("[#4515]") - log.warn( - s"buildNativeContext binding input[$idx] cls=${plan.getClass.getName} " + - s"simpleString='${plan.simpleStringWithNodeId()}' output=${plan.output} " + - s"output.size=${plan.output.size} identityHash=${System.identityHashCode(plan)}\n" + - s" subtree:\n${plan.treeString(verbose = true, addSuffix = false)}") plan match { case c: CometBroadcastExchangeExec => inputs += c.executeColumnar(firstNonBroadcastPlanNumPartitions) @@ -1532,25 +1516,6 @@ trait CometBaseAggregate { // If the aggregateExpressions is empty, we only want to build groupingExpressions, // and skip processing of aggregateExpressions. if (aggregateExpressions.isEmpty) { - // [#4515 instrumentation] Track HashAgg serializations with empty resultExpressions. - // Catalyst prunes resultExpressions for EXISTS / row-existence-only subqueries; the - // native side currently interprets empty result_exprs as "use aggregate natural - // schema", which leaks grouping keys into the output. - org.slf4j.LoggerFactory - .getLogger("[#4515]") - .warn( - "HashAgg empty-aggExprs branch: " + - s"groupingExprs=${groupingExpressions} " + - s"resultExpressions=${resultExpressions} " + - s"resultExpressions.size=${resultExpressions.size} " + - s"aggregate.output=${aggregate.output} " + - s"aggregate.output.size=${aggregate.output.size} " + - s"modes(from aggExprs)=${modes} " + - s"sparkFinalMode=$sparkFinalMode " + - s"requiredChildDistribution=${aggregate.requiredChildDistribution} " + - s"isProjectionToEmpty=${resultExpressions.isEmpty && aggregate.output.isEmpty} " + - s"naturalEqualsIntent=${resultExpressions.map(_.toAttribute) == groupingExpressions - .map(_.toAttribute)}") val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder() hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava) // The native HashAggregate emits its natural shape (the grouping keys, since there diff --git a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala index fded42d050..16601d056b 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometExecSuite.scala @@ -3959,117 +3959,6 @@ class CometExecSuite extends CometTestBase { } } - // Repro for the Spark 3.5 SQL-tests failure on subquery/exists-subquery/exists-orderby-limit.sql - // (query #19) on branch opt_native_shuffle. Crashes with NativeUtil.exportBatch:132 AIOOBE - // on this branch but passes on main. The SQL file declares three CONFIG_DIM1 combos for - // codegen; the failure trace doesn't say which one fires it, so sweep all three. - test("EXISTS subquery with GROUP BY + LIMIT + OFFSET") { - withTempView("emp", "dept") { - spark - .sql("""SELECT * FROM VALUES - | (100, 'emp 1', date '2005-01-01', 100.0D, 10), - | (200, 'emp 2', date '2003-01-01', 200.0D, 10), - | (300, 'emp 3', date '2002-01-01', 300.0D, 20), - | (400, 'emp 4', date '2005-01-01', 400.0D, 30), - | (500, 'emp 5', date '2001-01-01', 400.0D, NULL), - | (700, 'emp 7', date '2010-01-01', 400.0D, 100), - | (800, 'emp 8', date '2016-01-01', 150.0D, 70) - |AS t(id, emp_name, hiredate, salary, dept_id)""".stripMargin) - .createOrReplaceTempView("emp") - spark - .sql("""SELECT * FROM VALUES - | (10, 'dept 1', 'CA'), - | (20, 'dept 2', 'NY'), - | (30, 'dept 3', 'TX'), - | (40, 'dept 4 - unassigned', 'OR'), - | (50, 'dept 5 - unassigned', 'NJ'), - | (70, 'dept 7', 'FL') - |AS t(dept_id, dept_name, state)""".stripMargin) - .createOrReplaceTempView("dept") - - val configDims = Seq( - Map("spark.sql.codegen.wholeStage" -> "true"), - Map( - "spark.sql.codegen.wholeStage" -> "false", - "spark.sql.codegen.factoryMode" -> "CODEGEN_ONLY"), - Map( - "spark.sql.codegen.wholeStage" -> "false", - "spark.sql.codegen.factoryMode" -> "NO_CODEGEN")) - - // Mirror the SQL test order: queries #1 through #16 from the file run before #17 (the - // one whose CI failure we're chasing). Query #11's subquery is identical except for - // the OFFSET; #13 and #15 share similar shapes. Subquery materialization / AQE plan - // cache state from running them first may alter query #17's executed shape. - val priorQueries = Seq( - // #1 - """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""", - // #2 - """SELECT id, hiredate FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate DESC""", - // #3 - """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""", - // #4 - """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""", - // #5 - """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate""", - // #6 - """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state)""", - // #7 - """SELECT count(*) FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept_id ORDER BY dept_id)""", - // #8 - """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 1) ORDER BY hiredate""", - // #9 - """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id FROM dept WHERE emp.dept_id = dept.dept_id ORDER BY state LIMIT 0) ORDER BY hiredate""", - // #10 - """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1)""", - // #11 - same subquery as #17 minus the OFFSET - """SELECT * FROM emp WHERE EXISTS (SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1)""", - // #12 - """SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 100 LIMIT 1)""", - // #13 - """SELECT * FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) FROM dept WHERE dept.dept_id > 100 GROUP BY state LIMIT 1)""", - // #14 - """SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state LIMIT 2 OFFSET 1)""", - // #15 - """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1 OFFSET 2)""", - // #16 - """SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > emp.dept_id LIMIT 1)""") - - for (dim <- configDims) { - // SQL-tests harness (dev/diffs/3.5.8.diff, 4.1.1.diff) sets only: - // spark.comet.enabled, spark.comet.exec.enabled, spark.comet.exec.shuffle.enabled, - // spark.comet.parquet.respectFilterPushdown, spark.shuffle.manager, - // spark.comet.memoryOverhead. - // It does NOT enable spark.comet.sparkToColumnar.enabled, but CometTestBase does. - // Force the harness shape so the reproducer matches the failing config. - val harnessConf = dim ++ Map(CometConf.COMET_SPARK_TO_ARROW_ENABLED.key -> "false") - withSQLConf(harnessConf.toSeq: _*) { - // scalastyle:off println - println(s"\n===== config dim: $harnessConf =====") - for ((q, idx) <- priorQueries.zipWithIndex) { - spark.sql(q).collect() - println(s"--- ran prior query #${idx + 1} ---") - } - val sql = """SELECT * - |FROM emp - |WHERE EXISTS (SELECT max(dept.dept_id) - | FROM dept - | GROUP BY state - | LIMIT 1 - | OFFSET 2)""".stripMargin - val df = spark.sql(sql) - println("--- query #17 initial executedPlan ---") - println(df.queryExecution.executedPlan) - val rows = df.collect() - println("--- query #17 final (post-AQE) executedPlan ---") - println(df.queryExecution.executedPlan) - println(s"--- ${rows.length} rows ---") - // scalastyle:on println - checkSparkAnswer(sql) - } - } - } - } - } case class BucketedTableTestSpec( From 214a75be2f7def766bc0c98158651c7ff01b5db6 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 15:04:28 -0400 Subject: [PATCH 08/13] cleanup. --- native/core/src/execution/planner.rs | 10 +- native/proto/src/proto/operator.proto | 7 +- .../apache/spark/sql/comet/operators.scala | 134 ++++++++---------- .../comet/exec/CometAggregateSuite.scala | 118 +++++---------- 4 files changed, 97 insertions(+), 172 deletions(-) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 18c3acf2dd..328ca65760 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1164,13 +1164,9 @@ impl PhysicalPlanner { )?, ); - // The native HashAggregate emits its natural shape (group keys + agg - // results / state). Any post-aggregate projection Spark catalyst declares - // (`COUNT(col) + 1`, EXISTS-pruned-to-empty output, alias renames, etc.) is - // expressed as an explicit `OpStruct::Projection` op above the aggregate - // by the JVM serializer (see `CometBaseAggregate.doConvert`). Keeping that - // logic on the JVM side means only one place decides plan shape, and the - // native side stays a faithful executor. See comet#4515. + // HashAggregate emits its natural shape (group keys + agg results); any + // post-aggregate projection is serialized as an explicit `OpStruct::Projection` + // op above by the JVM serializer (see `CometBaseAggregate.doConvert`) Ok(( scans, shuffle_scans, diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index aaff3cc4de..cc56331d74 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -294,11 +294,8 @@ message Sort { message HashAggregate { repeated spark.spark_expression.Expr grouping_exprs = 1; repeated spark.spark_expression.AggExpr agg_exprs = 2; - // Field 3 (`result_exprs`) and field 8 (`apply_result_projection`) were used to apply a - // post-aggregate projection inside the HashAggregate operator. The same effect is now - // expressed by emitting an explicit `Projection` op above the `HashAggregate` from the - // JVM serializer when needed (see `CometBaseAggregate.doConvert`). Reserved to avoid - // accidental reuse at incompatible semantics. + // Was result_exprs / apply_result_projection; now expressed as an explicit Projection + // op above HashAggregate (see CometBaseAggregate.doConvert, comet#4515). reserved 3, 8; reserved "result_exprs", "apply_result_projection"; AggregateMode mode = 5; diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 7ec8a2e898..cbd9dc4cbb 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -1518,48 +1518,12 @@ trait CometBaseAggregate { if (aggregateExpressions.isEmpty) { val hashAggBuilder = OperatorOuterClass.HashAggregate.newBuilder() hashAggBuilder.addAllGroupingExprs(groupingExprs.map(_.get).asJava) - // The native HashAggregate emits its natural shape (the grouping keys, since there - // are no aggregate functions). When Spark catalyst declares a different output - - // either column-renaming via aliases, or an entirely empty output for catalyst-pruned - // EXISTS / row-existence-only subqueries - we wrap the HashAggregate in an explicit - // Projection op so the native side reshapes accordingly. See comet#4515: an empty - // declared output paired with the natural grouping-key output crashed downstream - // boundaries that derived their schema from the declared output. - val naturalOutput = groupingExpressions.map(_.toAttribute) - val needsProjection = resultExpressions.map(_.toAttribute) != naturalOutput - if (needsProjection) { - val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes - val resultExprs = resultExpressions.map(exprToProto(_, attributes)) - if (resultExprs.exists(_.isEmpty)) { - withInfo( - aggregate, - s"Unsupported result expressions found in: $resultExpressions", - resultExpressions: _*) - return None - } - // Build the inner HashAgg op carrying the original child operators from `builder`. - // Use a fresh outer builder for the Projection so it gets a single child (the - // HashAgg op), not the original children appended on top. Both ops share the same - // plan_id so the inner aggregate's native metrics roll up under the same Spark - // operator in the metric tree (otherwise they'd orphan against plan_id=0). - val hashAggOp = OperatorOuterClass.Operator - .newBuilder() - .setPlanId(builder.getPlanId) - .addAllChildren(builder.getChildrenList) - .setHashAgg(hashAggBuilder) - .build() - val projectionBuilder = OperatorOuterClass.Projection.newBuilder() - projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava) - Some( - OperatorOuterClass.Operator - .newBuilder() - .setPlanId(builder.getPlanId) - .addChildren(hashAggOp) - .setProjection(projectionBuilder) - .build()) - } else { - Some(builder.setHashAgg(hashAggBuilder).build()) - } + buildAggOp( + builder, + hashAggBuilder, + groupingExpressions.map(_.toAttribute), + resultExpressions, + aggregate) } else { // Validate mode combinations. We support: // - All Partial @@ -1647,40 +1611,15 @@ trait CometBaseAggregate { } // Final aggregations may carry a result projection (e.g. `COUNT(col) + 1`) that - // catalyst encodes via `resultExpressions`. DataFusion's hash aggregate only emits - // its natural shape (group keys + agg results), so we wrap the HashAggregate in - // an explicit Projection op to apply Spark's result expressions. Partial / - // PartialMerge aggregates emit raw state buffers and never need the projection. - // See comet#4515. + // catalyst encodes via `resultExpressions`. Partial / PartialMerge aggregates emit + // raw state buffers and never need it. See comet#4515. if (mode == CometAggregateMode.Final) { - val attributes = groupingExpressions.map(_.toAttribute) ++ aggregateAttributes - val resultExprs = resultExpressions.map(exprToProto(_, attributes)) - if (resultExprs.exists(_.isEmpty)) { - withInfo( - aggregate, - s"Unsupported result expressions found in: $resultExpressions", - resultExpressions: _*) - return None - } - // Inner HashAgg keeps the original input children from `builder`. Outer - // Projection uses a fresh builder so it has a single child (the HashAgg op). - // Both ops share the same plan_id so the aggregate's native metrics aggregate - // under the same Spark operator (else they'd orphan against plan_id=0). - val hashAggOp = OperatorOuterClass.Operator - .newBuilder() - .setPlanId(builder.getPlanId) - .addAllChildren(builder.getChildrenList) - .setHashAgg(hashAggBuilder) - .build() - val projectionBuilder = OperatorOuterClass.Projection.newBuilder() - projectionBuilder.addAllProjectList(resultExprs.map(_.get).asJava) - Some( - OperatorOuterClass.Operator - .newBuilder() - .setPlanId(builder.getPlanId) - .addChildren(hashAggOp) - .setProjection(projectionBuilder) - .build()) + buildAggOp( + builder, + hashAggBuilder, + groupingExpressions.map(_.toAttribute) ++ aggregateAttributes, + resultExpressions, + aggregate) } else { Some(builder.setHashAgg(hashAggBuilder).build()) } @@ -1694,6 +1633,51 @@ trait CometBaseAggregate { } + /** + * Serialize a HashAggregate, wrapping it in an explicit `Projection` op when Spark's declared + * output (`resultExpressions`) differs from the aggregate's natural output. DataFusion's hash + * aggregate emits only its natural shape (group keys + agg results), so any reshape catalyst + * declared - alias renames, `COUNT(col) + 1`, or empty output for catalyst-pruned EXISTS / + * row-existence-only subqueries - is expressed as a separate Projection above the HashAgg. Both + * ops share the caller's `plan_id` so the aggregate's native metrics roll up under the same + * Spark operator. See comet#4515. + */ + private def buildAggOp( + builder: Operator.Builder, + hashAggBuilder: OperatorOuterClass.HashAggregate.Builder, + naturalOutput: Seq[Attribute], + resultExpressions: Seq[NamedExpression], + aggregate: BaseAggregateExec): Option[Operator] = { + if (resultExpressions.map(_.toAttribute) == naturalOutput) { + return Some(builder.setHashAgg(hashAggBuilder).build()) + } + val resultExprs = resultExpressions.map(exprToProto(_, naturalOutput)) + if (resultExprs.exists(_.isEmpty)) { + withInfo( + aggregate, + s"Unsupported result expressions found in: $resultExpressions", + resultExpressions: _*) + return None + } + val planId = builder.getPlanId + val hashAggOp = OperatorOuterClass.Operator + .newBuilder() + .setPlanId(planId) + .addAllChildren(builder.getChildrenList) + .setHashAgg(hashAggBuilder) + .build() + val projection = OperatorOuterClass.Projection + .newBuilder() + .addAllProjectList(resultExprs.map(_.get).asJava) + Some( + OperatorOuterClass.Operator + .newBuilder() + .setPlanId(planId) + .addChildren(hashAggOp) + .setProjection(projection) + .build()) + } + /** * Find the first Comet partial aggregate in the plan. If it reaches a Spark HashAggregate with * partial or partial-merge mode, it will return None. diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala index f098040878..3e883d7a32 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala @@ -2109,95 +2109,43 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } - // Regression for comet#4515: a HashAggregateExec whose `resultExpressions` (and therefore - // `output`) catalyst has pruned to empty must still produce 0-col batches at runtime. - // Catalyst prunes `resultExpressions=[]` for plans where the aggregate's column values are - // unused downstream - classically EXISTS subqueries that get rewritten into a literal-`1` - // wrapper. Before the fix, the native HashAggregate emitted its natural output (the - // grouping keys) regardless of the pruned JVM `output`, so any boundary that derived a - // schema from `output` (e.g. a wrapping vanilla Spark `ShuffleExchangeExec.output = - // child.output = []`, or a JVM-bridge Scan synthesized from the same `output`) declared - // 0 columns while the runtime RDD produced 1. The mismatch tripped - // `NativeUtil.exportBatch` with an ArrayIndexOutOfBoundsException on a length-0 - // schemaAddrs[]. + // Regression for comet#4515: catalyst prunes `HashAggregateExec.resultExpressions` to + // empty for EXISTS / row-existence-only subqueries. The native HashAggregate's natural + // output (the grouping keys) then disagrees with the pruned JVM `output`, leaking through + // any boundary that derived its schema from `output`. The fix wraps the aggregate in an + // explicit Projection op when natural != declared. // - // The bug needs a specific catalyst optimizer state: `HashAggregateExec. - // resultExpressions.isEmpty`. Whether the optimizer reaches that state from a given SQL - // depends on Spark version, scan source (LocalTableScan vs Parquet/native), AQE state, - // and which Comet rules already fired - we observed it in the SQL-tests harness running - // `subquery/exists-subquery/exists-orderby-limit.sql` on Spark 4.0.2 with parquet-backed - // temp views, but not via `Dataset.collect` over `LocalTableScan`-backed temp views - // under `CometTestBase`. So the test below writes parquet (matching the harness's scan - // shape), tries several known triggers, runs whichever (if any) produces the bug shape - // under `checkSparkAnswer`, and skips cleanly otherwise. The upstream SQL-tests run - // remains the primary safety net for the harness-only path. + // Surfaced upstream in `subquery/exists-subquery/exists-orderby-limit.sql` (query #19, + // an EXISTS over `max(...) GROUP BY state LIMIT 1 OFFSET 2`). The exact `EXISTS-in-WHERE` + // shape doesn't reproduce under CometTestBase's optimizer state, but `count(*)` over the + // same derived aggregate triggers the equivalent ColumnPruning path locally - we assert + // the inner HashAgg's resultExpressions actually got pruned, so a future Spark version + // that breaks the trigger fails the test loudly rather than passing silently. test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output (#4515)") { withTempDir { dir => - withTempView("emp", "dept") { - // Write parquet so Comet's native scan path (vs LocalTableScan) is the source - - // matches the SQL-tests harness setup that surfaced the bug. - val empPath = new java.io.File(dir, "emp").getAbsolutePath - val deptPath = new java.io.File(dir, "dept").getAbsolutePath - - spark - .sql("""SELECT * FROM VALUES - | (100, 'emp 1', 100.0D, 10), - | (200, 'emp 2', 200.0D, 10), - | (300, 'emp 3', 300.0D, 20), - | (400, 'emp 4', 400.0D, 30), - | (500, 'emp 5', 400.0D, NULL), - | (700, 'emp 7', 400.0D, 100), - | (800, 'emp 8', 150.0D, 70) - |AS t(id, emp_name, salary, dept_id)""".stripMargin) - .write - .parquet(empPath) - spark - .sql("""SELECT * FROM VALUES - | (10, 'CA'), (20, 'NY'), (30, 'TX'), - | (40, 'OR'), (50, 'NJ'), (70, 'FL') - |AS t(dept_id, state)""".stripMargin) - .write - .parquet(deptPath) - - spark.read.parquet(empPath).createOrReplaceTempView("emp") - spark.read.parquet(deptPath).createOrReplaceTempView("dept") - - val candidates = Seq( - // The original failing SQL from the harness - EXISTS with grouped agg + LIMIT/OFFSET. - """SELECT * FROM emp - |WHERE EXISTS ( - | SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin, - // Inline view + outer constant: ColumnPruning may strip the inner agg's output. - """SELECT 1 FROM ( - | SELECT max(dept_id) FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin, - // Scalar subquery returning a constant. - """SELECT (SELECT 1 FROM dept GROUP BY state LIMIT 1 OFFSET 2)""".stripMargin, - // count(*) over a derived table: outer doesn't reference inner cols. + val deptPath = new Path(dir.toURI.toString, "dept") + spark + .sql("""SELECT * FROM VALUES + | (10, 'CA'), (20, 'NY'), (30, 'TX'), + | (40, 'OR'), (50, 'NJ'), (70, 'FL') + |AS t(dept_id, state)""".stripMargin) + .write + .parquet(deptPath.toUri.toString) + withParquetTable(deptPath.toUri.toString, "dept") { + val sql = """SELECT count(*) FROM ( - | SELECT max(dept_id) AS m FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin) - - // Find a candidate whose plan has a HashAggregateExec (Spark or Comet) with empty - // resultExpressions. collectWithSubqueries traverses Subquery nodes too. - val triggering = candidates.find { sql => - val plan = spark.sql(sql).queryExecution.executedPlan - collectWithSubqueries(plan) { - case a: org.apache.spark.sql.execution.aggregate.HashAggregateExec - if a.resultExpressions.isEmpty => - a - case a: CometHashAggregateExec if a.resultExpressions.isEmpty => a - }.nonEmpty - } - - triggering match { - case Some(sql) => checkSparkAnswer(sql) - case None => - cancel( - "No candidate query produced a HashAggregateExec with empty resultExpressions " + - "in this environment. The catalyst-pruned shape that exercises #4515 only " + - "appears under specific optimizer/AQE state we couldn't reproduce here. The " + - "upstream SQL-tests run (subquery/exists-subquery/exists-orderby-limit.sql) " + - "covers this path.") - } + | SELECT max(dept_id) AS m FROM dept GROUP BY state LIMIT 1 OFFSET 2) sub""".stripMargin + val plan = spark.sql(sql).queryExecution.executedPlan + val pruned = collectWithSubqueries(plan) { + case a: org.apache.spark.sql.execution.aggregate.HashAggregateExec + if a.resultExpressions.isEmpty => + a + case a: CometHashAggregateExec if a.resultExpressions.isEmpty => a + } + assert( + pruned.nonEmpty, + s"Expected a HashAggregateExec with empty resultExpressions in:\n$plan") + checkSparkAnswerAndOperator(sql) } } } From e8e438f451b52e678ff43f01e259851eef37c232 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 15:12:40 -0400 Subject: [PATCH 09/13] cleanup. --- .../src/main/scala/org/apache/spark/sql/comet/operators.scala | 4 ++-- .../scala/org/apache/comet/exec/CometAggregateSuite.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index cbd9dc4cbb..f419ca1b28 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -1612,7 +1612,7 @@ trait CometBaseAggregate { // Final aggregations may carry a result projection (e.g. `COUNT(col) + 1`) that // catalyst encodes via `resultExpressions`. Partial / PartialMerge aggregates emit - // raw state buffers and never need it. See comet#4515. + // raw state buffers and never need it. if (mode == CometAggregateMode.Final) { buildAggOp( builder, @@ -1640,7 +1640,7 @@ trait CometBaseAggregate { * declared - alias renames, `COUNT(col) + 1`, or empty output for catalyst-pruned EXISTS / * row-existence-only subqueries - is expressed as a separate Projection above the HashAgg. Both * ops share the caller's `plan_id` so the aggregate's native metrics roll up under the same - * Spark operator. See comet#4515. + * Spark operator. */ private def buildAggOp( builder: Operator.Builder, diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala index 3e883d7a32..ae14c68207 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala @@ -2109,7 +2109,7 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } - // Regression for comet#4515: catalyst prunes `HashAggregateExec.resultExpressions` to + // Regression: Catalyst prunes `HashAggregateExec.resultExpressions` to // empty for EXISTS / row-existence-only subqueries. The native HashAggregate's natural // output (the grouping keys) then disagrees with the pruned JVM `output`, leaking through // any boundary that derived its schema from `output`. The fix wraps the aggregate in an @@ -2121,7 +2121,7 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { // same derived aggregate triggers the equivalent ColumnPruning path locally - we assert // the inner HashAgg's resultExpressions actually got pruned, so a future Spark version // that breaks the trigger fails the test loudly rather than passing silently. - test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output (#4515)") { + test("HashAggregate with catalyst-pruned resultExpressions returns 0-col output") { withTempDir { dir => val deptPath = new Path(dir.toURI.toString, "dept") spark From 1913208e7b137a95dd4b6002cf24ed7b21c86037 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 17:05:01 -0400 Subject: [PATCH 10/13] fix aggregation wrapping now that we don't have an extra CometExecIterator. --- native/shuffle/src/shuffle_writer.rs | 34 +++++++++---------- .../comet/exec/CometAggregateSuite.scala | 24 +++++++++++++ 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index 8502c79624..b96917550f 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -29,7 +29,7 @@ use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::EmptyRecordBatchStream; use datafusion::{ - arrow::{datatypes::SchemaRef, error::ArrowError}, + arrow::datatypes::SchemaRef, error::Result, execution::context::TaskContext, physical_plan::{ @@ -38,7 +38,7 @@ use datafusion::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, }, }; -use futures::{StreamExt, TryFutureExt, TryStreamExt}; +use futures::{StreamExt, TryStreamExt}; use std::{ any::Any, fmt, @@ -171,23 +171,23 @@ impl ExecutionPlan for ShuffleWriterExec { let input = self.input.execute(partition, Arc::clone(&context))?; let metrics = ShufflePartitionerMetrics::new(&self.metrics, 0); + // Propagate DataFusionError unchanged: the JNI bridge only downcasts a single + // `DataFusionError::External(SparkError)` layer, so any extra wrap here loses the + // typed exception (e.g. SparkArithmeticException on decimal overflow). Ok(Box::pin(RecordBatchStreamAdapter::new( self.schema(), - futures::stream::once( - external_shuffle( - input, - partition, - self.output_data_file.clone(), - self.output_index_file.clone(), - self.partitioning.clone(), - metrics, - context, - self.codec.clone(), - self.tracing_enabled, - self.write_buffer_size, - ) - .map_err(|e| ArrowError::ExternalError(Box::new(e))), - ) + futures::stream::once(external_shuffle( + input, + partition, + self.output_data_file.clone(), + self.output_index_file.clone(), + self.partitioning.clone(), + metrics, + context, + self.codec.clone(), + self.tracing_enabled, + self.write_buffer_size, + )) .try_flatten(), ))) } diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala index ae14c68207..3079db842f 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala @@ -1817,6 +1817,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { // make sure that the error message throws overflow exception only assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) + assert( + cometExc.isInstanceOf[ArithmeticException], + "expected ArithmeticException, got " + + s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long overflow in ANSI mode") } } else { @@ -1831,6 +1835,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) + assert( + cometExc.isInstanceOf[ArithmeticException], + "expected ArithmeticException, got " + + s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long underflow in ANSI mode") } } else { @@ -1870,6 +1878,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) + assert( + cometExc.isInstanceOf[ArithmeticException], + "expected ArithmeticException, got " + + s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for decimal overflow in ANSI mode") } @@ -1893,6 +1905,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) + assert( + cometExc.isInstanceOf[ArithmeticException], + "expected ArithmeticException, got " + + s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long overflow with GROUP BY in ANSI mode") } @@ -1910,6 +1926,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) + assert( + cometExc.isInstanceOf[ArithmeticException], + "expected ArithmeticException, got " + + s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long underflow with GROUP BY in ANSI mode") } @@ -1951,6 +1971,10 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) + assert( + cometExc.isInstanceOf[ArithmeticException], + "expected ArithmeticException, got " + + s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for decimal overflow with GROUP BY in ANSI mode") } From 327a6535d53ac07c076e1faa1f3610441d4d134e Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 17:44:23 -0400 Subject: [PATCH 11/13] Remove archeology comments. --- .../core/src/execution/operators/schema_align.rs | 16 +++++++++------- native/core/src/execution/planner.rs | 3 --- .../shuffle/CometShuffleExchangeExec.scala | 7 +++---- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/native/core/src/execution/operators/schema_align.rs b/native/core/src/execution/operators/schema_align.rs index 3d207e0202..58983d132c 100644 --- a/native/core/src/execution/operators/schema_align.rs +++ b/native/core/src/execution/operators/schema_align.rs @@ -16,11 +16,11 @@ // under the License. //! `SchemaAlignExec` reshapes its child's output so the per-column Arrow type and field-level -//! nullability match what Spark catalyst declared. Used between an inlined native subtree and -//! `ShuffleWriterExec` when the FFI deep-copy + `ScanExec` cast in `build_record_batch` are both -//! gone, so DataFusion / `datafusion-spark` return-type drift would otherwise be written into -//! shuffle blocks. See for the running -//! list of mismatched functions. +//! nullability match what Spark catalyst declared, casting where necessary. Sits between a native +//! subtree and `ShuffleWriterExec` so DataFusion / `datafusion-spark` return-type drift is caught +//! before it reaches shuffle blocks. See +//! for the running list of mismatched +//! functions. use arrow::array::{ArrayRef, RecordBatch, RecordBatchOptions}; use arrow::compute::{cast_with_options, CastOptions}; @@ -53,6 +53,9 @@ fn warn_dedup() -> &'static Mutex> { SET.get_or_init(|| Mutex::new(HashSet::new())) } +/// Casts each column of `child`'s output to the data_type Spark catalyst declared, widening +/// nullability to `actual.nullable || expected.nullable`. See +/// . #[derive(Debug)] pub struct SchemaAlignExec { child: Arc, @@ -74,8 +77,7 @@ impl SchemaAlignExec { /// Build a SchemaAlignExec that aligns `child`'s output to `expected`. Returns /// `Ok(child)` unchanged when no per-column reshape is needed; otherwise wraps `child` /// in a SchemaAlignExec whose target schema preserves `expected`'s data_type and metadata - /// but widens nullability to `actual.nullable || expected.nullable` (matching the - /// reconciliation rule used at the FFI boundary on `main`). + /// but widens nullability to `actual.nullable || expected.nullable`. pub fn try_new_or_passthrough( child: Arc, expected: &SchemaRef, diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 328ca65760..91d77a35b4 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1164,9 +1164,6 @@ impl PhysicalPlanner { )?, ); - // HashAggregate emits its natural shape (group keys + agg results); any - // post-aggregate projection is serialized as an explicit `OpStruct::Projection` - // op above by the JVM serializer (see `CometBaseAggregate.doConvert`) Ok(( scans, shuffle_scans, diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala index 861ce6509e..05f3923714 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala @@ -124,7 +124,7 @@ case class CometShuffleExchangeExec( ctx.shuffleScanIndices) case None => // Non-native child (e.g. CometSparkToColumnarExec): no subtree to inline. The dep gets - // built via the legacy convenience overload below; we just need a real RDD of batches. + // built via the convenience overload below; we just need a real RDD of batches. child.executeColumnar() } } else if (shuffleType == CometColumnarShuffle) { @@ -676,9 +676,8 @@ object CometShuffleExchangeExec * Implemented as a thin wrapper around [[prepareNativeShuffleDependency]]: synthesizes a * `Scan("ShuffleWriterInput")` as the child native op (so the writer's plan is still * `ShuffleWriter -> Scan`, consuming JVM batches via Arrow C Stream), wraps `rdd` as the single - * leaf input of a thin scheduling RDD, and supplies a minimal [[NativeExecContext]]. Same wire - * shape as before; one writer code path for both this case and the [[CometShuffleExchangeExec]] - * case. + * leaf input of a thin scheduling RDD, and supplies a minimal [[NativeExecContext]]. Lets the + * writer use one code path for both this case and the [[CometShuffleExchangeExec]] case. */ def prepareShuffleDependency( rdd: RDD[ColumnarBatch], From 0b71de43014c683630730d0bf297187d979ba091 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 18:54:22 -0400 Subject: [PATCH 12/13] Undo stricter tests since they're not happy on Spark 3.x. --- .../comet/exec/CometAggregateSuite.scala | 33 +++++-------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala index 3079db842f..c441480cec 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala @@ -1805,6 +1805,15 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { (1 to 50).flatMap(_ => Seq((maxDec38_0, 1))) } + // SparkPlan.executeCollect wraps task failures in SparkException("Job aborted..."), so the + // typed exception thrown by Comet (SparkArithmeticException, which extends ArithmeticException) + // ends up in the cause chain rather than at the top. + private def hasArithmeticInChain(t: Throwable): Boolean = + Iterator + .iterate[Throwable](t)(_.getCause) + .takeWhile(_ != null) + .exists(_.isInstanceOf[ArithmeticException]) + test("ANSI support - SUM function") { Seq(true, false).foreach { ansiEnabled => withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) { @@ -1817,10 +1826,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { // make sure that the error message throws overflow exception only assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) - assert( - cometExc.isInstanceOf[ArithmeticException], - "expected ArithmeticException, got " + - s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long overflow in ANSI mode") } } else { @@ -1835,10 +1840,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) - assert( - cometExc.isInstanceOf[ArithmeticException], - "expected ArithmeticException, got " + - s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long underflow in ANSI mode") } } else { @@ -1878,10 +1879,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) - assert( - cometExc.isInstanceOf[ArithmeticException], - "expected ArithmeticException, got " + - s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for decimal overflow in ANSI mode") } @@ -1905,10 +1902,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) - assert( - cometExc.isInstanceOf[ArithmeticException], - "expected ArithmeticException, got " + - s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long overflow with GROUP BY in ANSI mode") } @@ -1926,10 +1919,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) - assert( - cometExc.isInstanceOf[ArithmeticException], - "expected ArithmeticException, got " + - s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for Long underflow with GROUP BY in ANSI mode") } @@ -1971,10 +1960,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { case (Some(sparkExc), Some(cometExc)) => assert(sparkExc.getMessage.contains("ARITHMETIC_OVERFLOW")) assert(cometExc.getMessage.contains("ARITHMETIC_OVERFLOW")) - assert( - cometExc.isInstanceOf[ArithmeticException], - "expected ArithmeticException, got " + - s"${cometExc.getClass.getName}: ${cometExc.getMessage}") case _ => fail("Exception should be thrown for decimal overflow with GROUP BY in ANSI mode") } From 1ecfd8a2001f9a1c5b6313fcca05745f768fdbfb Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Fri, 29 May 2026 18:55:19 -0400 Subject: [PATCH 13/13] Remove unintended change. --- .../org/apache/comet/exec/CometAggregateSuite.scala | 9 --------- 1 file changed, 9 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala index c441480cec..ae14c68207 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala @@ -1805,15 +1805,6 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper { (1 to 50).flatMap(_ => Seq((maxDec38_0, 1))) } - // SparkPlan.executeCollect wraps task failures in SparkException("Job aborted..."), so the - // typed exception thrown by Comet (SparkArithmeticException, which extends ArithmeticException) - // ends up in the cause chain rather than at the top. - private def hasArithmeticInChain(t: Throwable): Boolean = - Iterator - .iterate[Throwable](t)(_.getCause) - .takeWhile(_ != null) - .exists(_.isInstanceOf[ArithmeticException]) - test("ANSI support - SUM function") { Seq(true, false).foreach { ansiEnabled => withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) {