From 30b140745f11314eb0dc68902401a23024bdf211 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 27 May 2026 16:50:17 -0600 Subject: [PATCH] chore(audit): audit collection expressions across Spark 3.4.3, 3.5.8, 4.0.1, 4.1.1 Add per-version audit sub-bullets to `concat`, `reverse`, and `size` in `docs/source/contributor-guide/spark_expressions_support.md`. Spark `Concat` and `Reverse` widen StringType inputs to `StringTypeWithCollation` in 4.0; `Size` is byte-for-byte identical across all four versions. Apply one support-level consistency fix surfaced by the audit: - `CometConcat`: relabel the non-`StringType` branch from `Incompatible(Some(...))` to `Unsupported(Some(...))`. Spark accepts `BinaryType` and `ArrayType`, but Comet has no native path for either, so the user-observable effect is a fallback, not a wrong result. The reason string is now exposed via `getUnsupportedReasons` (rather than `getIncompatibleReasons`) and the constant is now `private val` for parity with other serdes. Tracking issues filed for the gaps found: - #4471 concat for BinaryType and ArrayType inputs (feature) - #4472 size for MapType inputs (feature) Existing #2763 covers `reverse` on array. --- .../spark_expressions_support.md | 15 +++++++++++++++ .../scala/org/apache/comet/serde/strings.scala | 6 +++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 8f82847bb6..c7118222a5 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -180,8 +180,23 @@ - [ ] array_size - [ ] cardinality - [x] concat + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `Concat(children) extends ComplexTypeMergingExpression with QueryErrorsBase`; `allowedTypes = Seq(StringType, BinaryType, ArrayType)`; result type is the merged child type. Empty children is allowed and returns the empty string of the result type. + - Spark 4.0.1 (audited 2026-05-27): `allowedTypes` widens `StringType` to `StringTypeWithCollation(supportsTrimCollation = true)`. Error-formatting helper changes from `paramIndex` to `ordinalNumber`. Runtime semantics unchanged for `UTF8_BINARY`. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. + - Known limitation: Comet only supports `StringType` children natively; `BinaryType` and `ArrayType` inputs fall back to Spark (https://github.com/apache/datafusion-comet/issues/4471). Non-default Spark 4.0 string collations are not propagated (https://github.com/apache/datafusion-comet/issues/2190). - [x] reverse + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `Reverse(child) extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant`; `inputTypes = Seq(TypeCollection(StringType, ArrayType))`; `dataType = child.dataType`. For string, calls `UTF8String.reverse()`; for array, reverses element order in-place via `GenericArrayData`. + - Spark 4.0.1 (audited 2026-05-27): `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`; `inputTypes` widened to `Seq(TypeCollection(StringTypeWithCollation(supportsTrimCollation = true), ArrayType))`. Semantics unchanged for `UTF8_BINARY`. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. + - Known limitation: `Reverse` on an array containing `BinaryType` elements is reported as `Incompatible` and falls back unless explicitly enabled (https://github.com/apache/datafusion-comet/issues/2763). - [x] size + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `Size(child, legacySizeOfNull) extends UnaryExpression with ExpectsInputTypes`; `inputTypes = Seq(TypeCollection(ArrayType, MapType)) -> IntegerType`. `legacySizeOfNull=true` returns `-1` for NULL input; `false` returns NULL. Comet routes via `CometSize`, which emits a `CaseWhen(isNotNull(child), size_scalar(child), Literal(legacySizeOfNull))`. + - Spark 4.0.1 (audited 2026-05-27): byte-for-byte identical to 3.5.8. + - Spark 4.1.1 (audited 2026-05-27): byte-for-byte identical to 3.5.8. + - Known limitation: `Size` over `MapType` falls back to Spark (https://github.com/apache/datafusion-comet/issues/4472). ### conditional_funcs diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index aec4b19111..efab2e848a 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -225,15 +225,15 @@ object CometRight extends CometExpressionSerde[Right] { } object CometConcat extends CometScalarFunction[Concat]("concat") { - val unsupportedReason = "CONCAT supports only string input parameters" + private val unsupportedReason = "CONCAT supports only string input parameters" - override def getIncompatibleReasons(): Seq[String] = Seq(unsupportedReason) + override def getUnsupportedReasons(): Seq[String] = Seq(unsupportedReason) override def getSupportLevel(expr: Concat): SupportLevel = { if (expr.children.forall(_.dataType == DataTypes.StringType)) { Compatible() } else { - Incompatible(Some(unsupportedReason)) + Unsupported(Some(unsupportedReason)) } } }