From 477818c06e1febc1f459849af7b38b041c1bd9a0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 27 May 2026 16:36:10 -0600 Subject: [PATCH] chore(audit): audit struct expressions across Spark 3.4.3, 3.5.8, 4.0.1, 4.1.1 Add per-version audit sub-bullets to `named_struct` and `struct` in `docs/source/contributor-guide/spark_expressions_support.md`. The Spark `CreateNamedStruct` class is byte-for-byte identical for behaviour across all four versions; only internal optimizer flags (`stateful` on 4.0, `contextIndependentFoldable` on 4.1) were added. Both `named_struct` and `struct` SQL functions lower to the same `CreateNamedStruct` node, so Comet handles them through one serde. Apply the one support-level consistency fix surfaced by the audit: - `CometCreateNamedStruct`: lift the duplicate-field-names fallback out of `convert` and into `getSupportLevel`, so the dispatcher handles the fallback uniformly and `getUnsupportedReasons()` documents the restriction for the compatibility guide. No correctness divergences were found, so no tracking issues are filed for this category. --- .../spark_expressions_support.md | 9 +++++++++ .../org/apache/comet/serde/structs.scala | 19 ++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 8f82847bb6..b6fcd2f6d1 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -598,7 +598,16 @@ ### struct_funcs - [x] named_struct + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `CreateNamedStruct(children)` with `NoThrow`; `children` is `Seq(name1, val1, name2, val2, ...)`. Names must be foldable `StringType` and distinct; non-null. `dataType` is the `StructType` of the resulting fields. Comet routes via `CometCreateNamedStruct`, builds a `CreateNamedStruct` proto, and the native `CreateNamedStruct` expression (in `native/spark-expr/src/struct_funcs/create_named_struct.rs`) emits a `StructArray`. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; an `override def stateful: Boolean = true` flag is added for the optimizer. Parser-level Alias unwrapping (`case (a: Alias, _) => Seq(Literal(a.name), a)`) is transparent to Comet. + - Spark 4.1.1 (audited 2026-05-27): semantics unchanged; adds `override def contextIndependentFoldable: Boolean = children.forall(_.contextIndependentFoldable)` for the new constant-folding pass; no impact on Comet conversion. + - Comet limitation: a `CreateNamedStruct` whose names contain duplicates falls back to Spark. (Spark's analyzer also tolerates duplicates at the column-name level, but the proto layer would lose them, so the fallback is the safe choice.) - [x] struct + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): the SQL function `struct(a, b, c)` is built by `CreateStruct.create`, which lowers to a `CreateNamedStruct` with synthetic field names `col1`, `col2`, etc. Comet handles both `struct` and `named_struct` via the same `CometCreateNamedStruct` path; the synthetic-name and explicit-name cases behave identically. + - Spark 4.0.1 (audited 2026-05-27): same lowering; identical to 3.5.8. + - Spark 4.1.1 (audited 2026-05-27): same lowering; identical to 3.5.8. ### url_funcs diff --git a/spark/src/main/scala/org/apache/comet/serde/structs.scala b/spark/src/main/scala/org/apache/comet/serde/structs.scala index 9ef00272ec..3f0f184263 100644 --- a/spark/src/main/scala/org/apache/comet/serde/structs.scala +++ b/spark/src/main/scala/org/apache/comet/serde/structs.scala @@ -31,15 +31,24 @@ import org.apache.comet.DataTypeSupport import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, serializeDataType} object CometCreateNamedStruct extends CometExpressionSerde[CreateNamedStruct] { + + private val duplicateNamesReason = + "`CreateNamedStruct` with duplicate field names is not supported" + + override def getUnsupportedReasons(): Seq[String] = Seq(duplicateNamesReason) + + override def getSupportLevel(expr: CreateNamedStruct): SupportLevel = { + if (expr.names.length != expr.names.distinct.length) { + Unsupported(Some(duplicateNamesReason)) + } else { + Compatible() + } + } + override def convert( expr: CreateNamedStruct, inputs: Seq[Attribute], binding: Boolean): Option[ExprOuterClass.Expr] = { - if (expr.names.length != expr.names.distinct.length) { - withInfo(expr, "CreateNamedStruct with duplicate field names are not supported") - return None - } - val valExprs = expr.valExprs.map(exprToProtoInternal(_, inputs, binding)) if (valExprs.forall(_.isDefined)) {