From f07b070b86e3387c64d850ab632a1f9d3eba5883 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 27 May 2026 16:41:59 -0600 Subject: [PATCH] chore(audit): audit json expressions across Spark 3.4.3, 3.5.8, 4.0.1, 4.1.1 Add per-version audit sub-bullets to `get_json_object` in `docs/source/contributor-guide/spark_expressions_support.md`. Spark 3.4.3 and 3.5.8 use a `BinaryExpression with CodegenFallback` with inline Jackson-based eval; Spark 4.0 extracts the eval into a `GetJsonObjectEvaluator` helper and widens `inputTypes` to `StringTypeWithCollation` (`DefaultStringProducingExpression` trait added). 4.1 is identical to 4.0. Apply the one support-level consistency fix surfaced by the audit: - `CometGetJsonObject`: extract the duplicate single-quote / control-character incompatibility reason into a shared `private val` so the doc generator and the EXPLAIN dispatcher cannot drift. No new tracking issues filed. The known incompatibilities (single-quoted JSON, unescaped control characters) are already declared via `getSupportLevel` and `getIncompatibleReasons`. Spark 4.0 collation propagation is covered by the umbrella #2190. --- .../contributor-guide/spark_expressions_support.md | 5 +++++ .../main/scala/org/apache/comet/serde/strings.scala | 11 +++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 8f82847bb6..ce1b4ba0e1 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -326,6 +326,11 @@ - [ ] from_json - [x] get_json_object + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `BinaryExpression with ExpectsInputTypes with CodegenFallback`; `inputTypes = Seq(StringType, StringType) -> StringType`. Eval is inline and uses Jackson with `RawStyle` output. 
Foldable paths are parsed once. Returns NULL for invalid JSON, for missing paths, or when a `JsonProcessingException` is thrown. + - Spark 4.0.1 (audited 2026-05-27): the eval is extracted into a `GetJsonObjectEvaluator` helper (no behaviour change). The trait set now mixes in `DefaultStringProducingExpression`, and `inputTypes` is widened to `StringTypeWithCollation(supportsTrimCollation = true)` for both arguments. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. + - Known incompatibility: Spark accepts single-quoted JSON and unescaped control characters; Comet's native parser (built on `serde_json`) rejects both. The expression is therefore always reported as `Incompatible` and requires `spark.comet.expression.GetJsonObject.allowIncompatible=true`; even with that flag set, such inputs may produce results that differ from Spark. Non-default Spark 4.0 string collations are not propagated (https://github.com/apache/datafusion-comet/issues/2190). - [ ] json_array_length - [ ] json_object_keys - [ ] json_tuple diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index aec4b19111..bdd43a783f 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -443,15 +443,14 @@ object CometStringSplit extends CometExpressionSerde[StringSplit] { object CometGetJsonObject extends CometExpressionSerde[GetJsonObject] { - override def getIncompatibleReasons(): Seq[String] = Seq( + private val incompatReason = "Spark allows single-quoted JSON and unescaped control characters which Comet does not" + - " support") + " support" + + override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) override def getSupportLevel(expr: GetJsonObject): SupportLevel = - Incompatible( - Some( - "Spark allows single-quoted JSON and unescaped control characters " + - "which Comet does not support")) + Incompatible(Some(incompatReason)) override def convert( expr: GetJsonObject,