diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 8f82847bb6..e0175ed83c 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -350,15 +350,48 @@ ### map_funcs - [x] element_at + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `ElementAt(left, right, defaultValueOutOfBound, failOnError) extends GetMapValueUtil`; the parser routes `element_at(, ...)` to one overload and `element_at(, ...)` to another. Comet `CometElementAt` only supports `ArrayType` input; `MapType` input falls back. + - Spark 4.0.1 (audited 2026-05-27): adds `nullIntolerant: Boolean` field; semantics unchanged. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. - [ ] map - [ ] map_concat - [x] map_contains_key + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `MapContainsKey(left, right) extends RuntimeReplaceable with InheritAnalysisRules`; the analyzer rewrites to `ArrayContains(MapKeys(left), right)`. Comet routes via `CometMapContainsKey` which emits the equivalent `array_has(map_keys(map), key)`. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; minor trait refactors. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. - [x] map_entries + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `MapEntries(child)` returns an array of structs ``. Wired to native `map_entries`. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. - [x] map_from_arrays + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `MapFromArrays(left, right) extends BinaryExpression with NullIntolerant`; Spark uses `ArrayBasedMapBuilder` to detect duplicate keys (subject to `spark.sql.mapKeyDedupPolicy`) and rejects null keys with `RuntimeException("Cannot use null as map key")`. Comet `CometMapFromArrays` wraps the inputs in `CaseWhen(IsNotNull(left) AND IsNotNull(right), map(left, right), null)` so NULL-array inputs return NULL rather than triggering the previously reported native crash (#3327). + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `nullIntolerant: Boolean`. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. - [x] map_from_entries + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `MapFromEntries(child) extends UnaryExpression with NullIntolerant`; expects an array of structs and produces a map. Wired as `CometScalarFunction("map_from_entries")`. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; trait refactor. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. + - Known limitation: input arrays where the struct's key or value type contains `BinaryType` are marked `Incompatible` and fall back unless `spark.comet.expression.MapFromEntries.allowIncompatible=true`. - [x] map_keys + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `MapKeys(child)` returns the map's keys as an array. Wired to native `map_keys`. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. - [x] map_values + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `MapValues(child)` returns the map's values as an array. Wired to native `map_values`. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged. + - Spark 4.1.1 (audited 2026-05-27): identical to 4.0.1. - [x] str_to_map + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringToMap(text, pairDelim, keyValueDelim) extends TernaryExpression`; splits `text` on `pairDelim`, then each pair on `keyValueDelim` (default `","` and `":"`). Uses `ArrayBasedMapBuilder` for duplicate-key handling. Wired as `CometScalarFunction("str_to_map")`. + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeNonCSAICollation`; uses `CollationAwareUTF8String.splitSQL` with a `collationId`. Runtime unchanged for `UTF8_BINARY`. + - Spark 4.1.1 (audited 2026-05-27): adds the `legacySplitTruncate` flag (driven by `spark.sql.legacy.truncateForEmptyRegexSplit`) to both `splitSQL` calls (https://github.com/apache/datafusion-comet/issues/4477). The Comet native impl does not honour this flag; behaviour matches the non-legacy default. - [ ] try_element_at ### math_funcs diff --git a/spark/src/main/scala/org/apache/comet/serde/maps.scala b/spark/src/main/scala/org/apache/comet/serde/maps.scala index 4852ece50c..01c100b5e7 100644 --- a/spark/src/main/scala/org/apache/comet/serde/maps.scala +++ b/spark/src/main/scala/org/apache/comet/serde/maps.scala @@ -134,8 +134,10 @@ object CometMapContainsKey extends CometExpressionSerde[MapContainsKey] { } object CometMapFromEntries extends CometScalarFunction[MapFromEntries]("map_from_entries") { - val keyUnsupportedReason = "Using BinaryType as Map keys is not allowed in map_from_entries" - val valueUnsupportedReason = "Using BinaryType as Map values is not allowed in map_from_entries" + val keyUnsupportedReason = + "`BinaryType` is not supported as a map key in `map_from_entries`" + val valueUnsupportedReason = + "`BinaryType` is not supported as a map value in `map_from_entries`" override def getIncompatibleReasons(): Seq[String] = Seq(keyUnsupportedReason, valueUnsupportedReason) @@ -160,9 +162,4 @@ object CometMapFromEntries extends CometScalarFunction[MapFromEntries]("map_from } } -object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map") { - - override def getSupportLevel(expr: StringToMap): SupportLevel = { - Compatible(None) - } -} +object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map") diff --git a/spark/src/test/resources/sql-tests/expressions/map/map_from_entries.sql b/spark/src/test/resources/sql-tests/expressions/map/map_from_entries.sql index 0beffcc292..156e98df69 100644 --- a/spark/src/test/resources/sql-tests/expressions/map/map_from_entries.sql +++ b/spark/src/test/resources/sql-tests/expressions/map/map_from_entries.sql @@ -24,10 +24,10 @@ INSERT INTO test_map_from_entries VALUES (array(struct('a', 1), struct('b', 2), query SELECT map_from_entries(entries) FROM test_map_from_entries -query expect_fallback(Using BinaryType as Map keys is not allowed in map_from_entries) +query expect_fallback(is not supported as a map key in `map_from_entries`) SELECT map_from_entries(array(struct(cast('x' as binary), 10))) -query expect_fallback(Using BinaryType as Map values is not allowed in map_from_entries) +query expect_fallback(is not supported as a map value in `map_from_entries`) SELECT map_from_entries(array(struct(10, cast('x' as binary)))) -- literal arguments