diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 8f82847bb6..1facb653a8 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -523,40 +523,107 @@ ### string_funcs - [x] ascii + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringType -> IntegerType`; `nullSafeEval` returns `codePointAt(0)` of the first char, or `0` for the empty string. Wired via `CometScalarFunction("ascii")` and resolved to DataFusion `ascii` (`chars().next() as i32`); first-code-point semantics match for ASCII, BMP, and supplementary code points. + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; behaviour unchanged for `UTF8_BINARY`. Comet does not propagate collation, so non-default collations may diverge silently (https://github.com/apache/datafusion-comet/issues/4496). - [ ] base64 - [x] bit_length + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `(StringType|BinaryType) -> IntegerType`; eval returns `numBytes * 8` for strings and `.length * 8` for binary. + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + - Known limitation: wired as a raw `CometScalarFunction("bit_length")` with no `BinaryType` guard. DataFusion's `BitLengthFunc` signature only accepts string types, so `bit_length(<binary>)` execute-fails on the native side instead of falling back cleanly (https://github.com/apache/datafusion-comet/issues/4464). - [x] btrim + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline.
`StringTrimBoth` is `RuntimeReplaceable` and rewritten to `StringTrim(srcStr, trimStr)` before serde runs. Support is provided by the `trim` entry; no dedicated serde registration. + - Spark 4.0.1 (audited 2026-05-27): `StringTrim` (the rewrite target) routes through `CollationSupport.StringTrim.exec` and uses `StringTypeNonCSAICollation(supportsTrimCollation = true)`; semantics unchanged for `UTF8_BINARY`. Non-default collations may diverge in Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] char + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `Chr(LongType) -> StringType`; `lon < 0` returns `""`, else `((lon & 0xFF) as char).toString` (so `chr(256)` and `chr(0)` both return `\u0000`). + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`. Resolves natively to `datafusion_spark::function::string::char::CharFunc`, which mirrors Spark's negative-input and `& 0xFF` semantics. - [x] char_length + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): registry alias of `Length`. Same support as `length`. + - Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Length`. - [x] character_length + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): registry alias of `Length`. Same support as `length`. + - Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Length`. - [x] chr + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): registry alias of `Chr`. Same support as `char`. + - Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Chr`. - [ ] collate - [ ] collation - [x] concat_ws + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. 
`Seq[Expression] -> StringType`; NULL separator yields NULL, NULL element values are skipped, children can be `StringType` or `ArrayType(StringType)`. Comet serde rewrites a NULL-literal separator to a NULL of the result type and bails out on all-foldable inputs so Spark's `ConstantFolding` handles them; otherwise delegates to DataFusion `concat_ws`. + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation` / `AbstractArrayType`; `dataType` becomes `children.head.dataType` (collation-derived). Semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] contains + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `UTF8String.contains` on `StringType`; the parser routes `(BinaryType, BinaryType)` to `BinaryPredicate`, so Comet only ever sees the String form. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.Contains.exec(..., collationId)`; behaviour identical for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] decode + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringDecode(bin, charset)` evaluated directly; invalid sequences silently substitute replacement characters via `new String(bytes, charset)`. + - Spark 4.0.1 (audited 2026-05-27): refactored to `RuntimeReplaceable` whose `replacement` is a `StaticInvoke(StringDecode.decode, bin, charset, legacyCharsets, legacyErrorAction)`; the 4-arg form raises on malformed input unless legacy flags are set. + - Known limitations: Comet handles `decode` via `CommonStringExprs.stringDecode` from the version shims (no `CometExpressionSerde[StringDecode]` registration, so the function does not surface in the auto-generated compatibility docs: https://github.com/apache/datafusion-comet/issues/4466). 
Only literal `charset = 'utf-8'` (case-insensitive) is supported; everything else falls back. The Spark 4.0 `legacyCharsets` / `legacyErrorAction` flags are ignored: Comet always lowers to `Cast(bin, StringType, TRY)`, so invalid UTF-8 yields NULL where Spark 3.x substitutes replacement characters and Spark 4.0 (non-legacy) raises (https://github.com/apache/datafusion-comet/issues/4465). - [ ] elt - [ ] encode - [x] endswith + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `UTF8String.endsWith` on `StringType`; binary form routed to `BinaryPredicate` before Comet. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.EndsWith.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [ ] find_in_set - [ ] format_number - [ ] format_string - [x] initcap + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `string.toLowerCase.toTitleCase` on `UTF8String`; word boundary is Java `Character.isWhitespace`. Comet routes to DataFusion `initcap`, which splits on `!is_alphanumeric()` (hyphens, apostrophes, and punctuation all split words), so Comet is unconditionally `Incompatible` (https://github.com/apache/datafusion-comet/issues/1052). + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.InitCap.exec` (collation- and ICU-aware) and propagates `child.dataType`. Comet ignores collation; 3.x divergences persist plus collation/ICU mismatches (https://github.com/apache/datafusion-comet/issues/4496). - [x] instr + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringInstr(str, substr) -> IntegerType`; returns `string.indexOf(sub, 0) + 1` (1-based, 0 when not found, 1 on empty substring). Resolves to DataFusion `strpos` (alias `instr`) with matching semantics. 
+ - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringInstr.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [ ] is_valid_utf8 - [x] lcase + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): registry alias of `Lower`. Same support as `lower`. + - Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Lower`. - [x] left + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `RuntimeReplaceable` with `replacement = Substring(str, Literal(1), len)`; accepts `StringType` or `BinaryType` plus `IntegerType`. Comet serde rewrites to a `Substring` proto with `start=1, len=lenValue`. `getSupportLevel` declares `Unsupported` for non-literal `len` so the dispatcher falls back uniformly. + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened with `StringTypeWithCollation`; behaviour unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] len + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): registry alias of `Length`. Same support as `length`. + - Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Length`. - [x] length + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `(StringType|BinaryType) -> IntegerType`; eval returns `numChars` for strings and `.length` for binary. `BinaryType` input falls back via `Unsupported` (DataFusion's `character_length` accepts string types only). + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`; semantics unchanged. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). 
- [ ] levenshtein - [ ] locate - [x] lower + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. JVM default-locale `toLowerCase` on `UTF8String`. Comet routes to DataFusion `lower` (Rust Unicode default case mapping, no locale awareness) and is unconditionally `Incompatible`; users opt in via the standard `spark.comet.expression.Lower.allowIncompatible=true`. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.Lower.exec(v, collationId, useICU)` with `SQLConf.ICU_CASE_MAPPINGS_ENABLED`; `inputTypes` widened to `StringTypeWithCollation`. Comet ignores collation and ICU mode, so non-default collations or `ICU_CASE_MAPPINGS_ENABLED=true` diverge even after opting in (https://github.com/apache/datafusion-comet/issues/2190). - [x] lpad + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringLPad(str, len, pad) -> StringType`; `len <= 0` returns the empty string, empty `pad` returns `str` unchanged, NULL inputs propagate. Comet serde requires `str` to be a column and `pad` to be a literal; otherwise falls back. + - Spark 4.0.1 (audited 2026-05-27): `NullIntolerant` trait replaced by `override def nullIntolerant: Boolean = true`; `inputTypes` widened to `StringTypeWithCollation(supportsTrimCollation = true)`. Semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + - Known limitation: `lpad(<binary>, ...)` is rewritten by Spark to `BinaryPad / StaticInvoke(ByteArray.lpad)` before serde runs and always falls back to Spark. - [x] ltrim + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringTrimLeft` extends `String2TrimExpression`; no-arg form strips ASCII space `0x20` only.
The two-arg parser form `ltrim(trimStr, srcStr)` is swapped to `(srcStr, Option(trimStr))` by Spark's secondary constructor, so children match DataFusion `ltrim(str, chars)`. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTrimLeft.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [ ] luhn_check - [ ] make_valid_utf8 - [ ] mask - [x] octet_length + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `(StringType|BinaryType) -> IntegerType`; eval returns `numBytes` for strings and `.length` for binary. + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened to `StringTypeWithCollation`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + - Known limitation: wired as a raw `CometScalarFunction("octet_length")` with no `BinaryType` guard. DataFusion's `OctetLengthFunc` signature only accepts string types, so `octet_length(<binary>)` execute-fails on the native side instead of falling back cleanly (https://github.com/apache/datafusion-comet/issues/4464). - [ ] overlay - [ ] position - [ ] printf @@ -566,33 +633,84 @@ - [ ] regexp_extract_all - [ ] regexp_instr - [x] regexp_replace + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `RegExpReplace(subject, regexp, rep, pos)` with foldable `pos > 0`; uses Java `Pattern`. Comet supports only `pos = 1` (other offsets fall back) and injects a `'g'` flag because DataFusion's `regexp_replace` stops at the first match by default. + - Spark 4.0.1 (audited 2026-05-27): adds raw-string literal support at the parser level and `nullIntolerant: Boolean = true`; runtime semantics unchanged.
+ - Known limitation: regex semantics differ (Rust `regex` crate vs Java `Pattern`); `RegExp.isSupportedPattern` currently returns `false` for every pattern, so the path always requires `spark.comet.expression.regexp.allowIncompatible=true`. - [ ] regexp_substr - [x] repeat + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringRepeat(str, times)` with `nullSafeEval(s, n) = s.repeat(n)`; `UTF8String.repeat` returns the empty string for `n <= 0`. Comet casts `times` to `LongType` and delegates to DataFusion `repeat`, which mirrors Spark for negative counts. + - Spark 4.0.1 (audited 2026-05-27): adds `nullIntolerant: Boolean` field; `dataType` becomes `str.dataType` (collation-tracking). Semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] replace + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringReplace(src, search, replace)`; when `search` is empty, Spark returns `src` unchanged (short-circuit on `search.numBytes == 0`). DataFusion `replace` instead inserts `replace` between every character, so `CometStringReplace.getSupportLevel` marks `Incompatible(Some(reason))` when `search` is a literal empty string and falls back to Spark by default (https://github.com/apache/datafusion-comet/issues/4497). + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringReplace.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] right + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `RuntimeReplaceable` with `replacement = If(IsNull(str), null, If(len <= 0, "", Substring(str, -len, len)))`; accepts `StringType` plus `IntegerType`. 
Comet serde rewrites positive `len` to a `Substring` proto with `start=-len, len=len`; for `len <= 0` it builds an `If(IsNull(str), null, "")` proto chain to preserve NULL propagation. `getSupportLevel` declares `Unsupported` for non-literal `len` so the dispatcher falls back uniformly. + - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened with collation; uses `UnaryMinus(len, failOnError = false)` to avoid integer-overflow exceptions on `len = Int.MinValue`. Semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] rpad + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringRPad(str, len, pad) -> StringType`; same edge-case behaviour as `lpad` (negative len, empty pad, NULL propagation). Comet serde requires column `str` and literal `pad`. + - Spark 4.0.1 (audited 2026-05-27): same evolution as `lpad`; default-pad literal type tightened; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + - Known limitation: same `BinaryPad / StaticInvoke` rewrite as `lpad` causes `rpad(<binary>, ...)` to fall back. - [x] rtrim + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringTrimRight` extends `String2TrimExpression`; semantically symmetric to `ltrim`. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTrimRight.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [ ] sentences - [ ] soundex - [x] space + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringSpace(IntegerType) -> StringType`; negative input yields the empty string.
Resolves natively to `datafusion_spark::function::string::space::SparkSpace`. + - Spark 4.0.1 (audited 2026-05-27): semantics unchanged; `NullIntolerant` trait replaced by `nullIntolerant: Boolean` override. - [x] split + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringSplit(str, regex, limit)`; `limit > 0` permits at most `limit-1` splits, `limit <= 0` is unlimited. Comet registers `split` as a custom UDF (`native/spark-expr/src/string_funcs/split.rs`) using the Rust `regex` crate, and is unconditionally `Incompatible` due to regex-engine differences. + - Spark 4.0.1 (audited 2026-05-27): wraps the regex via `CollationSupport.collationAwareRegex` and changes `dataType` to `ArrayType(str.dataType, ...)`. Comet does not honour collation flags (https://github.com/apache/datafusion-comet/issues/4496). - [ ] split_part - [x] startswith + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `UTF8String.startsWith` on `StringType`; binary form routed to `BinaryPredicate`. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StartsWith.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] substr + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): registry alias of `Substring`. Same support as `substring`. + - Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Substring`. - [x] substring + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `TernaryExpression`; two-arg form defaults `len = Integer.MAX_VALUE`; supports `StringType` and `BinaryType`. Comet serializes to a dedicated `Substring` proto. `getSupportLevel` declares `Unsupported` when either `pos` or `len` is not a `Literal` so the dispatcher falls back uniformly. 
+ - Spark 4.0.1 (audited 2026-05-27): `inputTypes` widened with `StringTypeWithCollation`; semantics unchanged for `UTF8_BINARY`. Native `SubstringExpr` implements Spark's negative-start clamping and is exercised against ASCII, multibyte UTF-8, emoji, decomposed and Telugu inputs. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [x] substring_index + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `TernaryExpression(StringType, StringType, IntegerType) -> StringType`. Comet casts `count` to `LongType` and delegates to DataFusion's `substr_index` UDF (alias `substring_index`). + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.SubstringIndex.exec` and propagates `strExpr.dataType`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [ ] to_binary - [ ] to_char - [ ] to_number - [ ] to_varchar - [x] translate + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. `StringTranslate(src, from, to)`; `UTF8String.translate(dict)` is code-point based, and any character mapped explicitly to U+0000 in `to` is also deleted. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTranslate.exec`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). + - Known divergence: DataFusion's `translate` is grapheme-based (Spark uses code points), and does not delete characters mapped to U+0000 in `to`. Currently the support level is `Compatible` (https://github.com/apache/datafusion-comet/issues/4463). - [x] trim + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. 
`StringTrim` no-arg form strips ASCII space `0x20` only (matches DataFusion `btrim`'s default); two-arg form's children are `(srcStr, trimStr)` after Spark's secondary-constructor swap. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.StringTrim.exec` and uses `StringTypeNonCSAICollation`; semantics unchanged for `UTF8_BINARY`. Non-default collations not honoured by Comet (https://github.com/apache/datafusion-comet/issues/4496). - [ ] try_to_binary - [ ] try_to_number - [ ] try_validate_utf8 - [x] ucase + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): registry alias of `Upper`. Same support as `upper`. + - Spark 4.0.1 (audited 2026-05-27): unchanged alias of `Upper`. - [ ] unbase64 - [x] upper + - Spark 3.4.3 (audited 2026-05-27): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-27): baseline. JVM default-locale `toUpperCase` on `UTF8String`. Comet routes to DataFusion `upper` (Rust Unicode default case mapping, no locale awareness) and is unconditionally `Incompatible`; users opt in via the standard `spark.comet.expression.Upper.allowIncompatible=true`. + - Spark 4.0.1 (audited 2026-05-27): routes through `CollationSupport.Upper.exec(v, collationId, useICU)` with `SQLConf.ICU_CASE_MAPPINGS_ENABLED`. Comet does not propagate collation or ICU mode; non-default collations or `ICU_CASE_MAPPINGS_ENABLED=true` diverge even after opting in (https://github.com/apache/datafusion-comet/issues/2190). 
- [ ] validate_utf8 ### struct_funcs diff --git a/spark/src/main/scala/org/apache/comet/CometConf.scala b/spark/src/main/scala/org/apache/comet/CometConf.scala index 82700a939e..32e868ce1b 100644 --- a/spark/src/main/scala/org/apache/comet/CometConf.scala +++ b/spark/src/main/scala/org/apache/comet/CometConf.scala @@ -745,14 +745,6 @@ object CometConf extends ShimCometConf { .toSequence .createWithDefault(Seq("Range,InMemoryTableScan,RDDScan,OneRowRelation")) - val COMET_CASE_CONVERSION_ENABLED: ConfigEntry[Boolean] = - conf("spark.comet.caseConversion.enabled") - .category(CATEGORY_EXEC) - .doc("Java uses locale-specific rules when converting strings to upper or lower case and " + - "Rust does not, so we disable upper and lower by default.") - .booleanConf - .createWithDefault(false) - val COMET_PARQUET_UNSIGNED_SMALL_INT_CHECK: ConfigEntry[Boolean] = conf("spark.comet.scan.unsignedSmallIntSafetyCheck") .category(CATEGORY_SCAN) diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index b818b61b1b..555a92db18 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -191,14 +191,13 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[StartsWith] -> CometScalarFunction("starts_with"), classOf[StringInstr] -> CometScalarFunction("instr"), classOf[StringRepeat] -> CometStringRepeat, - classOf[StringReplace] -> CometScalarFunction("replace"), + classOf[StringReplace] -> CometStringReplace, classOf[StringRPad] -> CometStringRPad, classOf[StringLPad] -> CometStringLPad, classOf[StringSpace] -> CometScalarFunction("space"), classOf[StringSplit] -> CometStringSplit, classOf[StringTranslate] -> CometScalarFunction("translate"), classOf[StringTrim] -> CometScalarFunction("trim"), - classOf[StringTrimBoth] -> CometScalarFunction("btrim"), 
classOf[StringTrimLeft] -> CometScalarFunction("ltrim"), classOf[StringTrimRight] -> CometScalarFunction("rtrim"), classOf[Left] -> CometLeft, diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index aec4b19111..57963cfac3 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, ConcatWs, Expression, GetJsonObject, If, InitCap, IsNull, Left, Length, Like, Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, StringRPad, StringSplit, Substring, SubstringIndex, Upper} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, ConcatWs, Expression, GetJsonObject, If, InitCap, IsNull, Left, Length, Like, Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, Substring, SubstringIndex, Upper} import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -33,10 +33,6 @@ import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInter object CometStringRepeat extends CometExpressionSerde[StringRepeat] { - override def getCompatibleNotes(): Seq[String] = Seq( - "A negative argument for the number of times to repeat throws an exception" + - " instead of returning an empty string as Spark does") - override def convert( expr: StringRepeat, inputs: Seq[Attribute], @@ -54,21 +50,13 @@ object CometStringRepeat extends CometExpressionSerde[StringRepeat] { class CometCaseConversionBase[T <: Expression](function: String) extends CometScalarFunction[T](function) { - override def getIncompatibleReasons(): Seq[String] = Seq( - "Results can vary depending on locale and character set." 
+ - s" Requires `${CometConf.COMET_CASE_CONVERSION_ENABLED.key}=true` to enable.") + private val incompatReason = + "Results can vary depending on locale and character set " + + "(https://github.com/apache/datafusion-comet/issues/2190)." - override def convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { - if (!CometConf.COMET_CASE_CONVERSION_ENABLED.get()) { - withInfo( - expr, - "Comet is not compatible with Spark for case conversion in " + - s"locale-specific cases. Set ${CometConf.COMET_CASE_CONVERSION_ENABLED.key}=true " + - "to enable it anyway.") - return None - } - super.convert(expr, inputs, binding) - } + override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) + + override def getSupportLevel(expr: T): SupportLevel = Incompatible(Some(incompatReason)) } object CometUpper extends CometCaseConversionBase[Upper]("upper") @@ -76,54 +64,54 @@ object CometUpper extends CometCaseConversionBase[Upper]("upper") object CometLower extends CometCaseConversionBase[Lower]("lower") object CometLength extends CometScalarFunction[Length]("length") { - override def getUnsupportedReasons(): Seq[String] = Seq("`BinaryType` input is not supported") + private val binaryUnsupportedReason = "`BinaryType` input is not supported" + + override def getUnsupportedReasons(): Seq[String] = Seq(binaryUnsupportedReason) override def getSupportLevel(expr: Length): SupportLevel = expr.child.dataType match { - case _: BinaryType => Unsupported(Some("Length on BinaryType is not supported")) + case _: BinaryType => Unsupported(Some(binaryUnsupportedReason)) case _ => Compatible() } } object CometInitCap extends CometScalarFunction[InitCap]("initcap") { - override def getIncompatibleReasons(): Seq[String] = Seq( + private val incompatReason = "Treats hyphen as a word separator (e.g. 
`robert rose-smith` produces `Robert Rose-Smith`" + " instead of Spark's `Robert Rose-smith`)" + - " (https://github.com/apache/datafusion-comet/issues/1052)") + " (https://github.com/apache/datafusion-comet/issues/1052)" - override def getSupportLevel(expr: InitCap): SupportLevel = { - // Behavior differs from Spark. One example is that for the input "robert rose-smith", Spark - // will produce "Robert Rose-smith", but Comet will produce "Robert Rose-Smith". - // https://github.com/apache/datafusion-comet/issues/1052 - Incompatible(None) - } + override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) - override def convert(expr: InitCap, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { - super.convert(expr, inputs, binding) - } + override def getSupportLevel(expr: InitCap): SupportLevel = Incompatible(Some(incompatReason)) } object CometSubstring extends CometExpressionSerde[Substring] { + private val literalArgsReason = "`pos` and `len` arguments must be literal values" + + override def getUnsupportedReasons(): Seq[String] = Seq(literalArgsReason) + + override def getSupportLevel(expr: Substring): SupportLevel = (expr.pos, expr.len) match { + case (_: Literal, _: Literal) => Compatible() + case _ => Unsupported(Some(literalArgsReason)) + } + override def convert( expr: Substring, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { - (expr.pos, expr.len) match { - case (Literal(pos, _), Literal(len, _)) => - exprToProtoInternal(expr.str, inputs, binding) match { - case Some(strExpr) => - val builder = ExprOuterClass.Substring.newBuilder() - builder.setChild(strExpr) - builder.setStart(pos.asInstanceOf[Int]) - builder.setLen(len.asInstanceOf[Int]) - Some(ExprOuterClass.Expr.newBuilder().setSubstring(builder).build()) - case None => - withInfo(expr, expr.str) - None - } - case _ => - withInfo(expr, "Substring pos and len must be literals") + val Literal(pos, _) = expr.pos + val Literal(len, _) = expr.len + 
exprToProtoInternal(expr.str, inputs, binding) match { + case Some(strExpr) => + val builder = ExprOuterClass.Substring.newBuilder() + builder.setChild(strExpr) + builder.setStart(pos.asInstanceOf[Int]) + builder.setLen(len.asInstanceOf[Int]) + Some(ExprOuterClass.Expr.newBuilder().setSubstring(builder).build()) + case None => + withInfo(expr, expr.str) None } } @@ -147,33 +135,35 @@ object CometSubstringIndex extends CometExpressionSerde[SubstringIndex] { object CometLeft extends CometExpressionSerde[Left] { - override def getUnsupportedReasons(): Seq[String] = Seq( - "Only supports `BinaryType` and `StringType` input", - "The length argument must be a literal value") + private val literalLenReason = "The `length` argument must be a literal value" + private val unsupportedDataTypeReason = "Only supports `BinaryType` and `StringType` input" + + override def getUnsupportedReasons(): Seq[String] = + Seq(unsupportedDataTypeReason, literalLenReason) override def convert(expr: Left, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { - expr.len match { - case Literal(lenValue, _) => - exprToProtoInternal(expr.str, inputs, binding) match { - case Some(strExpr) => - val builder = ExprOuterClass.Substring.newBuilder() - builder.setChild(strExpr) - builder.setStart(1) - builder.setLen(lenValue.asInstanceOf[Int]) - Some(ExprOuterClass.Expr.newBuilder().setSubstring(builder).build()) - case None => - withInfo(expr, expr.str) - None - } - case _ => - withInfo(expr, "LEFT len must be a literal") + val Literal(lenValue, _) = expr.len + exprToProtoInternal(expr.str, inputs, binding) match { + case Some(strExpr) => + val builder = ExprOuterClass.Substring.newBuilder() + builder.setChild(strExpr) + builder.setStart(1) + builder.setLen(lenValue.asInstanceOf[Int]) + Some(ExprOuterClass.Expr.newBuilder().setSubstring(builder).build()) + case None => + withInfo(expr, expr.str) None } } override def getSupportLevel(expr: Left): SupportLevel = { expr.str.dataType match { - case 
_: BinaryType | _: StringType => Compatible() + case _: BinaryType | _: StringType => + if (!expr.len.isInstanceOf[Literal]) { + Unsupported(Some(literalLenReason)) + } else { + Compatible() + } case _ => Unsupported(Some(s"LEFT does not support ${expr.str.dataType}")) } } @@ -181,49 +171,66 @@ object CometLeft extends CometExpressionSerde[Left] { object CometRight extends CometExpressionSerde[Right] { + private val literalLenReason = "The `length` argument must be a literal value" + private val unsupportedDataTypeReason = "Only supports `StringType` input" + override def convert(expr: Right, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { - expr.len match { - case Literal(lenValue, _) => - val lenInt = lenValue.asInstanceOf[Int] - if (lenInt <= 0) { - // Match Spark's behavior: If(IsNull(str), NULL, "") - // This ensures NULL propagation: RIGHT(NULL, 0) -> NULL, RIGHT("hello", 0) -> "" - val isNullExpr = IsNull(expr.str) - val nullLiteral = Literal.create(null, StringType) - val emptyStringLiteral = Literal(UTF8String.EMPTY_UTF8, StringType) - val ifExpr = If(isNullExpr, nullLiteral, emptyStringLiteral) - - // Serialize the If expression using existing infrastructure - exprToProtoInternal(ifExpr, inputs, binding) - } else { - exprToProtoInternal(expr.str, inputs, binding) match { - case Some(strExpr) => - val builder = ExprOuterClass.Substring.newBuilder() - builder.setChild(strExpr) - builder.setStart(-lenInt) - builder.setLen(lenInt) - Some(ExprOuterClass.Expr.newBuilder().setSubstring(builder).build()) - case None => - withInfo(expr, expr.str) - None - } - } - case _ => - withInfo(expr, "RIGHT len must be a literal") - None + val Literal(lenValue, _) = expr.len + val lenInt = lenValue.asInstanceOf[Int] + if (lenInt <= 0) { + // Match Spark's behavior: If(IsNull(str), NULL, "") + // This ensures NULL propagation: RIGHT(NULL, 0) -> NULL, RIGHT("hello", 0) -> "" + val isNullExpr = IsNull(expr.str) + val nullLiteral = Literal.create(null, StringType) + 
val emptyStringLiteral = Literal(UTF8String.EMPTY_UTF8, StringType) + val ifExpr = If(isNullExpr, nullLiteral, emptyStringLiteral) + exprToProtoInternal(ifExpr, inputs, binding) + } else { + exprToProtoInternal(expr.str, inputs, binding) match { + case Some(strExpr) => + val builder = ExprOuterClass.Substring.newBuilder() + builder.setChild(strExpr) + builder.setStart(-lenInt) + builder.setLen(lenInt) + Some(ExprOuterClass.Expr.newBuilder().setSubstring(builder).build()) + case None => + withInfo(expr, expr.str) + None + } } } - override def getUnsupportedReasons(): Seq[String] = Seq("Only supports `StringType` input") + override def getUnsupportedReasons(): Seq[String] = + Seq(unsupportedDataTypeReason, literalLenReason) override def getSupportLevel(expr: Right): SupportLevel = { expr.str.dataType match { - case _: StringType => Compatible() + case _: StringType => + if (!expr.len.isInstanceOf[Literal]) { + Unsupported(Some(literalLenReason)) + } else { + Compatible() + } case _ => Unsupported(Some(s"RIGHT does not support ${expr.str.dataType}")) } } } +object CometStringReplace extends CometScalarFunction[StringReplace]("replace") { + + private val emptySearchReason = + "Empty `search` string produces different output: Spark returns `str` unchanged, " + + "DataFusion inserts `replace` between every character " + + "(https://github.com/apache/datafusion-comet/issues/4497)." 
+ + override def getIncompatibleReasons(): Seq[String] = Seq(emptySearchReason) + + override def getSupportLevel(expr: StringReplace): SupportLevel = expr.searchExpr match { + case Literal(s: UTF8String, _) if s.numBytes == 0 => Incompatible(Some(emptySearchReason)) + case _ => Compatible() + } +} + object CometConcat extends CometScalarFunction[Concat]("concat") { val unsupportedReason = "CONCAT supports only string input parameters" @@ -309,18 +316,22 @@ object CometRLike extends CometExpressionSerde[RLike] { } } +private object PadReasons { + val literalStrReason = "Scalar values are not supported for the `str` argument." + val nonLiteralPadReason = "Only scalar values are supported for the `pad` argument." +} + object CometStringRPad extends CometExpressionSerde[StringRPad] { - override def getUnsupportedReasons(): Seq[String] = Seq( - "Scalar values are not supported for the `str` argument." + - " Only scalar values are supported for the `pad` argument.") + override def getUnsupportedReasons(): Seq[String] = + Seq(PadReasons.literalStrReason, PadReasons.nonLiteralPadReason) override def getSupportLevel(expr: StringRPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { - return Unsupported(Some("Scalar values are not supported for the str argument")) + return Unsupported(Some(PadReasons.literalStrReason)) } if (!expr.pad.isInstanceOf[Literal]) { - return Unsupported(Some("Only scalar values are supported for the pad argument")) + return Unsupported(Some(PadReasons.nonLiteralPadReason)) } Compatible() } @@ -340,16 +351,15 @@ object CometStringRPad extends CometExpressionSerde[StringRPad] { object CometStringLPad extends CometExpressionSerde[StringLPad] { - override def getUnsupportedReasons(): Seq[String] = Seq( - "Scalar values are not supported for the `str` argument." 
+ - " Only scalar values are supported for the `pad` argument.") + override def getUnsupportedReasons(): Seq[String] = + Seq(PadReasons.literalStrReason, PadReasons.nonLiteralPadReason) override def getSupportLevel(expr: StringLPad): SupportLevel = { if (expr.str.isInstanceOf[Literal]) { - return Unsupported(Some("Scalar values are not supported for the str argument")) + return Unsupported(Some(PadReasons.literalStrReason)) } if (!expr.pad.isInstanceOf[Literal]) { - return Unsupported(Some("Only scalar values are supported for the pad argument")) + return Unsupported(Some(PadReasons.nonLiteralPadReason)) } Compatible() } @@ -367,11 +377,13 @@ object CometStringLPad extends CometExpressionSerde[StringLPad] { } object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { - override def getIncompatibleReasons(): Seq[String] = Seq( - "Regexp pattern may not be compatible with Spark") + private val incompatReason = "Regexp pattern may not be compatible with Spark" + private val offsetUnsupportedReason = + "Only supports `regexp_replace` with an offset of 1 (no offset)" - override def getUnsupportedReasons(): Seq[String] = Seq( - "Only supports `regexp_replace` with an offset of 1 (no offset)") + override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason) + + override def getUnsupportedReasons(): Seq[String] = Seq(offsetUnsupportedReason) override def getSupportLevel(expr: RegExpReplace): SupportLevel = { if (!RegExp.isSupportedPattern(expr.regexp.toString) && @@ -381,12 +393,11 @@ object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { s"Regexp pattern ${expr.regexp} is not compatible with Spark. 
" + s"Set ${CometConf.getExprAllowIncompatConfigKey("regexp")}=true " + "to allow it anyway.") - return Incompatible() + return Incompatible(Some(incompatReason)) } expr.pos match { case Literal(value, DataTypes.IntegerType) if value == 1 => Compatible() - case _ => - Unsupported(Some("Comet only supports regexp_replace with an offset of 1 (no offset).")) + case _ => Unsupported(Some(offsetUnsupportedReason)) } } diff --git a/spark/src/test/resources/sql-tests/expressions/string/left.sql b/spark/src/test/resources/sql-tests/expressions/string/left.sql index 7c05ecac35..1bedbe5c82 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/left.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/left.sql @@ -21,7 +21,7 @@ CREATE TABLE test_str_left(s string, n int) USING parquet statement INSERT INTO test_str_left VALUES ('hello', 3), ('hello', 0), ('hello', -1), ('hello', 10), ('', 3), (NULL, 3), ('hello', NULL) -query expect_fallback(Substring pos and len must be literals) +query expect_fallback(arguments must be literal values) SELECT left(s, n) FROM test_str_left -- column + literal @@ -40,7 +40,7 @@ query SELECT left(s, 10) FROM test_str_left -- literal + column -query expect_fallback(Substring pos and len must be literals) +query expect_fallback(arguments must be literal values) SELECT left('hello', n) FROM test_str_left -- literal + literal diff --git a/spark/src/test/resources/sql-tests/expressions/string/lower.sql b/spark/src/test/resources/sql-tests/expressions/string/lower.sql index 7233251820..8f6e74c713 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/lower.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/lower.sql @@ -21,9 +21,9 @@ CREATE TABLE test_lower(s string) USING parquet statement INSERT INTO test_lower VALUES ('HELLO'), ('hello'), ('Hello World'), (''), (NULL), ('123ABC') -query expect_fallback(case conversion) +query expect_fallback(locale and character set) SELECT lower(s) FROM test_lower 
-- literal arguments -query expect_fallback(case conversion) +query expect_fallback(locale and character set) SELECT lower('HELLO'), lower(''), lower(NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql b/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql index 0461fce735..10c188b441 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/lower_enabled.sql @@ -15,8 +15,8 @@ -- specific language governing permissions and limitations -- under the License. --- Test lower() with case conversion enabled (happy path) --- Config: spark.comet.caseConversion.enabled=true +-- Test lower() with the standard allowIncompatible opt-in (happy path) +-- Config: spark.comet.expression.Lower.allowIncompatible=true statement CREATE TABLE test_lower_enabled(s string) USING parquet diff --git a/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql b/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql index 83d0ceaebb..c27d93de62 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/string_lpad.sql @@ -21,7 +21,7 @@ CREATE TABLE test_lpad(s string, len int, pad string) USING parquet statement INSERT INTO test_lpad VALUES ('hi', 5, 'x'), ('hello', 3, 'x'), ('hi', 5, 'xy'), ('', 3, 'a'), (NULL, 5, 'x'), ('hi', 0, 'x'), ('hi', -1, 'x') -query expect_fallback(Only scalar values are supported for the pad argument) +query expect_fallback(Only scalar values are supported for the `pad` argument) SELECT lpad(s, len, pad) FROM test_lpad query @@ -32,5 +32,5 @@ query SELECT lpad(s, 5, 'x') FROM test_lpad -- literal + literal + literal -query expect_fallback(Scalar values are not supported for the str argument) +query expect_fallback(Scalar values are not supported for the `str` argument) SELECT lpad('hi', 5, 'x'), lpad('hello', 3, 
'x'), lpad('', 3, 'a'), lpad(NULL, 5, 'x') diff --git a/spark/src/test/resources/sql-tests/expressions/string/string_replace.sql b/spark/src/test/resources/sql-tests/expressions/string/string_replace.sql index c9dddebd63..8a605e24f0 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/string_replace.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/string_replace.sql @@ -24,7 +24,8 @@ INSERT INTO test_str_replace VALUES ('hello world', 'world', 'there'), ('aaa', ' query SELECT replace(s, search, replace) FROM test_str_replace -query ignore(https://github.com/apache/datafusion-comet/issues/3344) +-- Comet returns 'xhxexlxlxox' where Spark returns 'hello' (short-circuits on empty search). +query expect_fallback(Empty `search`) SELECT replace('hello', '', 'x') -- column + literal + literal diff --git a/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql b/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql index 48d3fb6cec..4ea06c3b23 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/string_rpad.sql @@ -21,7 +21,7 @@ CREATE TABLE test_rpad(s string, len int, pad string) USING parquet statement INSERT INTO test_rpad VALUES ('hi', 5, 'x'), ('hello', 3, 'x'), ('hi', 5, 'xy'), ('', 3, 'a'), (NULL, 5, 'x'), ('hi', 0, 'x'), ('hi', -1, 'x') -query expect_fallback(Only scalar values are supported for the pad argument) +query expect_fallback(Only scalar values are supported for the `pad` argument) SELECT rpad(s, len, pad) FROM test_rpad query @@ -32,5 +32,5 @@ query SELECT rpad(s, 5, 'x') FROM test_rpad -- literal + literal + literal -query expect_fallback(Scalar values are not supported for the str argument) +query expect_fallback(Scalar values are not supported for the `str` argument) SELECT rpad('hi', 5, 'x'), rpad('hello', 3, 'x'), rpad('', 3, 'a'), rpad(NULL, 5, 'x') diff --git 
a/spark/src/test/resources/sql-tests/expressions/string/upper.sql b/spark/src/test/resources/sql-tests/expressions/string/upper.sql index 6a15efe59b..74b864a98a 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/upper.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/upper.sql @@ -21,9 +21,9 @@ CREATE TABLE test_upper(s string) USING parquet statement INSERT INTO test_upper VALUES ('hello'), ('HELLO'), ('Hello World'), (''), (NULL), ('123abc') -query expect_fallback(case conversion) +query expect_fallback(locale and character set) SELECT upper(s) FROM test_upper -- literal arguments -query expect_fallback(case conversion) +query expect_fallback(locale and character set) SELECT upper('hello'), upper(''), upper(NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql b/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql index 95ad265229..e1035ab37f 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/upper_enabled.sql @@ -15,8 +15,8 @@ -- specific language governing permissions and limitations -- under the License. 
--- Test upper() with case conversion enabled (happy path) --- Config: spark.comet.caseConversion.enabled=true +-- Test upper() with the standard allowIncompatible opt-in (happy path) +-- Config: spark.comet.expression.Upper.allowIncompatible=true statement CREATE TABLE test_upper_enabled(s string) USING parquet diff --git a/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24a/extended.txt b/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24a/extended.txt index 4fd8252b21..02aa90cb10 100644 --- a/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24a/extended.txt +++ b/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24a/extended.txt @@ -7,7 +7,7 @@ Filter : +- Exchange : +- HashAggregate : +- Project -: +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.] +: +- BroadcastHashJoin [COMET: upper(ca_country#1) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).] : :- CometNativeColumnarToRow : : +- CometProject : : +- CometBroadcastHashJoin @@ -51,7 +51,7 @@ Filter +- Exchange +- HashAggregate +- Project - +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.] + +- BroadcastHashJoin [COMET: upper(ca_country#2) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. 
For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).] :- CometNativeColumnarToRow : +- CometProject : +- CometBroadcastHashJoin diff --git a/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24b/extended.txt b/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24b/extended.txt index 4fd8252b21..02aa90cb10 100644 --- a/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24b/extended.txt +++ b/spark/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q24b/extended.txt @@ -7,7 +7,7 @@ Filter : +- Exchange : +- HashAggregate : +- Project -: +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.] +: +- BroadcastHashJoin [COMET: upper(ca_country#1) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).] : :- CometNativeColumnarToRow : : +- CometProject : : +- CometBroadcastHashJoin @@ -51,7 +51,7 @@ Filter +- Exchange +- HashAggregate +- Project - +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.] + +- BroadcastHashJoin [COMET: upper(ca_country#2) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).] 
:- CometNativeColumnarToRow : +- CometProject : +- CometBroadcastHashJoin diff --git a/spark/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q24/extended.txt b/spark/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q24/extended.txt index a94d9e453a..a90d0856a8 100644 --- a/spark/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q24/extended.txt +++ b/spark/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q24/extended.txt @@ -10,7 +10,7 @@ CometNativeColumnarToRow : +- Exchange : +- HashAggregate : +- Project - : +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.] + : +- BroadcastHashJoin [COMET: upper(ca_country#1) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).] : :- CometNativeColumnarToRow : : +- CometProject : : +- CometBroadcastHashJoin @@ -54,7 +54,7 @@ CometNativeColumnarToRow +- Exchange +- HashAggregate +- Project - +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.] + +- BroadcastHashJoin [COMET: upper(ca_country#2) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).] 
:- CometNativeColumnarToRow : +- CometProject : +- CometBroadcastHashJoin diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index 89d5dfd4bc..b5b14ef4a4 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -89,11 +89,11 @@ class CometStringExpressionSuite extends CometTestBase { } else if (isLiteralStr) { checkSparkAnswerAndFallbackReason( sql, - "Scalar values are not supported for the str argument") + "Scalar values are not supported for the `str` argument") } else if (!isLiteralPad) { checkSparkAnswerAndFallbackReason( sql, - "Only scalar values are supported for the pad argument") + "Only scalar values are supported for the `pad` argument") } else { checkSparkAnswerAndOperator(sql) } @@ -261,7 +261,9 @@ class CometStringExpressionSuite extends CometTestBase { } test("Upper and Lower") { - withSQLConf(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") { + withSQLConf( + CometConf.getExprAllowIncompatConfigKey("Upper") -> "true", + CometConf.getExprAllowIncompatConfigKey("Lower") -> "true") { val table = "names" withTable(table) { sql(s"create table $table(id int, name varchar(20)) using parquet") @@ -339,7 +341,7 @@ class CometStringExpressionSuite extends CometTestBase { } test("trim") { - withSQLConf(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") { + withSQLConf(CometConf.getExprAllowIncompatConfigKey("Upper") -> "true") { val table = "test" withTable(table) { sql(s"create table $table(col varchar(20)) using parquet") diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala index d7be505161..653ca6fd0b 100644 --- 
a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometStringExpressionBenchmark.scala @@ -86,7 +86,10 @@ object CometStringExpressionBenchmark extends CometBenchmarkBase { dir, spark.sql(s"SELECT REPEAT(CAST(value AS STRING), 10) AS c1 FROM $tbl")) - val extraConfigs = Map(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") + val extraConfigs = Map( + CometConf.getExprAllowIncompatConfigKey("Upper") -> "true", + CometConf.getExprAllowIncompatConfigKey("Lower") -> "true", + CometConf.getExprAllowIncompatConfigKey("InitCap") -> "true") stringExpressions.foreach { config => val allConfigs = extraConfigs ++ config.extraCometConfigs