Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions docs/source/contributor-guide/spark_expressions_support.md

Large diffs are not rendered by default.

8 changes: 0 additions & 8 deletions spark/src/main/scala/org/apache/comet/CometConf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -745,14 +745,6 @@ object CometConf extends ShimCometConf {
.toSequence
.createWithDefault(Seq("Range,InMemoryTableScan,RDDScan,OneRowRelation"))

val COMET_CASE_CONVERSION_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.caseConversion.enabled")
.category(CATEGORY_EXEC)
.doc("Java uses locale-specific rules when converting strings to upper or lower case and " +
"Rust does not, so we disable upper and lower by default.")
.booleanConf
.createWithDefault(false)

val COMET_PARQUET_UNSIGNED_SMALL_INT_CHECK: ConfigEntry[Boolean] =
conf("spark.comet.scan.unsignedSmallIntSafetyCheck")
.category(CATEGORY_SCAN)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,13 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
classOf[StartsWith] -> CometScalarFunction("starts_with"),
classOf[StringInstr] -> CometScalarFunction("instr"),
classOf[StringRepeat] -> CometStringRepeat,
classOf[StringReplace] -> CometScalarFunction("replace"),
classOf[StringReplace] -> CometStringReplace,
classOf[StringRPad] -> CometStringRPad,
classOf[StringLPad] -> CometStringLPad,
classOf[StringSpace] -> CometScalarFunction("space"),
classOf[StringSplit] -> CometStringSplit,
classOf[StringTranslate] -> CometScalarFunction("translate"),
classOf[StringTrim] -> CometScalarFunction("trim"),
classOf[StringTrimBoth] -> CometScalarFunction("btrim"),
classOf[StringTrimLeft] -> CometScalarFunction("ltrim"),
classOf[StringTrimRight] -> CometScalarFunction("rtrim"),
classOf[Left] -> CometLeft,
Expand Down
239 changes: 125 additions & 114 deletions spark/src/main/scala/org/apache/comet/serde/strings.scala

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ CREATE TABLE test_str_left(s string, n int) USING parquet
statement
INSERT INTO test_str_left VALUES ('hello', 3), ('hello', 0), ('hello', -1), ('hello', 10), ('', 3), (NULL, 3), ('hello', NULL)

query expect_fallback(Substring pos and len must be literals)
query expect_fallback(arguments must be literal values)
SELECT left(s, n) FROM test_str_left

-- column + literal
Expand All @@ -40,7 +40,7 @@ query
SELECT left(s, 10) FROM test_str_left

-- literal + column
query expect_fallback(Substring pos and len must be literals)
query expect_fallback(arguments must be literal values)
SELECT left('hello', n) FROM test_str_left

-- literal + literal
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ CREATE TABLE test_lower(s string) USING parquet
statement
INSERT INTO test_lower VALUES ('HELLO'), ('hello'), ('Hello World'), (''), (NULL), ('123ABC')

query expect_fallback(case conversion)
query expect_fallback(locale and character set)
SELECT lower(s) FROM test_lower

-- literal arguments
query expect_fallback(case conversion)
query expect_fallback(locale and character set)
SELECT lower('HELLO'), lower(''), lower(NULL)
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
-- specific language governing permissions and limitations
-- under the License.

-- Test lower() with case conversion enabled (happy path)
-- Config: spark.comet.caseConversion.enabled=true
-- Test lower() with the standard allowIncompatible opt-in (happy path)
-- Config: spark.comet.expression.Lower.allowIncompatible=true

statement
CREATE TABLE test_lower_enabled(s string) USING parquet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ CREATE TABLE test_lpad(s string, len int, pad string) USING parquet
statement
INSERT INTO test_lpad VALUES ('hi', 5, 'x'), ('hello', 3, 'x'), ('hi', 5, 'xy'), ('', 3, 'a'), (NULL, 5, 'x'), ('hi', 0, 'x'), ('hi', -1, 'x')

query expect_fallback(Only scalar values are supported for the pad argument)
query expect_fallback(Only scalar values are supported for the `pad` argument)
SELECT lpad(s, len, pad) FROM test_lpad

query
Expand All @@ -32,5 +32,5 @@ query
SELECT lpad(s, 5, 'x') FROM test_lpad

-- literal + literal + literal
query expect_fallback(Scalar values are not supported for the str argument)
query expect_fallback(Scalar values are not supported for the `str` argument)
SELECT lpad('hi', 5, 'x'), lpad('hello', 3, 'x'), lpad('', 3, 'a'), lpad(NULL, 5, 'x')
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ INSERT INTO test_str_replace VALUES ('hello world', 'world', 'there'), ('aaa', '
query
SELECT replace(s, search, replace) FROM test_str_replace

query ignore(https://github.com/apache/datafusion-comet/issues/3344)
-- Comet returns 'xhxexlxlxox' where Spark returns 'hello' (short-circuits on empty search).
query expect_fallback(Empty `search`)
SELECT replace('hello', '', 'x')

-- column + literal + literal
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ CREATE TABLE test_rpad(s string, len int, pad string) USING parquet
statement
INSERT INTO test_rpad VALUES ('hi', 5, 'x'), ('hello', 3, 'x'), ('hi', 5, 'xy'), ('', 3, 'a'), (NULL, 5, 'x'), ('hi', 0, 'x'), ('hi', -1, 'x')

query expect_fallback(Only scalar values are supported for the pad argument)
query expect_fallback(Only scalar values are supported for the `pad` argument)
SELECT rpad(s, len, pad) FROM test_rpad

query
Expand All @@ -32,5 +32,5 @@ query
SELECT rpad(s, 5, 'x') FROM test_rpad

-- literal + literal + literal
query expect_fallback(Scalar values are not supported for the str argument)
query expect_fallback(Scalar values are not supported for the `str` argument)
SELECT rpad('hi', 5, 'x'), rpad('hello', 3, 'x'), rpad('', 3, 'a'), rpad(NULL, 5, 'x')
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ CREATE TABLE test_upper(s string) USING parquet
statement
INSERT INTO test_upper VALUES ('hello'), ('HELLO'), ('Hello World'), (''), (NULL), ('123abc')

query expect_fallback(case conversion)
query expect_fallback(locale and character set)
SELECT upper(s) FROM test_upper

-- literal arguments
query expect_fallback(case conversion)
query expect_fallback(locale and character set)
SELECT upper('hello'), upper(''), upper(NULL)
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
-- specific language governing permissions and limitations
-- under the License.

-- Test upper() with case conversion enabled (happy path)
-- Config: spark.comet.caseConversion.enabled=true
-- Test upper() with the standard allowIncompatible opt-in (happy path)
-- Config: spark.comet.expression.Upper.allowIncompatible=true

statement
CREATE TABLE test_upper_enabled(s string) USING parquet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Filter
: +- Exchange
: +- HashAggregate
: +- Project
: +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.]
: +- BroadcastHashJoin [COMET: upper(ca_country#1) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).]
: :- CometNativeColumnarToRow
: : +- CometProject
: : +- CometBroadcastHashJoin
Expand Down Expand Up @@ -51,7 +51,7 @@ Filter
+- Exchange
+- HashAggregate
+- Project
+- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.]
+- BroadcastHashJoin [COMET: upper(ca_country#2) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).]
:- CometNativeColumnarToRow
: +- CometProject
: +- CometBroadcastHashJoin
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Filter
: +- Exchange
: +- HashAggregate
: +- Project
: +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.]
: +- BroadcastHashJoin [COMET: upper(ca_country#1) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).]
: :- CometNativeColumnarToRow
: : +- CometProject
: : +- CometBroadcastHashJoin
Expand Down Expand Up @@ -51,7 +51,7 @@ Filter
+- Exchange
+- HashAggregate
+- Project
+- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.]
+- BroadcastHashJoin [COMET: upper(ca_country#2) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).]
:- CometNativeColumnarToRow
: +- CometProject
: +- CometBroadcastHashJoin
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ CometNativeColumnarToRow
: +- Exchange
: +- HashAggregate
: +- Project
: +- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.]
: +- BroadcastHashJoin [COMET: upper(ca_country#1) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).]
: :- CometNativeColumnarToRow
: : +- CometProject
: : +- CometBroadcastHashJoin
Expand Down Expand Up @@ -54,7 +54,7 @@ CometNativeColumnarToRow
+- Exchange
+- HashAggregate
+- Project
+- BroadcastHashJoin [COMET: Comet is not compatible with Spark for case conversion in locale-specific cases. Set spark.comet.caseConversion.enabled=true to enable it anyway.]
+- BroadcastHashJoin [COMET: upper(ca_country#2) is not fully compatible with Spark (Results can vary depending on locale and character set (https://github.com/apache/datafusion-comet/issues/2190).). To enable it anyway, set spark.comet.expression.Upper.allowIncompatible=true. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html).]
:- CometNativeColumnarToRow
: +- CometProject
: +- CometBroadcastHashJoin
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@ class CometStringExpressionSuite extends CometTestBase {
} else if (isLiteralStr) {
checkSparkAnswerAndFallbackReason(
sql,
"Scalar values are not supported for the str argument")
"Scalar values are not supported for the `str` argument")
} else if (!isLiteralPad) {
checkSparkAnswerAndFallbackReason(
sql,
"Only scalar values are supported for the pad argument")
"Only scalar values are supported for the `pad` argument")
} else {
checkSparkAnswerAndOperator(sql)
}
Expand Down Expand Up @@ -261,7 +261,9 @@ class CometStringExpressionSuite extends CometTestBase {
}

test("Upper and Lower") {
withSQLConf(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") {
withSQLConf(
CometConf.getExprAllowIncompatConfigKey("Upper") -> "true",
CometConf.getExprAllowIncompatConfigKey("Lower") -> "true") {
val table = "names"
withTable(table) {
sql(s"create table $table(id int, name varchar(20)) using parquet")
Expand Down Expand Up @@ -339,7 +341,7 @@ class CometStringExpressionSuite extends CometTestBase {
}

test("trim") {
withSQLConf(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true") {
withSQLConf(CometConf.getExprAllowIncompatConfigKey("Upper") -> "true") {
val table = "test"
withTable(table) {
sql(s"create table $table(col varchar(20)) using parquet")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,10 @@ object CometStringExpressionBenchmark extends CometBenchmarkBase {
dir,
spark.sql(s"SELECT REPEAT(CAST(value AS STRING), 10) AS c1 FROM $tbl"))

val extraConfigs = Map(CometConf.COMET_CASE_CONVERSION_ENABLED.key -> "true")
val extraConfigs = Map(
CometConf.getExprAllowIncompatConfigKey("Upper") -> "true",
CometConf.getExprAllowIncompatConfigKey("Lower") -> "true",
CometConf.getExprAllowIncompatConfigKey("InitCap") -> "true")

stringExpressions.foreach { config =>
val allConfigs = extraConfigs ++ config.extraCometConfigs
Expand Down
Loading