diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 4c69dc49b2..fa160ed536 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -423,7 +423,8 @@ - [x] datediff - [x] datepart - [x] day -- [ ] dayname +- [x] dayname + - Spark 4.0+. Implemented natively: maps a `DateType` value to a fixed US-English abbreviated day name (`DayOfWeek.getDisplayName(TextStyle.SHORT, Locale.US)`), with no session-locale or timezone dependence. - [x] dayofmonth - [x] dayofweek - [x] dayofyear @@ -447,7 +448,8 @@ - [ ] make_ym_interval - [x] minute - [x] month -- [ ] monthname +- [x] monthname + - Spark 4.0+. Implemented natively: maps a `DateType` value to a fixed US-English abbreviated month name (`Month.getDisplayName(TextStyle.SHORT, Locale.US)`), with no session-locale or timezone dependence. - [x] months_between - [x] next_day - [ ] now diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs index 784e6c6829..61e86b3a5e 100644 --- a/native/spark-expr/src/comet_scalar_funcs.rs +++ b/native/spark-expr/src/comet_scalar_funcs.rs @@ -22,12 +22,12 @@ use crate::math_funcs::checked_arithmetic::{checked_add, checked_div, checked_mu use crate::math_funcs::log::spark_log; use crate::math_funcs::modulo_expr::spark_modulo; use crate::{ - spark_ceil, spark_decimal_div, spark_decimal_integral_div, spark_floor, spark_isnan, - spark_lpad, spark_make_decimal, spark_read_side_padding, spark_round, spark_rpad, - spark_to_time, spark_unhex, spark_unscaled_value, EvalMode, SparkArrayCompact, - SparkArrayPositionFunc, SparkArraySlice, SparkArraysOverlap, SparkContains, SparkDateDiff, - SparkDateFromUnixDate, SparkDateTrunc, SparkMakeDate, SparkMakeTime, SparkSecondsToTimestamp, - SparkSizeFunc, + spark_ceil, spark_day_name, spark_decimal_div, spark_decimal_integral_div, spark_floor, + 
spark_isnan, spark_lpad, spark_make_decimal, spark_month_name, spark_read_side_padding, + spark_round, spark_rpad, spark_to_time, spark_unhex, spark_unscaled_value, EvalMode, + SparkArrayCompact, SparkArrayPositionFunc, SparkArraySlice, SparkArraysOverlap, SparkContains, + SparkDateDiff, SparkDateFromUnixDate, SparkDateTrunc, SparkMakeDate, SparkMakeTime, + SparkSecondsToTimestamp, SparkSizeFunc, }; use arrow::datatypes::DataType; use datafusion::common::{DataFusionError, Result as DataFusionResult}; @@ -116,6 +116,14 @@ pub fn create_comet_physical_fun_with_eval_mode( let func = Arc::new(spark_read_side_padding); make_comet_scalar_udf!("read_side_padding", func, without data_type) } + "dayname" => { + let func = Arc::new(spark_day_name); + make_comet_scalar_udf!("dayname", func, without data_type) + } + "monthname" => { + let func = Arc::new(spark_month_name); + make_comet_scalar_udf!("monthname", func, without data_type) + } "rpad" => { let func = Arc::new(spark_rpad); make_comet_scalar_udf!("rpad", func, without data_type) diff --git a/native/spark-expr/src/datetime_funcs/day_month_name.rs b/native/spark-expr/src/datetime_funcs/day_month_name.rs new file mode 100644 index 0000000000..589b1d13ca --- /dev/null +++ b/native/spark-expr/src/datetime_funcs/day_month_name.rs @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Spark-compatible `dayname` and `monthname` (Spark 4.0+). +//! +//! Spark's `DayName` / `MonthName` call `DateTimeUtils.getDayName` / `getMonthName`, which map a +//! `DateType` value through `DayOfWeek` / `Month` `.getDisplayName(TextStyle.SHORT, Locale.US)`. +//! `DateFormatter.defaultLocale` is the constant `Locale.US`, so the output is a fixed set of +//! abbreviated English names, independent of the session locale or timezone. We reproduce that +//! exactly with the lookup tables below, computing the weekday / month from the Date32 value. + +use arrow::array::{Array, Date32Array, StringArray}; +use arrow::temporal_conversions::date32_to_datetime; +use chrono::Datelike; +use datafusion::common::{DataFusionError, Result, ScalarValue}; +use datafusion::logical_expr::ColumnarValue; +use std::sync::Arc; + +// `DayOfWeek.getDisplayName(TextStyle.SHORT, Locale.US)`, indexed Monday-first to match +// `chrono::Weekday::num_days_from_monday` (0 = Monday) and Spark's `DayOfWeek.ordinal()`. +const DAY_NAMES: [&str; 7] = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]; + +// `Month.getDisplayName(TextStyle.SHORT, Locale.US)`, indexed by `month0` (0 = January). 
+const MONTH_NAMES: [&str; 12] = [
+    "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+];
+
+fn day_name(days: i32) -> Option<&'static str> {
+    date32_to_datetime(days)
+        .map(|dt| DAY_NAMES[dt.date().weekday().num_days_from_monday() as usize])
+}
+
+fn month_name(days: i32) -> Option<&'static str> {
+    date32_to_datetime(days).map(|dt| MONTH_NAMES[dt.date().month0() as usize])
+}
+
+/// Spark-compatible `dayname(date)`.
+pub fn spark_day_name(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    day_month_name(args, "dayname", day_name)
+}
+
+/// Spark-compatible `monthname(date)`.
+pub fn spark_month_name(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    day_month_name(args, "monthname", month_name)
+}
+
+fn day_month_name(
+    args: &[ColumnarValue],
+    name: &str,
+    f: fn(i32) -> Option<&'static str>,
+) -> Result<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(DataFusionError::Execution(format!(
+            "{name} expects exactly one argument, got {}",
+            args.len()
+        )));
+    }
+    match &args[0] {
+        ColumnarValue::Array(array) => {
+            let dates = array
+                .as_any()
+                .downcast_ref::<Date32Array>()
+                .ok_or_else(|| {
+                    DataFusionError::Execution(format!(
+                        "{name} expects a Date32 argument, got {:?}",
+                        array.data_type()
+                    ))
+                })?;
+            let result: StringArray = dates.iter().map(|d| d.and_then(f)).collect();
+            Ok(ColumnarValue::Array(Arc::new(result)))
+        }
+        ColumnarValue::Scalar(ScalarValue::Date32(days)) => Ok(ColumnarValue::Scalar(
+            ScalarValue::Utf8(days.and_then(f).map(|s| s.to_string())),
+        )),
+        ColumnarValue::Scalar(ScalarValue::Null) => {
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+        }
+        other => Err(DataFusionError::Execution(format!(
+            "{name} expects a Date32 argument, got {other:?}"
+        ))),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // 2024-01-15 is a Monday; 2024-06-30 is a Sunday; 2020-12-31 is a Thursday.
+    // Days since epoch: 2024-01-15 = 19737, 2024-06-30 = 19904, 2020-12-31 = 18627.
+ #[test] + fn day_names() { + assert_eq!(day_name(19737), Some("Mon")); + assert_eq!(day_name(19904), Some("Sun")); + assert_eq!(day_name(18627), Some("Thu")); + assert_eq!(day_name(0), Some("Thu")); // 1970-01-01 is a Thursday + assert_eq!(day_name(-1), Some("Wed")); // 1969-12-31 + } + + #[test] + fn month_names() { + assert_eq!(month_name(19737), Some("Jan")); + assert_eq!(month_name(19904), Some("Jun")); + assert_eq!(month_name(18627), Some("Dec")); + assert_eq!(month_name(0), Some("Jan")); + } +} diff --git a/native/spark-expr/src/datetime_funcs/mod.rs b/native/spark-expr/src/datetime_funcs/mod.rs index 53a1f185ff..b62ef057df 100644 --- a/native/spark-expr/src/datetime_funcs/mod.rs +++ b/native/spark-expr/src/datetime_funcs/mod.rs @@ -18,6 +18,7 @@ mod date_diff; mod date_from_unix_date; mod date_trunc; +mod day_month_name; mod extract_date_part; mod hours; mod make_date; @@ -30,6 +31,7 @@ mod unix_timestamp; pub use date_diff::SparkDateDiff; pub use date_from_unix_date::SparkDateFromUnixDate; pub use date_trunc::SparkDateTrunc; +pub use day_month_name::{spark_day_name, spark_month_name}; pub use extract_date_part::SparkHour; pub use extract_date_part::SparkMinute; pub use extract_date_part::SparkSecond; diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs index d27b15b75d..cb793026d6 100644 --- a/native/spark-expr/src/lib.rs +++ b/native/spark-expr/src/lib.rs @@ -76,9 +76,9 @@ pub use comet_scalar_funcs::{ }; pub use csv_funcs::*; pub use datetime_funcs::{ - spark_to_time, SparkDateDiff, SparkDateFromUnixDate, SparkDateTrunc, SparkHour, - SparkHoursTransform, SparkMakeDate, SparkMakeTime, SparkMinute, SparkSecond, - SparkSecondsToTimestamp, SparkUnixTimestamp, TimestampTruncExpr, + spark_day_name, spark_month_name, spark_to_time, SparkDateDiff, SparkDateFromUnixDate, + SparkDateTrunc, SparkHour, SparkHoursTransform, SparkMakeDate, SparkMakeTime, SparkMinute, + SparkSecond, SparkSecondsToTimestamp, SparkUnixTimestamp, TimestampTruncExpr, 
}; pub use error::{decimal_overflow_error, SparkError, SparkErrorWithContext, SparkResult}; pub use hash_funcs::*; diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index 85a8e9b292..62a4795504 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -38,7 +38,7 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithFa /** * `CometExprShim` acts as a shim for parsing expressions from different Spark versions. */ -trait CometExprShim extends CommonStringExprs { +trait CometExprShim extends CommonStringExprs with CometExprShim4x { protected def evalMode(c: Cast): CometEvalMode.Value = CometEvalModeUtil.fromSparkEvalMode(c.evalMode) @@ -183,6 +183,11 @@ trait CometExprShim extends CommonStringExprs { optExprWithFallbackReason(mapSortExpr, ms, ms.child) } + // dayname / monthname (Spark 4.0+) are shared across all 4.x minor versions; see + // CometExprShim4x.convertDayMonthName. + case _: DayName | _: MonthName => + convertDayMonthName(expr, inputs, binding) + case _ => None } } diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index 1e31360eb0..0303f28e3a 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -39,7 +39,7 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithFa /** * `CometExprShim` acts as a shim for parsing expressions from different Spark versions. 
*/ -trait CometExprShim extends CommonStringExprs { +trait CometExprShim extends CommonStringExprs with CometExprShim4x { protected def evalMode(c: Cast): CometEvalMode.Value = CometEvalModeUtil.fromSparkEvalMode(c.evalMode) @@ -214,6 +214,11 @@ trait CometExprShim extends CommonStringExprs { optExprWithFallbackReason(mapSortExpr, ms, ms.child) } + // dayname / monthname (Spark 4.0+) are shared across all 4.x minor versions; see + // CometExprShim4x.convertDayMonthName. + case _: DayName | _: MonthName => + convertDayMonthName(expr, inputs, binding) + case _ => None } } diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala index 1e31360eb0..0303f28e3a 100644 --- a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala @@ -39,7 +39,7 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithFa /** * `CometExprShim` acts as a shim for parsing expressions from different Spark versions. */ -trait CometExprShim extends CommonStringExprs { +trait CometExprShim extends CommonStringExprs with CometExprShim4x { protected def evalMode(c: Cast): CometEvalMode.Value = CometEvalModeUtil.fromSparkEvalMode(c.evalMode) @@ -214,6 +214,11 @@ trait CometExprShim extends CommonStringExprs { optExprWithFallbackReason(mapSortExpr, ms, ms.child) } + // dayname / monthname (Spark 4.0+) are shared across all 4.x minor versions; see + // CometExprShim4x.convertDayMonthName. 
+ case _: DayName | _: MonthName => + convertDayMonthName(expr, inputs, binding) + case _ => None } } diff --git a/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprShim4x.scala b/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprShim4x.scala new file mode 100644 index 0000000000..f2a1e32a6f --- /dev/null +++ b/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprShim4x.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.shims + +import org.apache.spark.sql.catalyst.expressions.{Attribute, DayName, Expression, MonthName} + +import org.apache.comet.serde.ExprOuterClass.Expr +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithFallbackReason, scalarFunctionExprToProtoWithReturnType} + +/** + * Expression conversions shared across all Spark 4.x minor versions, compiled from the + * `spark-4.x` source root. Spark 4.0+ expression classes (e.g. `DayName` / `MonthName`) do not + * exist in 3.x, so they cannot live in the version-agnostic serde, but they are identical across + * 4.0 / 4.1 / 4.2 and belong here rather than being duplicated in each per-minor-version + * `CometExprShim`. 
+ */ +trait CometExprShim4x { + + /** + * `dayname` / `monthname` (Spark 4.0+) map a `DateType` value to a fixed US-English abbreviated + * name. Spark's `DateTimeUtils.getDayName` / `getMonthName` use `DayOfWeek` / `Month` + * `getDisplayName(TextStyle.SHORT, Locale.US)` (`DateFormatter.defaultLocale` is the constant + * `Locale.US`), so there is no session-locale or timezone dependence and they map directly to + * the native `dayname` / `monthname` scalar functions. + */ + protected def convertDayMonthName( + expr: Expression, + inputs: Seq[Attribute], + binding: Boolean): Option[Expr] = expr match { + case d: DayName => + val childExpr = exprToProtoInternal(d.child, inputs, binding) + val nameExpr = + scalarFunctionExprToProtoWithReturnType("dayname", d.dataType, false, childExpr) + optExprWithFallbackReason(nameExpr, d, d.child) + case m: MonthName => + val childExpr = exprToProtoInternal(m.child, inputs, binding) + val nameExpr = + scalarFunctionExprToProtoWithReturnType("monthname", m.dataType, false, childExpr) + optExprWithFallbackReason(nameExpr, m, m.child) + case _ => None + } +} diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/dayname.sql b/spark/src/test/resources/sql-tests/expressions/datetime/dayname.sql new file mode 100644 index 0000000000..822186f983 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/dayname.sql @@ -0,0 +1,39 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- dayname (Spark 4.0+) is implemented natively. It maps a date to a fixed US-English abbreviated +-- day name (DayOfWeek.getDisplayName(TextStyle.SHORT, Locale.US)), with no session-locale or +-- timezone dependence. +-- MinSparkVersion: 4.0 + +statement +CREATE TABLE test_dayname(d date) USING parquet + +statement +INSERT INTO test_dayname VALUES + (date('2024-01-15')), + (date('2024-06-30')), + (date('2020-12-31')), + (date('1970-01-01')), + (NULL) + +query +SELECT dayname(d) FROM test_dayname + +-- literal argument +query +SELECT dayname(date('2024-02-29')) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/monthname.sql b/spark/src/test/resources/sql-tests/expressions/datetime/monthname.sql new file mode 100644 index 0000000000..e8d6562735 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/monthname.sql @@ -0,0 +1,39 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- monthname (Spark 4.0+) is implemented natively. It maps a date to a fixed US-English +-- abbreviated month name (Month.getDisplayName(TextStyle.SHORT, Locale.US)), with no +-- session-locale or timezone dependence. +-- MinSparkVersion: 4.0 + +statement +CREATE TABLE test_monthname(d date) USING parquet + +statement +INSERT INTO test_monthname VALUES + (date('2024-01-15')), + (date('2024-06-30')), + (date('2020-12-31')), + (date('1970-01-01')), + (NULL) + +query +SELECT monthname(d) FROM test_monthname + +-- literal argument +query +SELECT monthname(date('2024-02-29'))