From c8967f1bdb935f4bd0262e14d9af582f22d2ea66 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 12:23:31 +0400 Subject: [PATCH 01/10] feat: spark function `quote` --- datafusion/spark/src/function/string/mod.rs | 8 ++ datafusion/spark/src/function/string/quote.rs | 118 ++++++++++++++++++ .../test_files/spark/string/quote.slt | 81 ++++++++++++ 3 files changed, 207 insertions(+) create mode 100644 datafusion/spark/src/function/string/quote.rs create mode 100644 datafusion/sqllogictest/test_files/spark/string/quote.slt diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 64d603cb8bb67..9c90ded5f7e1b 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -27,6 +27,7 @@ pub mod length; pub mod like; pub mod luhn_check; pub mod make_valid_utf8; +pub mod quote; pub mod soundex; pub mod space; pub mod substring; @@ -51,6 +52,7 @@ make_udf_function!(base64::SparkUnBase64, unbase64); make_udf_function!(soundex::SparkSoundex, soundex); make_udf_function!(make_valid_utf8::SparkMakeValidUtf8, make_valid_utf8); make_udf_function!(is_valid_utf8::SparkIsValidUtf8, is_valid_utf8); +make_udf_function!(quote::SparkQuote, quote); pub mod expr_fn { use datafusion_functions::export_functions; @@ -127,6 +129,11 @@ pub mod expr_fn { "Returns the original string if str is a valid UTF-8 string, otherwise returns a new string whose invalid UTF8 byte sequences are replaced using the UNICODE replacement character U+FFFD.", str )); + export_functions!(( + quote, + "Returns str enclosed by single quotes and each instance of single quote in it is preceded by a backslash", + str + )); } pub fn functions() -> Vec> { @@ -147,5 +154,6 @@ pub fn functions() -> Vec> { soundex(), make_valid_utf8(), is_valid_utf8(), + quote(), ] } diff --git a/datafusion/spark/src/function/string/quote.rs b/datafusion/spark/src/function/string/quote.rs new file mode 100644 index 0000000000000..85cb059574c67 --- /dev/null +++ b/datafusion/spark/src/function/string/quote.rs @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray}; +use arrow::datatypes::DataType; +use datafusion::logical_expr::{Coercion, ColumnarValue, Signature, TypeSignatureClass}; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; +use datafusion_common::types::{NativeType, logical_string}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Volatility}; +use datafusion_functions::utils::make_scalar_function; + +use std::sync::Arc; + +// Spark-compatible `quote` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkQuote { + signature: Signature, +} + +impl Default for SparkQuote { + fn default() -> Self { + Self::new() + } +} + +impl SparkQuote { + pub fn new() -> Self { + let str_coercion = Coercion::new_implicit( + TypeSignatureClass::Native(logical_string()), + vec![TypeSignatureClass::Any], + NativeType::String, + ); + Self { + signature: Signature::coercible(vec![str_coercion], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkQuote { + fn name(&self) -> &str { + "quote" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + match &arg_types[0] { + DataType::LargeUtf8 => Ok(DataType::LargeUtf8), + _ => Ok(DataType::Utf8), + } + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_quote_inner, vec![])(&args.args) + } +} + +fn spark_quote_inner(arg: &[ArrayRef]) -> Result { + let [array] = take_function_args("quote", arg)?; + match &array.data_type() { + DataType::Utf8 => quote_array::(array), + DataType::LargeUtf8 => quote_array::(array), + DataType::Utf8View => quote_view(array), + other => { + exec_err!("unsupported data type {other:?} for function `quote`") + } + } +} + +fn quote_array(array: &ArrayRef) -> Result { + let str_array = as_generic_string_array::(array)?; + let result = str_array + .iter() + .map(|s| s.map(compute_quote)) + .collect::(); + Ok(Arc::new(result)) +} + +fn quote_view(str_view: &ArrayRef) -> Result { + let str_array = as_string_view_array(str_view)?; + let result = str_array + .iter() + .map(|opt_str| opt_str.map(compute_quote)) + .collect::(); + Ok(Arc::new(result) as ArrayRef) +} + +fn compute_quote(s: &str) -> String { + let mut quoted = String::with_capacity(s.len() + 2); + quoted.push('\''); + for c in s.chars() { + if c == '\\' || c == '\'' { + quoted.push('\\'); + } + quoted.push(c); + } + quoted.push('\''); + quoted +} diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt new file mode 100644 index 0000000000000..e10077f46b4d5 --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +query T +SELECT quote(1:INT); +---- +1 + +query T +SELECT quote(0:INT); +---- +0 + +query T +SELECT quote(-5:INT); +---- +-5 + +query T +SELECT quote('hello'); +---- +'hello' + +query T +SELECT quote(''); +---- +'' + +query T +SELECT quote('with space'); +---- +'with space' + +query T +SELECT quote('it''s'); +---- +'it''s' + +query T +SELECT quote('line\nbreak'); +---- +'line\nbreak' + +query T +SELECT quote(NULL); +---- +NULL + +query T +SELECT quote(TRUE); +---- +true + +query T +SELECT quote(FALSE); +---- +false + +query T +SELECT quote(3.14:FLOAT); +---- +3.14 + +query T +SELECT quote(-0.001:FLOAT); +---- +-0.001 From c14c45fcc6bba912690893aa402710d159f46403 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 12:46:29 +0400 Subject: [PATCH 02/10] feat: spark function `quote` --- datafusion/spark/src/function/string/quote.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/datafusion/spark/src/function/string/quote.rs b/datafusion/spark/src/function/string/quote.rs index 85cb059574c67..f0f600e6e6a56 100644 --- a/datafusion/spark/src/function/string/quote.rs +++ b/datafusion/spark/src/function/string/quote.rs @@ -104,15 +104,18 @@ fn quote_view(str_view: &ArrayRef) -> Result { Ok(Arc::new(result) as ArrayRef) } +const QUOTE_CHAR: char = '\''; +const ESCAPE_CHAR: char = '\\'; + fn compute_quote(s: &str) -> String { let mut quoted = String::with_capacity(s.len() + 2); - quoted.push('\''); + quoted.push(QUOTE_CHAR); for c in s.chars() { - if c == '\\' || c == '\'' { - quoted.push('\\'); + if c == ESCAPE_CHAR || c == QUOTE_CHAR { + quoted.push(ESCAPE_CHAR); } quoted.push(c); } - quoted.push('\''); + quoted.push(QUOTE_CHAR); quoted } From dcec62839917ecf5d48f6f921bcc9b72fc16c205 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 13:08:57 +0400 Subject: [PATCH 03/10] fix tests --- .../sqllogictest/test_files/spark/string/quote.slt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index e10077f46b4d5..4f3fe1394dd11 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -18,17 +18,17 @@ query T SELECT quote(1:INT); ---- -1 +'1' query T SELECT quote(0:INT); ---- -0 +'0' query T SELECT quote(-5:INT); ---- --5 +'-5' query T SELECT quote('hello'); @@ -63,19 +63,19 @@ NULL query T SELECT quote(TRUE); ---- -true +'true' query T SELECT quote(FALSE); ---- -false +'false' query T SELECT quote(3.14:FLOAT); ---- -3.14 +'3.14' query T SELECT quote(-0.001:FLOAT); ---- --0.001 +'-0.001' From 2d42ef84fcf461ea2e45ebe4a8681de405971ee3 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 13:41:40 +0400 Subject: [PATCH 04/10] fix tests --- datafusion/sqllogictest/test_files/spark/string/quote.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index 4f3fe1394dd11..71fdad16840e8 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -53,7 +53,7 @@ SELECT quote('it''s'); query T SELECT quote('line\nbreak'); ---- -'line\nbreak' +'line\\nbreak' query T SELECT quote(NULL); From ea97c76a75f292cf2d07749a13eb84444a02c55c Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 13:41:53 +0400 Subject: [PATCH 05/10] fix tests --- datafusion/sqllogictest/test_files/spark/string/quote.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index 71fdad16840e8..28084fc9e113f 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -48,7 +48,7 @@ SELECT quote('with space'); query T SELECT quote('it''s'); ---- -'it''s' +'it\'s' query T SELECT quote('line\nbreak'); From bfaa76cb38fa5cb171fb2a52fdc38c03cf12f10a Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 13:42:46 +0400 Subject: [PATCH 06/10] fix tests --- .../sqllogictest/test_files/spark/string/quote.slt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index 28084fc9e113f..86da4a6b8a099 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -69,13 +69,3 @@ query T SELECT quote(FALSE); ---- 'false' - -query T -SELECT quote(3.14:FLOAT); ----- -'3.14' - -query T -SELECT quote(-0.001:FLOAT); ----- -'-0.001' From f51f4f4d515148bdd8ae46a9170f3e71a7ce6735 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 14:07:59 +0400 Subject: [PATCH 07/10] fix tests --- datafusion/spark/src/function/string/quote.rs | 2 +- .../sqllogictest/test_files/spark/string/quote.slt | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/datafusion/spark/src/function/string/quote.rs b/datafusion/spark/src/function/string/quote.rs index f0f600e6e6a56..3025d7e1b1355 100644 --- a/datafusion/spark/src/function/string/quote.rs +++ b/datafusion/spark/src/function/string/quote.rs @@ -27,7 +27,7 @@ use datafusion_functions::utils::make_scalar_function; use std::sync::Arc; -// Spark-compatible `quote` expression +/// Spark-compatible `quote` expression /// #[derive(Debug, PartialEq, Eq, Hash)] pub struct SparkQuote { diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index 86da4a6b8a099..369a28ba0a3e8 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -16,17 +16,12 @@ # under the License. query T -SELECT quote(1:INT); +SELECT quote(arrow_cast(1, 'Int8')); ---- '1' query T -SELECT quote(0:INT); ----- -'0' - -query T -SELECT quote(-5:INT); +SELECT quote(arrow_cast(-5, 'Int8')); ---- '-5' From 5d56981d0e24a5e23cb58d6ea14997e23cadff7a Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 14:54:34 +0400 Subject: [PATCH 08/10] fix tests --- datafusion/spark/src/function/string/quote.rs | 2 +- .../test_files/spark/string/quote.slt | 121 +++++++++++++++--- 2 files changed, 102 insertions(+), 21 deletions(-) diff --git a/datafusion/spark/src/function/string/quote.rs b/datafusion/spark/src/function/string/quote.rs index 3025d7e1b1355..39ad8bf841764 100644 --- a/datafusion/spark/src/function/string/quote.rs +++ b/datafusion/spark/src/function/string/quote.rs @@ -111,7 +111,7 @@ fn compute_quote(s: &str) -> String { let mut quoted = String::with_capacity(s.len() + 2); quoted.push(QUOTE_CHAR); for c in s.chars() { - if c == ESCAPE_CHAR || c == QUOTE_CHAR { + if c == QUOTE_CHAR { quoted.push(ESCAPE_CHAR); } quoted.push(c); diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index 369a28ba0a3e8..6dded2c0a678c 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -16,51 +16,132 @@ # under the License. query T -SELECT quote(arrow_cast(1, 'Int8')); +SELECT quote(arrow_cast(127, 'Int8')); ---- -'1' +'127' query T -SELECT quote(arrow_cast(-5, 'Int8')); +SELECT quote(arrow_cast(-128, 'Int8')); ---- -'-5' +'-128' query T -SELECT quote('hello'); +SELECT quote(arrow_cast(32767, 'Int16')); ---- -'hello' +'32767' query T -SELECT quote(''); +SELECT quote(arrow_cast(-32768, 'Int16')); ---- -'' +'-32768' query T -SELECT quote('with space'); +SELECT quote(arrow_cast(2147483647, 'Int32')); ---- -'with space' +'2147483647' query T -SELECT quote('it''s'); +SELECT quote(arrow_cast(-2147483648, 'Int32')); ---- -'it\'s' +'-2147483648' query T -SELECT quote('line\nbreak'); +SELECT quote(arrow_cast(9223372036854775807, 'Int64')); ---- -'line\\nbreak' +'9223372036854775807' query T -SELECT quote(NULL); +SELECT quote(arrow_cast(-9223372036854775808, 'Int64')); ---- -NULL +'-9223372036854775808' query T -SELECT quote(TRUE); +SELECT quote(arrow_cast(3.14, 'Float32')); ---- -'true' +'3.14' query T -SELECT quote(FALSE); +SELECT quote(arrow_cast(2.718281828459045, 'Float64')); ---- -'false' +'2.718281828459045' + +query T +SELECT quote(arrow_cast(0, 'UInt8')); +---- +'0' + +query T +SELECT quote(arrow_cast(255, 'UInt8')); +---- +'255' + +query T +SELECT quote(arrow_cast(65535, 'UInt16')); +---- +'65535' + +query T +SELECT quote(arrow_cast(4294967295, 'UInt32')); +---- +'4294967295' + +query T +SELECT quote(arrow_cast(18446744073709551615, 'UInt64')); +---- +'18446744073709551615' + +query T +SELECT quote('special chars: !@#$%^&*()'); +---- +'special chars: !@#$%^&*()' + +query T +SELECT quote('tab\tseparated'); +---- +'tab\tseparated' + +query T +SELECT quote('carriage\rreturn'); +---- +'carriage\rreturn' + +query T +SELECT quote('backslash\\test'); +---- +'backslash\\test' + +query T +SELECT quote('quote\"inside\"'); +---- +'quote\"inside\"' + +query T +SELECT quote('mixed\nescape\tchars\r\n'); +---- +'mixed\nescape\tchars\r\n' + +query T +SELECT quote('unicode: 你好, 世界'); +---- +'unicode: 你好, 世界' + +query T +SELECT quote('emoji: 😀🎉❤️🚀'); +---- +'emoji: 😀🎉❤️🚀' + +query T +SELECT quote(arrow_cast('2024-01-15', 'Date32')); +---- +'2024-01-15' + +query T +SELECT quote(arrow_cast('2024-01-15 12:30:45', 'Timestamp')); +---- +'2024-01-15 12:30:45' + +query T +SELECT quote('special\n\t\r'); +---- +'special\n\t\r' + From df12f69b3fbc0168fa52a765b8df99091df6ae00 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 15:23:56 +0400 Subject: [PATCH 09/10] fix tests --- datafusion/sqllogictest/test_files/spark/string/quote.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index 6dded2c0a678c..7b1ccc6eb931c 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -136,7 +136,7 @@ SELECT quote(arrow_cast('2024-01-15', 'Date32')); '2024-01-15' query T -SELECT quote(arrow_cast('2024-01-15 12:30:45', 'Timestamp')); +SELECT quote(arrow_cast('2024-01-15 12:30:45', 'Timestamp(µs)')); ---- '2024-01-15 12:30:45' From 5d40951342673b6be095b1796957cf6ec75cbfcf Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Sat, 30 May 2026 15:42:11 +0400 Subject: [PATCH 10/10] fix tests --- datafusion/sqllogictest/test_files/spark/string/quote.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/spark/string/quote.slt b/datafusion/sqllogictest/test_files/spark/string/quote.slt index 7b1ccc6eb931c..856c8d9e5c516 100644 --- a/datafusion/sqllogictest/test_files/spark/string/quote.slt +++ b/datafusion/sqllogictest/test_files/spark/string/quote.slt @@ -136,9 +136,9 @@ SELECT quote(arrow_cast('2024-01-15', 'Date32')); '2024-01-15' query T -SELECT quote(arrow_cast('2024-01-15 12:30:45', 'Timestamp(µs)')); +SELECT quote(arrow_cast('2024-01-15T12:30:45', 'Timestamp(µs)')); ---- -'2024-01-15 12:30:45' +'2024-01-15T12:30:45' query T SELECT quote('special\n\t\r');