diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index c0cbf8bbef..fe7df906de 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -149,6 +149,9 @@ jobs:
              org.apache.comet.CometBitwiseExpressionSuite
              org.apache.comet.CometMapExpressionSuite
              org.apache.comet.objectstore.NativeConfigSuite
+         - name: "sql"
+           value: |
+             ${{ matrix.profile.maven_opts != 'Spark 3.4, JDK 11, Scala 2.12' && 'org.apache.spark.sql.CometToPrettyStringSuite' || ''}}
       fail-fast: false
     name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index ea09de06f5..1abe644f51 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -114,6 +114,9 @@ jobs:
              org.apache.comet.CometBitwiseExpressionSuite
              org.apache.comet.CometMapExpressionSuite
              org.apache.comet.objectstore.NativeConfigSuite
+         - name: "sql"
+           value: |
+             ${{ matrix.profile.maven_opts != 'Spark 3.4, JDK 11, Scala 2.12' && 'org.apache.spark.sql.CometToPrettyStringSuite' || ''}}
       fail-fast: false
     name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
     runs-on: ${{ matrix.os }}
diff --git a/dev/ci/check-suites.py b/dev/ci/check-suites.py
index 62bcd77b5e..8d9acb2d59 100644
--- a/dev/ci/check-suites.py
+++ b/dev/ci/check-suites.py
@@ -36,10 +36,7 @@ def file_to_class_name(path: Path) -> str | None:
     "org.apache.comet.parquet.ParquetReadFromS3Suite",  # manual test suite
     "org.apache.spark.sql.comet.CometPlanStabilitySuite",  # abstract
     "org.apache.spark.sql.comet.ParquetDatetimeRebaseSuite",  # abstract
-    "org.apache.comet.exec.CometColumnarShuffleSuite",  # abstract
-    # TODO add CometToPrettyStringSuite to PR worklows
-    # https://github.com/apache/datafusion-comet/issues/2307
-    "org.apache.spark.sql.CometToPrettyStringSuite"
+    "org.apache.comet.exec.CometColumnarShuffleSuite"  # abstract
 ]
 
 for workflow_filename in [".github/workflows/pr_build_linux.yml", ".github/workflows/pr_build_macos.yml"]:
diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md
index 1758556559..dd059abbcd 100644
--- a/docs/source/user-guide/latest/compatibility.md
+++ b/docs/source/user-guide/latest/compatibility.md
@@ -210,6 +210,7 @@ The following cast operations are generally compatible with Spark except for the
 | string | long | |
 | string | binary | |
 | string | date | Only supports years between 262143 BC and 262142 AD |
+| binary | string | |
 | date | string | |
 | timestamp | long | |
 | timestamp | string | |
@@ -233,7 +234,6 @@ The following cast operations are not compatible with Spark for all inputs and a
 | string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
 | string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
 | string | timestamp | Not all valid formats are supported |
-| binary | string | Only works for binary data representing valid UTF-8 strings |
 
 ### Unsupported Casts
 
diff --git a/native/Cargo.lock b/native/Cargo.lock
index f9d14fa94e..b94458ddd2 100644
--- a/native/Cargo.lock
+++ b/native/Cargo.lock
@@ -1581,11 +1581,13 @@ name = "datafusion-comet-spark-expr"
 version = "0.11.0"
 dependencies = [
  "arrow",
+ "base64",
  "chrono",
  "chrono-tz",
  "criterion",
  "datafusion",
  "futures",
+ "hex",
  "num",
  "rand",
  "regex",
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index 1465d33ade..5aa6ece3bc 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -62,8 +62,8 @@ use datafusion::{
     prelude::SessionContext,
 };
 use datafusion_comet_spark_expr::{
-    create_comet_physical_fun, create_modulo_expr, create_negate_expr, BloomFilterAgg,
-    BloomFilterMightContain, EvalMode, SparkHour, SparkMinute, SparkSecond,
+    create_comet_physical_fun, create_modulo_expr, create_negate_expr, BinaryOutputStyle,
+    BloomFilterAgg, BloomFilterMightContain, EvalMode, SparkHour, SparkMinute, SparkSecond,
 };
 
 use crate::execution::operators::ExecutionError::GeneralError;
@@ -809,6 +809,8 @@ impl PhysicalPlanner {
                     SparkCastOptions::new(EvalMode::Try, &expr.timezone, true);
                 let null_string = "NULL";
                 spark_cast_options.null_string = null_string.to_string();
+                spark_cast_options.binary_output_style =
+                    from_protobuf_binary_output_style(expr.binary_output_style).ok();
                 let child = self.create_expr(expr.child.as_ref().unwrap(), input_schema)?;
                 let cast = Arc::new(Cast::new(
                     Arc::clone(&child),
@@ -2693,6 +2695,18 @@ fn create_case_expr(
     }
 }
 
+fn from_protobuf_binary_output_style(
+    value: i32,
+) -> Result<BinaryOutputStyle, prost::UnknownEnumValue> {
+    match spark_expression::BinaryOutputStyle::try_from(value)? {
+        spark_expression::BinaryOutputStyle::Utf8 => Ok(BinaryOutputStyle::Utf8),
+        spark_expression::BinaryOutputStyle::Basic => Ok(BinaryOutputStyle::Basic),
+        spark_expression::BinaryOutputStyle::Base64 => Ok(BinaryOutputStyle::Base64),
+        spark_expression::BinaryOutputStyle::Hex => Ok(BinaryOutputStyle::Hex),
+        spark_expression::BinaryOutputStyle::HexDiscrete => Ok(BinaryOutputStyle::HexDiscrete),
+    }
+}
+
 fn literal_to_array_ref(
     data_type: DataType,
     list_literal: ListLiteral,
diff --git a/native/proto/src/proto/expr.proto b/native/proto/src/proto/expr.proto
index 04d9376ac6..ade9860c80 100644
--- a/native/proto/src/proto/expr.proto
+++ b/native/proto/src/proto/expr.proto
@@ -269,9 +269,18 @@ message ToJson {
   bool ignore_null_fields = 6;
 }
 
+enum BinaryOutputStyle {
+  UTF8 = 0;
+  BASIC = 1;
+  BASE64 = 2;
+  HEX = 3;
+  HEX_DISCRETE = 4;
+}
+
 message ToPrettyString {
   Expr child = 1;
   string timezone = 2;
+  BinaryOutputStyle binaryOutputStyle = 3;
 }
 
 message Hour {
diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml
index 6ccecf7d27..961c7bec08 100644
--- a/native/spark-expr/Cargo.toml
+++ b/native/spark-expr/Cargo.toml
@@ -37,6 +37,8 @@ thiserror = { workspace = true }
 futures = { workspace = true }
 twox-hash = "2.1.2"
 rand = { workspace = true }
+hex = "0.4.3"
+base64 = "0.22.1"
 
 [dev-dependencies]
 arrow = {workspace = true}
diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs
index 8f33bf912f..0c7b437a56 100644
--- a/native/spark-expr/src/conversion_funcs/cast.rs
+++ b/native/spark-expr/src/conversion_funcs/cast.rs
@@ -15,13 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::timezone;
 use crate::utils::array_with_timezone;
+use crate::{timezone, BinaryOutputStyle};
 use crate::{EvalMode, SparkError, SparkResult};
 use arrow::array::builder::StringBuilder;
-use arrow::array::{DictionaryArray, StringArray, StructArray};
+use arrow::array::{DictionaryArray, GenericByteArray, StringArray, StructArray};
 use arrow::compute::can_cast_types;
-use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType, Schema};
+use arrow::datatypes::{
+    ArrowDictionaryKeyType, ArrowNativeType, DataType, GenericBinaryType, Schema,
+};
 use arrow::{
     array::{
         cast::AsArray,
@@ -60,6 +62,8 @@ use std::{
     sync::Arc,
 };
 
+use base64::prelude::*;
+
 static TIMESTAMP_FORMAT: Option<&str> = Some("%Y-%m-%d %H:%M:%S%.f");
 
 const MICROS_PER_SECOND: i64 = 1000000;
@@ -260,11 +264,7 @@ fn can_cast_to_string(from_type: &DataType, options: &SparkCastOptions) -> bool
             // scientific notation where Comet does not
             true
         }
-        Binary => {
-            // https://github.com/apache/datafusion-comet/issues/377
-            // Only works for binary data representing valid UTF-8 strings
-            options.allow_incompat
-        }
+        Binary => true,
         Struct(fields) => fields
             .iter()
             .all(|f| can_cast_to_string(f.data_type(), options)),
@@ -816,6 +816,8 @@ pub struct SparkCastOptions {
     pub is_adapting_schema: bool,
     /// String to use to represent null values
     pub null_string: String,
+    /// SparkSQL's binaryOutputStyle
+    pub binary_output_style: Option<BinaryOutputStyle>,
 }
 
 impl SparkCastOptions {
@@ -827,6 +829,7 @@ impl SparkCastOptions {
             allow_cast_unsigned_ints: false,
             is_adapting_schema: false,
             null_string: "null".to_string(),
+            binary_output_style: None,
         }
     }
 
@@ -838,6 +841,7 @@ impl SparkCastOptions {
             allow_cast_unsigned_ints: false,
             is_adapting_schema: false,
             null_string: "null".to_string(),
+            binary_output_style: None,
         }
     }
 }
@@ -1027,6 +1031,7 @@ fn cast_array(
         {
             Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?)
         }
+        (Binary, Utf8) => Ok(cast_binary_to_string::<i32>(&array, cast_options)?),
         _ if cast_options.is_adapting_schema
             || is_datafusion_spark_compatible(from_type, to_type, cast_options.allow_incompat) =>
         {
@@ -1045,6 +1050,74 @@ fn cast_array(
     Ok(spark_cast_postprocess(cast_result?, from_type, to_type))
 }
 
+fn cast_binary_to_string<O: OffsetSizeTrait>(
+    array: &dyn Array,
+    spark_cast_options: &SparkCastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let input = array
+        .as_any()
+        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
+        .unwrap();
+
+    fn binary_formatter(value: &[u8], spark_cast_options: &SparkCastOptions) -> String {
+        match spark_cast_options.binary_output_style {
+            Some(s) => spark_binary_formatter(value, s),
+            None => cast_binary_formatter(value),
+        }
+    }
+
+    let output_array = input
+        .iter()
+        .map(|value| match value {
+            Some(value) => Ok(Some(binary_formatter(value, spark_cast_options))),
+            _ => Ok(None),
+        })
+        .collect::<Result<StringArray, ArrowError>>()?;
+    Ok(Arc::new(output_array))
+}
+
+/// This function mimics the [BinaryFormatter](https://github.com/apache/spark/blob/v4.0.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala#L449-L468)
+/// used by SparkSQL's ToPrettyString expression.
+/// The BinaryFormatter was [introduced](https://issues.apache.org/jira/browse/SPARK-47911) in Spark 4.0.0.
+/// Before Spark 4.0.0, the default is SPACE_DELIMITED_UPPERCASE_HEX.
+fn spark_binary_formatter(value: &[u8], binary_output_style: BinaryOutputStyle) -> String {
+    match binary_output_style {
+        BinaryOutputStyle::Utf8 => String::from_utf8(value.to_vec()).unwrap(),
+        BinaryOutputStyle::Basic => {
+            format!(
+                "{:?}",
+                value
+                    .iter()
+                    .map(|v| i8::from_ne_bytes([*v]))
+                    .collect::<Vec<i8>>()
+            )
+        }
+        BinaryOutputStyle::Base64 => BASE64_STANDARD_NO_PAD.encode(value),
+        BinaryOutputStyle::Hex => value
+            .iter()
+            .map(|v| hex::encode_upper([*v]))
+            .collect::<String>(),
+        BinaryOutputStyle::HexDiscrete => {
+            // Spark's default SPACE_DELIMITED_UPPERCASE_HEX
+            format!(
+                "[{}]",
+                value
+                    .iter()
+                    .map(|v| hex::encode_upper([*v]))
+                    .collect::<Vec<String>>()
+                    .join(" ")
+            )
+        }
+    }
+}
+
+fn cast_binary_formatter(value: &[u8]) -> String {
+    match String::from_utf8(value.to_vec()) {
+        Ok(value) => value,
+        Err(_) => unsafe { String::from_utf8_unchecked(value.to_vec()) },
+    }
+}
+
 /// Determines if DataFusion supports the given cast in a way that is
 /// compatible with Spark
 fn is_datafusion_spark_compatible(
diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs
index 4b29b61775..af5677a9bf 100644
--- a/native/spark-expr/src/lib.rs
+++ b/native/spark-expr/src/lib.rs
@@ -98,6 +98,15 @@ pub enum EvalMode {
     Try,
 }
 
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
+pub enum BinaryOutputStyle {
+    Utf8,
+    Basic,
+    Base64,
+    Hex,
+    HexDiscrete,
+}
+
 pub(crate) fn arithmetic_overflow_error(from_type: &str) -> SparkError {
     SparkError::ArithmeticOverflow {
         from_type: from_type.to_string(),
diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
index 3ea4882563..7db62130d4 100644
--- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
+++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
@@ -215,8 +215,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
             "There can be formatting differences in some case due to Spark using " +
               "scientific notation where Comet does not"))
       case DataTypes.BinaryType =>
-        // https://github.com/apache/datafusion-comet/issues/377
-        Incompatible(Some("Only works for binary data representing valid UTF-8 strings"))
+        Compatible()
       case StructType(fields) =>
         for (field <- fields) {
           isSupported(field.dataType, DataTypes.StringType, timeZoneId, evalMode) match {
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 1e0e97862d..258d275e5b 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -716,6 +716,7 @@ object QueryPlanSerde extends Logging with CometExprShim {
             .newBuilder()
             .setChild(p)
             .setTimezone(timezoneId.getOrElse("UTC"))
+            .setBinaryOutputStyle(binaryOutputStyle)
             .build()
         Some(
           ExprOuterClass.Expr
diff --git a/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala
index b6ba91ad10..e9785ce5e0 100644
--- a/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala
@@ -20,7 +20,7 @@ package org.apache.comet.shims
 
 import org.apache.comet.expressions.CometEvalMode
 import org.apache.comet.serde.CommonStringExprs
-import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr}
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
@@ -30,6 +30,8 @@ trait CometExprShim extends CommonStringExprs {
     protected def evalMode(c: Cast): CometEvalMode.Value =
         CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
 
+    protected def binaryOutputStyle: BinaryOutputStyle = BinaryOutputStyle.HEX_DISCRETE
+
     def versionSpecificExprToProtoInternal(
         expr: Expression,
         inputs: Seq[Attribute],
diff --git a/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala
index b6ba91ad10..bbabb389d6 100644
--- a/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala
@@ -20,7 +20,7 @@ package org.apache.comet.shims
 
 import org.apache.comet.expressions.CometEvalMode
 import org.apache.comet.serde.CommonStringExprs
-import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr}
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
@@ -30,6 +30,8 @@ trait CometExprShim extends CommonStringExprs {
     protected def evalMode(c: Cast): CometEvalMode.Value =
         CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
 
+    protected def binaryOutputStyle: BinaryOutputStyle = BinaryOutputStyle.HEX_DISCRETE
+
     def versionSpecificExprToProtoInternal(
         expr: Expression,
         inputs: Seq[Attribute],
diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
index 5885c908eb..09dcface63 100644
--- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
@@ -20,9 +20,10 @@ package org.apache.comet.shims
 
 import org.apache.comet.expressions.CometEvalMode
 import org.apache.comet.serde.CommonStringExprs
-import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.types.StringTypeWithCollation
 import org.apache.spark.sql.types.{BinaryType, BooleanType, StringType}
 
@@ -33,6 +34,16 @@ trait CometExprShim extends CommonStringExprs {
     protected def evalMode(c: Cast): CometEvalMode.Value =
         CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
 
+    protected def binaryOutputStyle: BinaryOutputStyle = {
+        SQLConf.get.getConf(SQLConf.BINARY_OUTPUT_STYLE).map(SQLConf.BinaryOutputStyle.withName) match {
+            case Some(SQLConf.BinaryOutputStyle.UTF8) => BinaryOutputStyle.UTF8
+            case Some(SQLConf.BinaryOutputStyle.BASIC) => BinaryOutputStyle.BASIC
+            case Some(SQLConf.BinaryOutputStyle.BASE64) => BinaryOutputStyle.BASE64
+            case Some(SQLConf.BinaryOutputStyle.HEX) => BinaryOutputStyle.HEX
+            case _ => BinaryOutputStyle.HEX_DISCRETE
+        }
+    }
+
     def versionSpecificExprToProtoInternal(
         expr: Expression,
         inputs: Seq[Attribute],
diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
index 772cc064ad..2667b40877 100644
--- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
@@ -827,8 +827,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
 
   // CAST from BinaryType
 
-  ignore("cast BinaryType to StringType") {
-    // https://github.com/apache/datafusion-comet/issues/377
+  test("cast BinaryType to StringType") {
     castTest(generateBinary(), DataTypes.StringType)
   }
 
diff --git a/spark/src/test/spark-3.4/org/apache/sql/ShimCometTestBase.scala b/spark/src/test/spark-3.4/org/apache/spark/sql/ShimCometTestBase.scala
similarity index 100%
rename from spark/src/test/spark-3.4/org/apache/sql/ShimCometTestBase.scala
rename to spark/src/test/spark-3.4/org/apache/spark/sql/ShimCometTestBase.scala
diff --git a/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala b/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala
index d030106c3e..df2ba67b7b 100644
--- a/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala
+++ b/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala
@@ -19,60 +19,17 @@
 
 package org.apache.spark.sql
 
-import org.apache.comet.CometConf
+import org.apache.comet.CometFuzzTestBase
 import org.apache.comet.expressions.{CometCast, CometEvalMode}
 import org.apache.comet.serde.Compatible
-import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}
-import org.apache.commons.io.FileUtils
+
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions.{Alias, ToPrettyString}
 import org.apache.spark.sql.catalyst.plans.logical.Project
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.DataTypes
 
-import java.io.File
-import java.text.SimpleDateFormat
-import scala.util.Random
-
-class CometToPrettyStringSuite extends CometTestBase {
-
-  private var filename: String = null
-
-  /**
-   * We use Asia/Kathmandu because it has a non-zero number of minutes as the offset, so is an
-   * interesting edge case. Also, this timezone tends to be different from the default system
-   * timezone.
-   *
-   * Represents UTC+5:45
-   */
-  private val defaultTimezone = "Asia/Kathmandu"
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    val tempDir = System.getProperty("java.io.tmpdir")
-    filename = s"$tempDir/CometFuzzTestSuite_${System.currentTimeMillis()}.parquet"
-    val random = new Random(42)
-    withSQLConf(
-      CometConf.COMET_ENABLED.key -> "false",
-      SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) {
-      val options =
-        DataGenOptions(
-          generateArray = true,
-          generateStruct = true,
-          generateMap = true,
-          generateNegativeZero = false,
-          // override base date due to known issues with experimental scans
-          baseDate =
-            new SimpleDateFormat("YYYY-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime)
-      ParquetGenerator.makeParquetFile(random, spark, filename, 1000, options)
-    }
-  }
-
-  protected override def afterAll(): Unit = {
-    super.afterAll()
-    FileUtils.deleteDirectory(new File(filename))
-  }
+class CometToPrettyStringSuite extends CometFuzzTestBase {
 
   test("ToPrettyString") {
     val df = spark.read.parquet(filename)
diff --git a/spark/src/test/spark-4.0/org/apache/spark/sql/CometToPrettyStringSuite.scala b/spark/src/test/spark-4.0/org/apache/spark/sql/CometToPrettyStringSuite.scala
new file mode 100644
index 0000000000..469b49fb2f
--- /dev/null
+++ b/spark/src/test/spark-4.0/org/apache/spark/sql/CometToPrettyStringSuite.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.comet.CometFuzzTestBase
+import org.apache.comet.expressions.{CometCast, CometEvalMode}
+import org.apache.comet.serde.Compatible
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.expressions.{Alias, ToPrettyString}
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.classic.Dataset
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.BinaryOutputStyle
+import org.apache.spark.sql.types.DataTypes
+
+class CometToPrettyStringSuite extends CometFuzzTestBase {
+
+  test("ToPrettyString") {
+    val style = List(
+      BinaryOutputStyle.UTF8,
+      BinaryOutputStyle.BASIC,
+      BinaryOutputStyle.BASE64,
+      BinaryOutputStyle.HEX,
+      BinaryOutputStyle.HEX_DISCRETE
+    )
+    style.foreach(s =>
+      withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> s.toString) {
+        val df = spark.read.parquet(filename)
+        df.createOrReplaceTempView("t1")
+        val table = spark.sessionState.catalog.lookupRelation(TableIdentifier("t1"))
+
+        for (field <- df.schema.fields) {
+          val col = field.name
+          val prettyExpr = Alias(ToPrettyString(UnresolvedAttribute(col)), s"pretty_$col")()
+          val plan = Project(Seq(prettyExpr), table)
+          val analyzed = spark.sessionState.analyzer.execute(plan)
+          val result: DataFrame = Dataset.ofRows(spark, analyzed)
+          CometCast.isSupported(field.dataType, DataTypes.StringType, Some(spark.sessionState.conf.sessionLocalTimeZone), CometEvalMode.TRY) match {
+            case _: Compatible => checkSparkAnswerAndOperator(result)
+            case _ => checkSparkAnswer(result)
+          }
+        }
+      }
+    )
+  }
+}
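
---

For reviewers, a minimal standalone sketch (not part of the patch) of what the `BinaryOutputStyle` variants render, mirroring the `spark_binary_formatter` arms added in `cast.rs`. It re-derives the formatting with std only, so the `hex`/`base64` crates are not needed and the `Utf8`/`Base64` styles are omitted; the sample bytes are illustrative.

```rust
// Illustrates the three hex/byte-oriented styles for the bytes [0xDE, 0xAD, 0xBE, 0xEF].
fn main() {
    let value: &[u8] = &[0xDE, 0xAD, 0xBE, 0xEF];

    // Hex: contiguous uppercase hex digits.
    let hex: String = value.iter().map(|v| format!("{v:02X}")).collect();
    assert_eq!(hex, "DEADBEEF");

    // HexDiscrete (Spark's pre-4.0 default, SPACE_DELIMITED_UPPERCASE_HEX):
    // bracketed, space-delimited uppercase hex.
    let hex_discrete = format!(
        "[{}]",
        value
            .iter()
            .map(|v| format!("{v:02X}"))
            .collect::<Vec<_>>()
            .join(" ")
    );
    assert_eq!(hex_discrete, "[DE AD BE EF]");

    // Basic: bytes reinterpreted as signed i8 and Debug-printed, which lines
    // up with how Spark/Java renders a byte[] as signed values.
    let basic = format!("{:?}", value.iter().map(|v| *v as i8).collect::<Vec<_>>());
    assert_eq!(basic, "[-34, -83, -66, -17]");
}
```

`HexDiscrete` is the fallback in every shim because it matches the only behavior available before Spark 4.0.0 introduced `spark.sql.binaryOutputStyle`.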