Skip to content

Commit dae7aad

Browse files
authored
feat: decode() expression when using 'utf-8' encoding (#1697)
1 parent 3008f4a commit dae7aad

File tree

3 files changed

+41
-2
lines changed

3 files changed

+41
-2
lines changed

native/spark-expr/src/conversion_funcs/cast.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -882,7 +882,10 @@ fn cast_array(
882882
let array = match &from_type {
883883
Dictionary(key_type, value_type)
884884
if key_type.as_ref() == &Int32
885-
&& (value_type.as_ref() == &Utf8 || value_type.as_ref() == &LargeUtf8) =>
885+
&& (value_type.as_ref() == &Utf8
886+
|| value_type.as_ref() == &LargeUtf8
887+
|| value_type.as_ref() == &Binary
888+
|| value_type.as_ref() == &LargeBinary) =>
886889
{
887890
let dict_array = array
888891
.as_any()

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
package org.apache.comet.serde
2121

22+
import java.util.Locale
23+
2224
import scala.collection.JavaConverters._
2325
import scala.math.min
2426

@@ -1430,6 +1432,25 @@ object QueryPlanSerde extends Logging with CometExprShim {
14301432
val optExpr = scalarFunctionExprToProto("ascii", childExpr)
14311433
optExprWithInfo(optExpr, expr, castExpr)
14321434

1435+
case s: StringDecode =>
1436+
// Right child is the encoding expression.
1437+
s.right match {
1438+
case Literal(str, DataTypes.StringType)
1439+
if str.toString.toLowerCase(Locale.ROOT) == "utf-8" =>
1440+
// decode(col, 'utf-8') can be treated as a cast with "try" eval mode that puts nulls
1441+
// for invalid strings.
1442+
// Left child is the binary expression.
1443+
castToProto(
1444+
expr,
1445+
None,
1446+
DataTypes.StringType,
1447+
exprToProtoInternal(s.left, inputs, binding).get,
1448+
CometEvalMode.TRY)
1449+
case _ =>
1450+
withInfo(expr, "Comet only supports decoding with 'utf-8'.")
1451+
None
1452+
}
1453+
14331454
case BitLength(child) =>
14341455
val castExpr = Cast(child, StringType)
14351456
val childExpr = exprToProtoInternal(castExpr, inputs, binding)

spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ import org.apache.spark.sql.execution.SparkPlan
3535
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
3636
import org.apache.spark.sql.internal.SQLConf
3737
import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
38-
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, StructType}
38+
import org.apache.spark.sql.types._
3939

4040
import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}
4141

@@ -188,6 +188,21 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
188188
}
189189
}
190190

191+
test("decode") {
192+
val df = spark.read.parquet(filename)
193+
df.createOrReplaceTempView("t1")
194+
// We want to make sure that the schema generator wasn't modified to accidentally omit
195+
// BinaryType, since then this test would not run any queries and silently pass.
196+
var testedBinary = false
197+
for (field <- df.schema.fields if field.dataType == BinaryType) {
198+
testedBinary = true
199+
// Intentionally use odd capitalization of 'utf-8' to test normalization.
200+
val sql = s"SELECT decode(${field.name}, 'utF-8') FROM t1"
201+
checkSparkAnswerAndOperator(sql)
202+
}
203+
assert(testedBinary)
204+
}
205+
191206
test("Parquet temporal types written as INT96") {
192207
// int96 coercion in DF does not work with nested types yet
193208
// https://github.com/apache/datafusion/issues/15763

0 commit comments

Comments
 (0)