Skip to content

Commit e470e74

Browse files
committed
[SPARK-45887][SQL] Align codegen and non-codegen implementation of Encode
### What changes were proposed in this pull request? In this PR, I propose to change the implementation of the interpreted mode and make it consistent with codegen. Both implementations raise the same error with the new error class `INVALID_PARAMETER_VALUE.CHARSET`. ### Why are the changes needed? To make the codegen and non-codegen implementations of the `Encode` expression consistent, so that users observe the same behaviour in both modes. ### Does this PR introduce _any_ user-facing change? Yes, if user code depends on the error raised by `encode()`. ### How was this patch tested? By running the following test suites: ``` $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql" $ build/sbt "core/testOnly *SparkThrowableSuite" $ build/sbt "test:testOnly *.StringFunctionsSuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43759 from MaxGekk/restrict-charsets-in-encode. Authored-by: Max Gekk <[email protected]> Signed-off-by: Max Gekk <[email protected]>
1 parent 2407066 commit e470e74

File tree

9 files changed

+134
-5
lines changed

9 files changed

+134
-5
lines changed

common/utils/src/main/resources/error/error-classes.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2042,6 +2042,11 @@
20422042
"expects an integer value in [0, <upper>), but got <invalidValue>."
20432043
]
20442044
},
2045+
"CHARSET" : {
2046+
"message" : [
2047+
"expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', but got <charset>."
2048+
]
2049+
},
20452050
"DATETIME_UNIT" : {
20462051
"message" : [
20472052
"expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal <invalidValue>."

docs/sql-error-conditions-invalid-parameter-value-error-class.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat>
4545

4646
expects an integer value in [0, `<upper>`), but got `<invalidValue>`.
4747

48+
## CHARSET
49+
50+
expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', but got `<charset>`.
51+
4852
## DATETIME_UNIT
4953

5054
expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.sql.catalyst.expressions
1919

20+
import java.io.UnsupportedEncodingException
2021
import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
2122
import java.util.{Base64 => JBase64}
2223
import java.util.{HashMap, Locale, Map => JMap}
@@ -2694,17 +2695,25 @@ case class Encode(value: Expression, charset: Expression)
26942695

26952696
protected override def nullSafeEval(input1: Any, input2: Any): Any = {
26962697
val toCharset = input2.asInstanceOf[UTF8String].toString
2697-
input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
2698+
try {
2699+
input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
2700+
} catch {
2701+
case _: UnsupportedEncodingException =>
2702+
throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset)
2703+
}
26982704
}
26992705

27002706
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
2701-
nullSafeCodeGen(ctx, ev, (string, charset) =>
2707+
nullSafeCodeGen(ctx, ev, (string, charset) => {
2708+
val toCharset = ctx.freshName("toCharset")
27022709
s"""
2710+
String $toCharset = $charset.toString();
27032711
try {
2704-
${ev.value} = $string.toString().getBytes($charset.toString());
2712+
${ev.value} = $string.toString().getBytes($toCharset);
27052713
} catch (java.io.UnsupportedEncodingException e) {
2706-
org.apache.spark.unsafe.Platform.throwException(e);
2707-
}""")
2714+
throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset);
2715+
}"""
2716+
})
27082717
}
27092718

27102719
override protected def withNewChildrenInternal(

sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2758,4 +2758,13 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE
27582758
"upper" -> size.toString,
27592759
"invalidValue" -> pos.toString))
27602760
}
2761+
2762+
def invalidCharsetError(functionName: String, charset: String): RuntimeException = {
2763+
new SparkIllegalArgumentException(
2764+
errorClass = "INVALID_PARAMETER_VALUE.CHARSET",
2765+
messageParameters = Map(
2766+
"functionName" -> toSQLId(functionName),
2767+
"parameter" -> toSQLId("charset"),
2768+
"charset" -> charset))
2769+
}
27612770
}

sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x]
640640
+- OneRowRelation
641641

642642

643+
-- !query
644+
select encode('hello', 'Windows-xxx')
645+
-- !query analysis
646+
Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
647+
+- OneRowRelation
648+
649+
650+
-- !query
651+
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
652+
-- !query analysis
653+
Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
654+
+- SubqueryAlias t
655+
+- LocalRelation [scol#x, ecol#x]
656+
657+
643658
-- !query
644659
select decode()
645660
-- !query analysis

sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x]
640640
+- OneRowRelation
641641

642642

643+
-- !query
644+
select encode('hello', 'Windows-xxx')
645+
-- !query analysis
646+
Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
647+
+- OneRowRelation
648+
649+
650+
-- !query
651+
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
652+
-- !query analysis
653+
Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
654+
+- SubqueryAlias t
655+
+- LocalRelation [scol#x, ecol#x]
656+
657+
643658
-- !query
644659
select decode()
645660
-- !query analysis

sql/core/src/test/resources/sql-tests/inputs/string-functions.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,10 @@ SELECT lpad(x'57', 5, 'abc');
117117
SELECT rpad('abc', 5, x'57');
118118
SELECT rpad(x'57', 5, 'abc');
119119

120+
-- encode
121+
select encode('hello', 'Windows-xxx');
122+
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);
123+
120124
-- decode
121125
select decode();
122126
select decode(encode('abc', 'utf-8'));

sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,40 @@ struct<rpad(X'57', 5, abc):string>
803803
Wabca
804804

805805

806+
-- !query
807+
select encode('hello', 'Windows-xxx')
808+
-- !query schema
809+
struct<>
810+
-- !query output
811+
org.apache.spark.SparkIllegalArgumentException
812+
{
813+
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
814+
"sqlState" : "22023",
815+
"messageParameters" : {
816+
"charset" : "Windows-xxx",
817+
"functionName" : "`encode`",
818+
"parameter" : "`charset`"
819+
}
820+
}
821+
822+
823+
-- !query
824+
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
825+
-- !query schema
826+
struct<>
827+
-- !query output
828+
org.apache.spark.SparkIllegalArgumentException
829+
{
830+
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
831+
"sqlState" : "22023",
832+
"messageParameters" : {
833+
"charset" : "Windows-xxx",
834+
"functionName" : "`encode`",
835+
"parameter" : "`charset`"
836+
}
837+
}
838+
839+
806840
-- !query
807841
select decode()
808842
-- !query schema

sql/core/src/test/resources/sql-tests/results/string-functions.sql.out

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,40 @@ struct<rpad(X'57', 5, abc):string>
735735
Wabca
736736

737737

738+
-- !query
739+
select encode('hello', 'Windows-xxx')
740+
-- !query schema
741+
struct<>
742+
-- !query output
743+
org.apache.spark.SparkIllegalArgumentException
744+
{
745+
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
746+
"sqlState" : "22023",
747+
"messageParameters" : {
748+
"charset" : "Windows-xxx",
749+
"functionName" : "`encode`",
750+
"parameter" : "`charset`"
751+
}
752+
}
753+
754+
755+
-- !query
756+
select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
757+
-- !query schema
758+
struct<>
759+
-- !query output
760+
org.apache.spark.SparkIllegalArgumentException
761+
{
762+
"errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
763+
"sqlState" : "22023",
764+
"messageParameters" : {
765+
"charset" : "Windows-xxx",
766+
"functionName" : "`encode`",
767+
"parameter" : "`charset`"
768+
}
769+
}
770+
771+
738772
-- !query
739773
select decode()
740774
-- !query schema

0 commit comments

Comments
 (0)