Skip to content

Commit d5da49d

Browse files
vladanvasi-db authored and MaxGekk committed
Revert [SPARK-50230][SQL] Added logic to support reading unknown collation name as utf8_binary
### What changes were proposed in this pull request?

I propose reverting changes for new `SQLConf` entry which enables spark to read an invalid collation name as `UTF8_BINARY`.

### Why are the changes needed?

Since the original changes may bring unwanted data corruption when a user writes in a table that has unknown collation and modifies its properties, the original PR must be reverted.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Not applicable.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #48876 from vladanvasi-db/vladanvasi-db/allow-reading-unknown-collations-as-utf8-binary-revert.

Authored-by: Vladan Vasić <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
1 parent ad82505 commit d5da49d

File tree

5 files changed

+5
-199
lines changed

5 files changed

+5
-199
lines changed

sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ private[sql] trait SqlApiConf {
4747
def stackTracesInDataFrameContext: Int
4848
def dataFrameQueryContextEnabled: Boolean
4949
def legacyAllowUntypedScalaUDFs: Boolean
50-
def allowReadingUnknownCollations: Boolean
5150
}
5251

5352
private[sql] object SqlApiConf {
@@ -60,7 +59,6 @@ private[sql] object SqlApiConf {
6059
SqlApiConfHelper.LOCAL_RELATION_CACHE_THRESHOLD_KEY
6160
}
6261
val DEFAULT_COLLATION: String = SqlApiConfHelper.DEFAULT_COLLATION
63-
val ALLOW_READING_UNKNOWN_COLLATIONS: String = SqlApiConfHelper.ALLOW_READING_UNKNOWN_COLLATIONS
6462

6563
def get: SqlApiConf = SqlApiConfHelper.getConfGetter.get()()
6664

@@ -89,5 +87,4 @@ private[sql] object DefaultSqlApiConf extends SqlApiConf {
8987
override def stackTracesInDataFrameContext: Int = 1
9088
override def dataFrameQueryContextEnabled: Boolean = true
9189
override def legacyAllowUntypedScalaUDFs: Boolean = false
92-
override def allowReadingUnknownCollations: Boolean = false
9390
}

sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConfHelper.scala

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ private[sql] object SqlApiConfHelper {
3333
val SESSION_LOCAL_TIMEZONE_KEY: String = "spark.sql.session.timeZone"
3434
val LOCAL_RELATION_CACHE_THRESHOLD_KEY: String = "spark.sql.session.localRelationCacheThreshold"
3535
val DEFAULT_COLLATION: String = "spark.sql.session.collation.default"
36-
val ALLOW_READING_UNKNOWN_COLLATIONS: String =
37-
"spark.sql.collation.allowReadingUnknownCollations"
3836

3937
val confGetter: AtomicReference[() => SqlApiConf] = {
4038
new AtomicReference[() => SqlApiConf](() => DefaultSqlApiConf)

sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ import org.json4s.JsonAST.JValue
2727
import org.json4s.JsonDSL._
2828
import org.json4s.jackson.JsonMethods._
2929

30-
import org.apache.spark.{SparkException, SparkIllegalArgumentException, SparkThrowable}
30+
import org.apache.spark.{SparkIllegalArgumentException, SparkThrowable}
3131
import org.apache.spark.annotation.Stable
3232
import org.apache.spark.sql.catalyst.analysis.SqlApiAnalysis
3333
import org.apache.spark.sql.catalyst.parser.DataTypeParser
@@ -340,17 +340,8 @@ object DataType {
340340
fields.collect { case (fieldPath, JString(collation)) =>
341341
collation.split("\\.", 2) match {
342342
case Array(provider: String, collationName: String) =>
343-
try {
344-
CollationFactory.assertValidProvider(provider)
345-
fieldPath -> collationName
346-
} catch {
347-
case e: SparkException
348-
if e.getCondition == "COLLATION_INVALID_PROVIDER" &&
349-
SqlApiConf.get.allowReadingUnknownCollations =>
350-
// If the collation provider is unknown and the config for reading such
351-
// collations is enabled, return the UTF8_BINARY collation.
352-
fieldPath -> "UTF8_BINARY"
353-
}
343+
CollationFactory.assertValidProvider(provider)
344+
fieldPath -> collationName
354345
}
355346
}.toMap
356347

@@ -359,16 +350,7 @@ object DataType {
359350
}
360351

361352
private def stringTypeWithCollation(collationName: String): StringType = {
362-
try {
363-
StringType(CollationFactory.collationNameToId(collationName))
364-
} catch {
365-
case e: SparkException
366-
if e.getCondition == "COLLATION_INVALID_NAME" &&
367-
SqlApiConf.get.allowReadingUnknownCollations =>
368-
// If the collation name is unknown and the config for reading such collations is enabled,
369-
// return the UTF8_BINARY collation.
370-
StringType(CollationFactory.UTF8_BINARY_COLLATION_ID)
371-
}
353+
StringType(CollationFactory.collationNameToId(collationName))
372354
}
373355

374356
protected[types] def buildFormattedString(

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -778,15 +778,6 @@ object SQLConf {
778778
.booleanConf
779779
.createWithDefault(Utils.isTesting)
780780

781-
val ALLOW_READING_UNKNOWN_COLLATIONS =
782-
buildConf(SqlApiConfHelper.ALLOW_READING_UNKNOWN_COLLATIONS)
783-
.internal()
784-
.doc("Enables spark to read unknown collation name as UTF8_BINARY. If the config is " +
785-
"not enabled, when spark encounters an unknown collation name, it will throw an error.")
786-
.version("4.0.0")
787-
.booleanConf
788-
.createWithDefault(false)
789-
790781
val DEFAULT_COLLATION =
791782
buildConf(SqlApiConfHelper.DEFAULT_COLLATION)
792783
.doc("Sets default collation to use for string literals, parameter markers or the string" +
@@ -5582,8 +5573,6 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
55825573
}
55835574
}
55845575

5585-
override def allowReadingUnknownCollations: Boolean = getConf(ALLOW_READING_UNKNOWN_COLLATIONS)
5586-
55875576
def adaptiveExecutionEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_ENABLED)
55885577

55895578
def adaptiveExecutionLogLevel: String = getConf(ADAPTIVE_EXECUTION_LOG_LEVEL)

sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala

Lines changed: 1 addition & 161 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,11 @@ import org.json4s.jackson.JsonMethods
2323
import org.apache.spark.{SparkException, SparkFunSuite, SparkIllegalArgumentException}
2424
import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, caseSensitiveResolution}
2525
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
26-
import org.apache.spark.sql.catalyst.plans.SQLHelper
2726
import org.apache.spark.sql.catalyst.types.DataTypeUtils
2827
import org.apache.spark.sql.catalyst.util.{CollationFactory, StringConcat}
29-
import org.apache.spark.sql.internal.SQLConf
3028
import org.apache.spark.sql.types.DataTypeTestUtils.{dayTimeIntervalTypes, yearMonthIntervalTypes}
3129

32-
class DataTypeSuite extends SparkFunSuite with SQLHelper {
30+
class DataTypeSuite extends SparkFunSuite {
3331

3432
private val UNICODE_COLLATION_ID = CollationFactory.collationNameToId("UNICODE")
3533

@@ -878,90 +876,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper {
878876
}
879877
}
880878

881-
test("string field with invalid collation name") {
882-
val collationProviders = Seq("spark", "icu")
883-
collationProviders.foreach { provider =>
884-
val json =
885-
s"""
886-
|{
887-
| "type": "struct",
888-
| "fields": [
889-
| {
890-
| "name": "c1",
891-
| "type": "string",
892-
| "nullable": false,
893-
| "metadata": {
894-
| "${DataType.COLLATIONS_METADATA_KEY}": {
895-
| "c1": "$provider.INVALID"
896-
| }
897-
| }
898-
| }
899-
| ]
900-
|}
901-
|""".stripMargin
902-
903-
// Check that the exception will be thrown in case of invalid collation name and
904-
// UNKNOWN_COLLATION_NAME config not enabled.
905-
checkError(
906-
exception = intercept[SparkException] {
907-
DataType.fromJson(json)
908-
},
909-
condition = "COLLATION_INVALID_NAME",
910-
parameters = Map(
911-
"proposals" -> "id",
912-
"collationName" -> "INVALID"))
913-
914-
// Check that the exception will not be thrown in case of invalid collation name and
915-
// UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned.
916-
withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
917-
val dataType = DataType.fromJson(json)
918-
assert(dataType === StructType(
919-
StructField("c1", StringType(CollationFactory.UTF8_BINARY_COLLATION_ID), false) :: Nil))
920-
}
921-
}
922-
}
923-
924-
test("string field with invalid collation provider") {
925-
val json =
926-
s"""
927-
|{
928-
| "type": "struct",
929-
| "fields": [
930-
| {
931-
| "name": "c1",
932-
| "type": "string",
933-
| "nullable": false,
934-
| "metadata": {
935-
| "${DataType.COLLATIONS_METADATA_KEY}": {
936-
| "c1": "INVALID.INVALID"
937-
| }
938-
| }
939-
| }
940-
| ]
941-
|}
942-
|""".stripMargin
943-
944-
945-
// Check that the exception will be thrown in case of invalid collation name and
946-
// UNKNOWN_COLLATION_NAME config not enabled.
947-
checkError(
948-
exception = intercept[SparkException] {
949-
DataType.fromJson(json)
950-
},
951-
condition = "COLLATION_INVALID_PROVIDER",
952-
parameters = Map(
953-
"supportedProviders" -> "spark, icu",
954-
"provider" -> "INVALID"))
955-
956-
// Check that the exception will not be thrown in case of invalid collation name and
957-
// UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned.
958-
withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
959-
val dataType = DataType.fromJson(json)
960-
assert(dataType === StructType(
961-
StructField("c1", StringType(CollationFactory.UTF8_BINARY_COLLATION_ID), false) :: Nil))
962-
}
963-
}
964-
965879
test("non string field has collation metadata") {
966880
val json =
967881
s"""
@@ -1109,42 +1023,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper {
11091023
assert(parsedWithCollations === ArrayType(StringType(unicodeCollationId)))
11101024
}
11111025

1112-
test("parse array type with invalid collation metadata") {
1113-
val utf8BinaryCollationId = CollationFactory.UTF8_BINARY_COLLATION_ID
1114-
val arrayJson =
1115-
s"""
1116-
|{
1117-
| "type": "array",
1118-
| "elementType": "string",
1119-
| "containsNull": true
1120-
|}
1121-
|""".stripMargin
1122-
1123-
val collationsMap = Map("element" -> "INVALID")
1124-
1125-
// Parse without collations map
1126-
assert(DataType.parseDataType(JsonMethods.parse(arrayJson)) === ArrayType(StringType))
1127-
1128-
// Check that the exception will be thrown in case of invalid collation name and
1129-
// UNKNOWN_COLLATION_NAME config not enabled.
1130-
checkError(
1131-
exception = intercept[SparkException] {
1132-
DataType.parseDataType(JsonMethods.parse(arrayJson), collationsMap = collationsMap)
1133-
},
1134-
condition = "COLLATION_INVALID_NAME",
1135-
parameters = Map(
1136-
"proposals" -> "id",
1137-
"collationName" -> "INVALID"))
1138-
1139-
// Check that the exception will not be thrown in case of invalid collation name and
1140-
// UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned.
1141-
withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
1142-
val dataType = DataType.parseDataType(
1143-
JsonMethods.parse(arrayJson), collationsMap = collationsMap)
1144-
assert(dataType === ArrayType(StringType(utf8BinaryCollationId)))
1145-
}
1146-
}
1147-
11481026
test("parse map type with collation metadata") {
11491027
val unicodeCollationId = CollationFactory.collationNameToId("UNICODE")
11501028
val mapJson =
@@ -1168,44 +1046,6 @@ class DataTypeSuite extends SparkFunSuite with SQLHelper {
11681046
MapType(StringType(unicodeCollationId), StringType(unicodeCollationId)))
11691047
}
11701048

1171-
test("parse map type with invalid collation metadata") {
1172-
val utf8BinaryCollationId = CollationFactory.UTF8_BINARY_COLLATION_ID
1173-
val mapJson =
1174-
s"""
1175-
|{
1176-
| "type": "map",
1177-
| "keyType": "string",
1178-
| "valueType": "string",
1179-
| "valueContainsNull": true
1180-
|}
1181-
|""".stripMargin
1182-
1183-
val collationsMap = Map("key" -> "INVALID", "value" -> "INVALID")
1184-
1185-
// Parse without collations map
1186-
assert(DataType.parseDataType(JsonMethods.parse(mapJson)) === MapType(StringType, StringType))
1187-
1188-
// Check that the exception will be thrown in case of invalid collation name and
1189-
// UNKNOWN_COLLATION_NAME config not enabled.
1190-
checkError(
1191-
exception = intercept[SparkException] {
1192-
DataType.parseDataType(JsonMethods.parse(mapJson), collationsMap = collationsMap)
1193-
},
1194-
condition = "COLLATION_INVALID_NAME",
1195-
parameters = Map(
1196-
"proposals" -> "id",
1197-
"collationName" -> "INVALID"))
1198-
1199-
// Check that the exception will not be thrown in case of invalid collation name and
1200-
// UNKNOWN_COLLATION_NAME enabled, but UTF8_BINARY collation will be returned.
1201-
withSQLConf(SQLConf.ALLOW_READING_UNKNOWN_COLLATIONS.key -> "true") {
1202-
val dataType = DataType.parseDataType(
1203-
JsonMethods.parse(mapJson), collationsMap = collationsMap)
1204-
assert(dataType === MapType(
1205-
StringType(utf8BinaryCollationId), StringType(utf8BinaryCollationId)))
1206-
}
1207-
}
1208-
12091049
test("SPARK-48680: Add CharType and VarcharType to DataTypes JAVA API") {
12101050
assert(DataTypes.createCharType(1) === CharType(1))
12111051
assert(DataTypes.createVarcharType(100) === VarcharType(100))

0 commit comments

Comments
 (0)