[SPARK-52976][PYTHON] Fix Python UDF not accepting collated string as input param/return type

ilicmarkodb · cloud-fan · commit 52ab9babbae4 · 2025-08-11T12:46:40.000+08:00
### What changes were proposed in this pull request? Fix Python UDF not accepting collated strings as input param/return type. ### Why are the changes needed? Bug fix. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51925 from ilicmarkodb/python_udf_collation_fix. Authored-by: ilicmarkodb <marko.ilic@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py
@@ -1337,6 +1337,29 @@ def doubleInDoubleOut(d):
             with self.subTest(chain=chain):
                 assertDataFrameEqual(actual=actual, expected=expected)
 
+    def test_udf_with_collated_string_types(self):
+        @udf("string collate fr")
+        def my_udf(input_val):
+            return "%s - %s" % (type(input_val), input_val)
+
+        string_types = [
+            StringType(),
+            StringType("UTF8_BINARY"),
+            StringType("UTF8_LCASE"),
+            StringType("UNICODE"),
+        ]
+        data = [("hello",)]
+        expected = "<class 'str'> - hello"
+
+        for string_type in string_types:
+            schema = StructType([StructField("input_col", string_type, True)])
+            df = self.spark.createDataFrame(data, schema=schema)
+            df_result = df.select(my_udf(df.input_col).alias("result"))
+            row = df_result.collect()[0][0]
+            self.assertEqual(row, expected)
+            result_type = df_result.schema["result"].dataType
+            self.assertEqual(result_type, StringType("fr"))
+
 
 class UDFTests(BaseUDFTestsMixin, ReusedSQLTestCase):
     @classmethod
diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py
@@ -3100,6 +3100,41 @@ def eval(self):
                     udtf(TestUDTF, returnType=ret_type)().collect()
 
 
+def test_udtf_with_collated_string_types(self):
+    @udtf(
+        "out1 string, out2 string collate UTF8_BINARY, out3 string collate UTF8_LCASE,"
+        " out4 string collate UNICODE"
+    )
+    class MyUDTF:
+        def eval(self, v1, v2, v3, v4):
+            yield (v1 + "1", v2 + "2", v3 + "3", v4 + "4")
+
+    schema = StructType(
+        [
+            StructField("col1", StringType(), True),
+            StructField("col2", StringType("UTF8_BINARY"), True),
+            StructField("col3", StringType("UTF8_LCASE"), True),
+            StructField("col4", StringType("UNICODE"), True),
+        ]
+    )
+    df = self.spark.createDataFrame([("hello",) * 4], schema=schema)
+
+    df_out = df.select(MyUDTF(df.col1, df.col2, df.col3, df.col4).alias("out"))
+    result_df = df_out.select("out.*")
+
+    expected_row = ("hello1", "hello2", "hello3", "hello4")
+    self.assertEqual(result_df.collect()[0], expected_row)
+
+    expected_output_types = [
+        StringType(),
+        StringType("UTF8_BINARY"),
+        StringType("UTF8_LCASE"),
+        StringType("UNICODE"),
+    ]
+    for idx, field in enumerate(result_df.schema.fields):
+        self.assertEqual(field.dataType, expected_output_types[idx])
+
+
 class UDTFArrowTests(UDTFArrowTestsMixin, ReusedSQLTestCase):
     @classmethod
     def setUpClass(cls):
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -447,15 +447,35 @@ object DataType {
   }
 
   /**
-   * Check if `from` is equal to `to` type except for collations, which are checked to be
-   * compatible so that data of type `from` can be interpreted as of type `to`.
+   * Compares two data types, ignoring compatible collation of StringType. If `checkComplexTypes`
+   * is true, it will also ignore collations for nested types.
    */
-  private[sql] def equalsIgnoreCompatibleCollation(from: DataType, to: DataType): Boolean = {
-    (from, to) match {
-      // String types with possibly different collations are compatible.
-      case (a: StringType, b: StringType) => a.constraint == b.constraint
+  private[sql] def equalsIgnoreCompatibleCollation(
+      from: DataType,
+      to: DataType,
+      checkComplexTypes: Boolean = true): Boolean = {
+    def transform: PartialFunction[DataType, DataType] = {
+      case dt @ (_: CharType | _: VarcharType) => dt
+      case _: StringType => StringType
+    }
 
-      case (fromDataType, toDataType) => fromDataType == toDataType
+    if (checkComplexTypes) {
+      from.transformRecursively(transform) == to.transformRecursively(transform)
+    } else {
+      (from, to) match {
+        case (a: StringType, b: StringType) => a.constraint == b.constraint
+
+        case (fromDataType, toDataType) => fromDataType == toDataType
+      }
+    }
+  }
+
+  private[sql] def equalsIgnoreCompatibleCollation(
+      from: Seq[DataType],
+      to: Seq[DataType]): Boolean = {
+    from.length == to.length &&
+    from.zip(to).forall { case (fromDataType, toDataType) =>
+      equalsIgnoreCompatibleCollation(fromDataType, toDataType)
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -878,7 +878,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     ArrayType(StringType),
     ArrayType(StringType("UTF8_LCASE")),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     ArrayType(StringType),
@@ -888,7 +888,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     ArrayType(ArrayType(StringType)),
     ArrayType(ArrayType(StringType("UTF8_LCASE"))),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     ArrayType(ArrayType(StringType)),
@@ -913,12 +913,12 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     MapType(StringType, StringType),
     MapType(StringType, StringType("UTF8_LCASE")),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     MapType(StringType("UTF8_LCASE"), StringType),
     MapType(StringType, StringType),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     MapType(StringType("UTF8_LCASE"), StringType),
@@ -943,7 +943,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     MapType(StringType("UTF8_LCASE"), ArrayType(StringType)),
     MapType(StringType("UTF8_LCASE"), ArrayType(StringType("UTF8_LCASE"))),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     MapType(StringType("UTF8_LCASE"), ArrayType(StringType)),
@@ -968,7 +968,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     MapType(ArrayType(StringType), IntegerType),
     MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     MapType(ArrayType(StringType("UTF8_LCASE")), IntegerType),
@@ -998,7 +998,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     StructType(StructField("a", StringType) :: Nil),
     StructType(StructField("a", StringType("UTF8_LCASE")) :: Nil),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     StructType(StructField("a", StringType) :: Nil),
@@ -1023,7 +1023,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     StructType(StructField("a", ArrayType(StringType)) :: Nil),
     StructType(StructField("a", ArrayType(StringType("UTF8_LCASE"))) :: Nil),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     StructType(StructField("a", ArrayType(StringType)) :: Nil),
@@ -1048,7 +1048,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkEqualsIgnoreCompatibleCollation(
     StructType(StructField("a", MapType(StringType, IntegerType)) :: Nil),
     StructType(StructField("a", MapType(StringType("UTF8_LCASE"), IntegerType)) :: Nil),
-    expected = false
+    expected = true
   )
   checkEqualsIgnoreCompatibleCollation(
     StructType(StructField("a", MapType(StringType, IntegerType)) :: Nil),
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -463,7 +463,7 @@ case class AlterTableChangeColumnCommand(
   // when altering column. Only changes in collation of data type or its nested types (recursively)
   // are allowed.
   private def canEvolveType(from: StructField, to: StructField): Boolean = {
-    DataType.equalsIgnoreCompatibleCollation(from.dataType, to.dataType)
+    DataType.equalsIgnoreCompatibleCollation(from.dataType, to.dataType, checkComplexTypes = false)
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.EvalPythonExec.ArgumentMetadata
+import org.apache.spark.sql.types.DataType.equalsIgnoreCompatibleCollation
 import org.apache.spark.sql.types.StructType
 
 /**
@@ -125,8 +126,9 @@ class ArrowEvalPythonEvaluatorFactory(
 
     columnarBatchIter.flatMap { batch =>
       val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType())
-      assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " +
-        s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}")
+      assert(equalsIgnoreCompatibleCollation(outputTypes, actualDataTypes),
+        s"Invalid schema from arrow-enabled Python UDTF: expected ${outputTypes.mkString(", ")}," +
+          s" got ${actualDataTypes.mkString(", ")}")
       batch.rowIterator.asScala
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonUDTFExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonUDTFExec.scala
@@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.python.EvalPythonExec.ArgumentMetadata
+import org.apache.spark.sql.types.DataType.equalsIgnoreCompatibleCollation
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
@@ -81,8 +82,9 @@ case class ArrowEvalPythonUDTFExec(
 
       val actualDataTypes = (0 until flattenedBatch.numCols()).map(
         i => flattenedBatch.column(i).dataType())
-      assert(outputTypes == actualDataTypes, "Invalid schema from arrow-enabled Python UDTF: " +
-        s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}")
+      assert(equalsIgnoreCompatibleCollation(outputTypes, actualDataTypes),
+        s"Invalid schema from arrow-enabled Python UDTF: expected ${outputTypes.mkString(", ")}," +
+          s" got ${actualDataTypes.mkString(", ")}")
 
       flattenedBatch.setNumRows(batch.numRows())
       flattenedBatch.rowIterator().asScala
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala
@@ -78,7 +78,7 @@ object EvaluatePython {
 
     case (d: Decimal, _) => d.toJavaBigDecimal
 
-    case (s: UTF8String, StringType) => s.toString
+    case (s: UTF8String, _: StringType) => s.toString
 
     case (other, _) => other
   }

Original file line number	Diff line number	Diff line change
`@@ -463,7 +463,7 @@ case class AlterTableChangeColumnCommand(`
`463`	`463`	`// when altering column. Only changes in collation of data type or its nested types (recursively)`
`464`	`464`	`// are allowed.`
`465`	`465`	`private def canEvolveType(from: StructField, to: StructField): Boolean = {`
`466`		`- DataType.equalsIgnoreCompatibleCollation(from.dataType, to.dataType)`
	`466`	`+ DataType.equalsIgnoreCompatibleCollation(from.dataType, to.dataType, checkComplexTypes = false)`
`467`	`467`	`}`
`468`	`468`	`}`
`469`	`469`
Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,7 @@ object EvaluatePython {`
`78`	`78`
`79`	`79`	`case (d: Decimal, _) => d.toJavaBigDecimal`
`80`	`80`
`81`		`- case (s: UTF8String, StringType) => s.toString`
	`81`	`+ case (s: UTF8String, _: StringType) => s.toString`
`82`	`82`
`83`	`83`	`case (other, _) => other`
`84`	`84`	`}`