[SPARK-54763][TEST] Accelerate test_udf_return_types with multi-threading

gaogaotiantian · HyukjinKwon · commit 1aa4b15eec3d · 2025-12-19T17:27:12.000+09:00
### What changes were proposed in this pull request? Use multi-threading to accelerate `test_udf_return_types`. Locally it yields more than a 2x speedup (113s -> 50s). ### Why are the changes needed? `test_udf_return_types` is one of the slowest tests we have. It takes about 300s on normal CI and is even slower under coverage. This simple and straightforward fix can save us >50% of the time spent on this test. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Measured the speedup locally. ### Was this patch authored or co-authored using generative AI tooling? No Closes #53533 from gaogaotiantian/udfreturntype-accelerate. Authored-by: Tian Gao <gaogaotiantian@hotmail.com> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
diff --git a/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py b/python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py
@@ -16,6 +16,7 @@
 #
 
 import array
+import concurrent.futures
 import datetime
 import os
 import platform
@@ -217,9 +218,7 @@ def _compare_or_create_golden_file(self, actual_output, golden_file, test_name):
             self.fail(f"Golden file created for {test_name}. Please review and re-run the test.")
 
     def _generate_udf_return_type_coercion_results(self, use_arrow):
-        results = []
-
-        for spark_type in self.test_types:
+        def work(spark_type):
             result = [spark_type.simpleString()]
             for value in self.test_data:
                 try:
@@ -233,9 +232,10 @@ def _generate_udf_return_type_coercion_results(self, use_arrow):
                 except Exception:
                     result_value = "X"
                 result.append(result_value)
-            results.append(result)
+            return result
 
-        return results
+        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+            return list(executor.map(work, self.test_types))
 
     def test_pandas_udf_return_type_coercion(self):
         golden_file = os.path.join(
@@ -252,9 +252,7 @@ def test_pandas_udf_return_type_coercion(self):
         self._compare_or_create_golden_file(actual_output, golden_file, test_name)
 
     def _generate_pandas_udf_type_coercion_results(self):
-        results = []
-
-        for spark_type in self.test_types:
+        def work(spark_type):
             result = [spark_type.simpleString()]
             for value in self.pandas_test_data:
                 try:
@@ -276,9 +274,10 @@ def pandas_udf_func(series: pd.Series) -> pd.Series:
                 except Exception:
                     ret_str = "X"
                 result.append(ret_str)
-            results.append(result)
+            return result
 
-        return results
+        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+            return list(executor.map(work, self.test_types))
 
 
 if __name__ == "__main__":