Skip to content

Commit a13187c

Browse files
zhengruifeng authored and dongjoon-hyun committed
[SPARK-53676][PYTHON][TESTS] Skip UDF type check with numpy 1.x
### What changes were proposed in this pull request?

Skip UDF type check in minimum dependency envs

### Why are the changes needed?

The two scheduled jobs are still failing after fix #52247 due to different versions of numpy/pandas/pyarrow/etc. Actually, we don't need to run this test in every env, because the result depends on the combination of versions of numpy/pandas/pyarrow/etc.

### Does this PR introduce _any_ user-facing change?

No, test-only.

### How was this patch tested?

PR builder with
```
default: '{"PYSPARK_IMAGE_TO_TEST": "python-minimum", "PYTHON_TO_TEST": "python3.10", "ENV_NAME": "PYTHON_MINIMUM"}'
```
https://github.com/zhengruifeng/spark/actions/runs/17940562561/job/51016142810

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52419 from zhengruifeng/restore_old_dep_test.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent b6993cb commit a13187c

File tree

2 files changed

+39
-32
lines changed

2 files changed

+39
-32
lines changed

python/pyspark/sql/tests/udf_type_tests/test_udf_input_types.py

Lines changed: 20 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,29 @@
3939
StructType,
4040
TimestampType,
4141
)
42+
from pyspark.loose_version import LooseVersion
43+
from pyspark.testing.utils import (
44+
have_pyarrow,
45+
have_pandas,
46+
have_numpy,
47+
pyarrow_requirement_message,
48+
pandas_requirement_message,
49+
numpy_requirement_message,
50+
)
4251
from pyspark.testing.sqlutils import ReusedSQLTestCase
4352
from .type_table_utils import generate_table_diff, format_type_table
4453

54+
if have_numpy:
55+
import numpy as np
56+
4557

58+
@unittest.skipIf(
59+
not have_pandas
60+
or not have_pyarrow
61+
or not have_numpy
62+
or LooseVersion(np.__version__) < LooseVersion("2.0.0"),
63+
pandas_requirement_message or pyarrow_requirement_message or numpy_requirement_message,
64+
)
4665
class UDFInputTypeTests(ReusedSQLTestCase):
4766
@classmethod
4867
def setUpClass(cls):
@@ -115,38 +134,7 @@ def value_udf(x):
115134
return x
116135

117136
def value_str(x):
118-
class NpPrintable:
119-
def __init__(self, x):
120-
self.x = x
121-
122-
def __repr__(self):
123-
return f"np.{self.x.dtype}({self.x.item()})"
124-
125-
# Numpy 1.x __repr__ returns a different format, see
126-
# https://numpy.org/doc/stable/release/2.0.0-notes.html#representation-of-numpy-scalars-changed # noqa: E501
127-
# We only care about types and values of the elements,
128-
# so we accept this difference and implement our own repr to make
129-
# tests with numpy 1 return the same format as numpy 2.
130-
def convert_to_numpy_printable(x):
131-
import numpy as np
132-
133-
if isinstance(x, Row):
134-
converted_values = tuple(convert_to_numpy_printable(v) for v in x)
135-
new_row = Row(*converted_values)
136-
new_row.__fields__ = x.__fields__
137-
return new_row
138-
elif isinstance(x, (list)):
139-
return [convert_to_numpy_printable(elem) for elem in x]
140-
elif isinstance(x, tuple):
141-
return tuple(convert_to_numpy_printable(elem) for elem in x)
142-
elif isinstance(x, dict):
143-
return {k: convert_to_numpy_printable(v) for k, v in x.items()}
144-
elif isinstance(x, np.generic):
145-
return NpPrintable(x)
146-
else:
147-
return x
148-
149-
return str(convert_to_numpy_printable(x))
137+
return str(x)
150138

151139
type_test_udf = udf(type_udf, returnType=StringType(), useArrow=use_arrow)
152140
value_test_udf = udf(value_udf, returnType=spark_type, useArrow=use_arrow)

python/pyspark/sql/tests/udf_type_tests/test_udf_return_types.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,29 @@
4343
StructType,
4444
TimestampType,
4545
)
46+
from pyspark.loose_version import LooseVersion
47+
from pyspark.testing.utils import (
48+
have_pyarrow,
49+
have_pandas,
50+
have_numpy,
51+
pyarrow_requirement_message,
52+
pandas_requirement_message,
53+
numpy_requirement_message,
54+
)
4655
from pyspark.testing.sqlutils import ReusedSQLTestCase
4756
from .type_table_utils import generate_table_diff, format_type_table
4857

58+
if have_numpy:
59+
import numpy as np
60+
4961

62+
@unittest.skipIf(
63+
not have_pandas
64+
or not have_pyarrow
65+
or not have_numpy
66+
or LooseVersion(np.__version__) < LooseVersion("2.0.0"),
67+
pandas_requirement_message or pyarrow_requirement_message or numpy_requirement_message,
68+
)
5069
class UDFReturnTypeTests(ReusedSQLTestCase):
5170
@classmethod
5271
def setUpClass(cls):

0 commit comments

Comments (0)