Skip to content

Commit 2b07d77

Browse files
dongjoon-hyun authored and HyukjinKwon committed
[SPARK-46076][PYTHON][TESTS] Remove unittest deprecated alias usage for Python 3.12
### What changes were proposed in this pull request? This PR aims to remove `unittest` alias usage for Python 3.12. Currently, it fails like the following. - https://github.com/apache/spark/actions/runs/6971394284/job/18971420822 ``` ====================================================================== ERROR [0.554s]: test_find_spark_home (pyspark.tests.test_util.UtilTests.test_find_spark_home) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/spark/spark/python/pyspark/tests/test_util.py", line 83, in test_find_spark_home self.assertEquals(origin, _find_spark_home()) ^^^^^^^^^^^^^^^^^ AttributeError: 'UtilTests' object has no attribute 'assertEquals'. Did you mean: 'assertEqual'? ``` ### Why are the changes needed? Python 3.12 removes the following deprecated aliases. - https://docs.python.org/3/whatsnew/3.12.html#id3 <img width="802" alt="Screenshot 2023-11-23 at 12 52 33 PM" src="https://github.com/apache/spark/assets/9700541/0158c1a4-fcfc-4a02-85c5-7fcbd6c6a034"> ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs with Python 3.9. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43986 from dongjoon-hyun/SPARK-46076. Authored-by: Dongjoon Hyun <[email protected]> Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent 4670410 commit 2b07d77

17 files changed

+61
-63
lines changed

python/pyspark/ml/tests/test_linalg.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ def test_unwrap_udt(self):
362362
Row(v2=unwrapped_vec(1, None, None, [1.0, 2.0, 3.0])),
363363
Row(v2=unwrapped_vec(0, 3, [1, 2], [1.0, 5.5])),
364364
]
365-
self.assertEquals(results, expected)
365+
self.assertEqual(results, expected)
366366

367367

368368
class MatrixUDTTests(MLlibTestCase):

python/pyspark/pandas/tests/indexes/test_base.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,11 @@ def test_index_basic(self):
6262
self.assert_eq(psdf.index.dtype, pdf.index.dtype)
6363

6464
self.assert_eq(ps.Index([])._summary(), "Index: 0 entries")
65-
with self.assertRaisesRegexp(ValueError, "The truth value of a Index is ambiguous."):
65+
with self.assertRaisesRegex(ValueError, "The truth value of a Index is ambiguous."):
6666
bool(ps.Index([1]))
67-
with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
67+
with self.assertRaisesRegex(TypeError, "Index.name must be a hashable type"):
6868
ps.Index([1, 2, 3], name=[(1, 2, 3)])
69-
with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
69+
with self.assertRaisesRegex(TypeError, "Index.name must be a hashable type"):
7070
ps.Index([1.0, 2.0, 3.0], name=[(1, 2, 3)])
7171

7272
def test_index_from_series(self):

python/pyspark/pandas/tests/indexes/test_category.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@ def test_categorical_index(self):
6868
self.assert_eq(psidx.codes, pd.Index(pidx.codes))
6969
self.assert_eq(psidx.ordered, pidx.ordered)
7070

71-
with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
71+
with self.assertRaisesRegex(TypeError, "Index.name must be a hashable type"):
7272
ps.CategoricalIndex([1, 2, 3], name=[(1, 2, 3)])
73-
with self.assertRaisesRegexp(
73+
with self.assertRaisesRegex(
7474
TypeError, "Cannot perform 'all' with this index type: CategoricalIndex"
7575
):
7676
ps.CategoricalIndex([1, 2, 3]).all()

python/pyspark/pandas/tests/indexes/test_datetime.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@ def _disallow_nanoseconds(self, f):
6464
self.assertRaises(ValueError, lambda: f(freq="N"))
6565

6666
def test_datetime_index(self):
67-
with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
67+
with self.assertRaisesRegex(TypeError, "Index.name must be a hashable type"):
6868
ps.DatetimeIndex(["2004-01-01", "2002-12-31", "2000-04-01"], name=[(1, 2)])
69-
with self.assertRaisesRegexp(
69+
with self.assertRaisesRegex(
7070
TypeError, "Cannot perform 'all' with this index type: DatetimeIndex"
7171
):
7272
ps.DatetimeIndex(["2004-01-01", "2002-12-31", "2000-04-01"]).all()

python/pyspark/pandas/tests/indexes/test_timedelta.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,9 @@ def test_timedelta_index(self):
9090
)
9191

9292
# ps.TimedeltaIndex(ps.Index([1, 2, 3]))
93-
with self.assertRaisesRegexp(TypeError, "Index.name must be a hashable type"):
93+
with self.assertRaisesRegex(TypeError, "Index.name must be a hashable type"):
9494
ps.TimedeltaIndex([timedelta(1), timedelta(microseconds=2)], name=[(1, 2)])
95-
with self.assertRaisesRegexp(
95+
with self.assertRaisesRegex(
9696
TypeError, "Cannot perform 'all' with this index type: TimedeltaIndex"
9797
):
9898
psidx.all()

python/pyspark/sql/tests/connect/test_connect_basic.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,7 +1824,7 @@ def test_observe(self):
18241824

18251825
self.assert_eq(cdf, df)
18261826

1827-
self.assertEquals(cobservation.get, observation.get)
1827+
self.assertEqual(cobservation.get, observation.get)
18281828

18291829
observed_metrics = cdf.attrs["observed_metrics"]
18301830
self.assert_eq(len(observed_metrics), 1)
@@ -3449,11 +3449,11 @@ def test_can_create_multiple_sessions_to_different_remotes(self):
34493449
self.assertIsNotNone(self.spark._client)
34503450
# Creates a new remote session.
34513451
other = PySparkSession.builder.remote("sc://other.remote:114/").create()
3452-
self.assertNotEquals(self.spark, other)
3452+
self.assertNotEqual(self.spark, other)
34533453

34543454
# Gets currently active session.
34553455
same = PySparkSession.builder.remote("sc://other.remote.host:114/").getOrCreate()
3456-
self.assertEquals(other, same)
3456+
self.assertEqual(other, same)
34573457
same.release_session_on_close = False # avoid sending release to dummy connection
34583458
same.stop()
34593459

python/pyspark/sql/tests/connect/test_connect_column.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ def test_simple_binary_expressions(self):
379379
self.assertEqual(len(pdf.index), 4)
380380

381381
res = pd.DataFrame(data={"id": [0, 30, 60, 90]})
382-
self.assert_(pdf.equals(res), f"{pdf.to_string()} != {res.to_string()}")
382+
self.assertTrue(pdf.equals(res), f"{pdf.to_string()} != {res.to_string()}")
383383

384384
def test_literal_with_acceptable_type(self):
385385
for value, dataType in [

python/pyspark/sql/tests/pandas/test_pandas_map.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ def func(iterator):
394394

395395
for offheap in ["true", "false"]:
396396
with self.sql_conf({"spark.sql.columnVector.offheap.enabled": offheap}):
397-
self.assertEquals(
397+
self.assertEqual(
398398
self.spark.read.parquet(path).mapInPandas(func, "id long").head(), Row(0)
399399
)
400400
finally:

python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def test_input_nested_structs(self):
181181

182182
mirror = pandas_udf(lambda s: s, df.dtypes[0][1])
183183

184-
self.assertEquals(
184+
self.assertEqual(
185185
df.select(mirror(df.struct).alias("res")).first(),
186186
Row(
187187
res=Row(
@@ -194,13 +194,13 @@ def test_input_nested_maps(self):
194194
df = self.df_with_nested_maps
195195

196196
str_repr = pandas_udf(lambda s: s.astype(str), StringType())
197-
self.assertEquals(
197+
self.assertEqual(
198198
df.select(str_repr(df.attributes).alias("res")).first(),
199199
Row(res="{'personal': {'name': 'John', 'city': 'New York'}}"),
200200
)
201201

202202
extract_name = pandas_udf(lambda s: s.apply(lambda x: x["personal"]["name"]), StringType())
203-
self.assertEquals(
203+
self.assertEqual(
204204
df.select(extract_name(df.attributes).alias("res")).first(),
205205
Row(res="John"),
206206
)
@@ -209,7 +209,7 @@ def test_input_nested_arrays(self):
209209
df = self.df_with_nested_arrays
210210

211211
str_repr = pandas_udf(lambda s: s.astype(str), StringType())
212-
self.assertEquals(
212+
self.assertEqual(
213213
df.select(str_repr(df.nested_array).alias("res")).first(),
214214
Row(res="[array([1, 2, 3], dtype=int32) array([4, 5], dtype=int32)]"),
215215
)
@@ -1450,9 +1450,7 @@ def udf(x):
14501450

14511451
for offheap in ["true", "false"]:
14521452
with self.sql_conf({"spark.sql.columnVector.offheap.enabled": offheap}):
1453-
self.assertEquals(
1454-
self.spark.read.parquet(path).select(udf("id")).head(), Row(0)
1455-
)
1453+
self.assertEqual(self.spark.read.parquet(path).select(udf("id")).head(), Row(0))
14561454
finally:
14571455
shutil.rmtree(path)
14581456

python/pyspark/sql/tests/streaming/test_streaming.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def test_streaming_query_functions_basic(self):
3636
.start()
3737
)
3838
try:
39-
self.assertEquals(query.name, "test_streaming_query_functions_basic")
39+
self.assertEqual(query.name, "test_streaming_query_functions_basic")
4040
self.assertTrue(isinstance(query.id, str))
4141
self.assertTrue(isinstance(query.runId, str))
4242
self.assertTrue(query.isActive)

0 commit comments

Comments
 (0)