From 054905f5074b8c543d2830e8e0500227f508a85f Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Fri, 26 Nov 2021 11:16:00 +0800 Subject: [PATCH] Bump minimum pandas version to 1.0.5 --- python/docs/source/getting_started/install.rst | 4 ++-- python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst | 1 + python/docs/source/user_guide/sql/arrow_pandas.rst | 2 +- python/pyspark/pandas/tests/test_series.py | 4 +++- python/pyspark/sql/pandas/utils.py | 2 +- python/setup.py | 2 +- 6 files changed, 9 insertions(+), 6 deletions(-) diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 13c6f8f3a28e2..601b45d00a7cf 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -154,11 +154,11 @@ Dependencies ============= ========================= ====================================== Package Minimum supported version Note ============= ========================= ====================================== -`pandas` 0.23.2 Optional for Spark SQL +`pandas` 1.0.5 Optional for Spark SQL `NumPy` 1.7 Required for MLlib DataFrame-based API `pyarrow` 1.0.0 Optional for Spark SQL `Py4J` 0.10.9.2 Required -`pandas` 0.23.2 Required for pandas API on Spark +`pandas` 1.0.5 Required for pandas API on Spark `pyarrow` 1.0.0 Required for pandas API on Spark `Numpy` 1.14 Required for pandas API on Spark ============= ========================= ====================================== diff --git a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst index 060f24c8f41fa..932fc739bb804 100644 --- a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst +++ b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst @@ -21,3 +21,4 @@ Upgrading from PySpark 3.2 to 3.3 ================================= * In Spark 3.3, the ``drop`` method of pandas API on Spark DataFrame supports dropping rows by ``index``, and sets dropping by 
index instead of column by default. +* In Spark 3.3, the minimum required pandas version for PySpark is raised from 0.23.2 to 1.0.5. diff --git a/python/docs/source/user_guide/sql/arrow_pandas.rst b/python/docs/source/user_guide/sql/arrow_pandas.rst index 78d3e7ad84e3f..20a9f935d586f 100644 --- a/python/docs/source/user_guide/sql/arrow_pandas.rst +++ b/python/docs/source/user_guide/sql/arrow_pandas.rst @@ -387,7 +387,7 @@ working with timestamps in ``pandas_udf``\s to get the best performance, see Recommended Pandas and PyArrow Versions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For usage with pyspark.sql, the minimum supported versions of Pandas is 0.23.2 and PyArrow is 1.0.0. +For usage with pyspark.sql, the minimum supported versions of Pandas is 1.0.5 and PyArrow is 1.0.0. Higher versions may be used, however, compatibility and data correctness can not be guaranteed and should be verified by the user. diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 72677d18e4b88..2a861fa697023 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -2232,7 +2232,9 @@ def test_mad(self): pser.index = pmidx psser = ps.from_pandas(pser) - self.assert_eq(pser.mad(), psser.mad()) + # Mark almost as True to avoid precision issue like: + # "21.555555555555554 != 21.555555555555557" + self.assert_eq(pser.mad(), psser.mad(), almost=True) def test_to_frame(self): pser = pd.Series(["a", "b", "c"]) diff --git a/python/pyspark/sql/pandas/utils.py b/python/pyspark/sql/pandas/utils.py index cc0db017c301f..bc6202f854639 100644 --- a/python/pyspark/sql/pandas/utils.py +++ b/python/pyspark/sql/pandas/utils.py @@ -19,7 +19,7 @@ def require_minimum_pandas_version() -> None: """Raise ImportError if minimum version of Pandas is not installed""" # TODO(HyukjinKwon): Relocate and deduplicate the version specification. 
- minimum_pandas_version = "0.23.2" + minimum_pandas_version = "1.0.5" from distutils.version import LooseVersion diff --git a/python/setup.py b/python/setup.py index 4507a2686e2c5..174995d4aec49 100755 --- a/python/setup.py +++ b/python/setup.py @@ -111,7 +111,7 @@ def _supports_symlinks(): # For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. # Also don't forget to update python/docs/source/getting_started/install.rst. -_minimum_pandas_version = "0.23.2" +_minimum_pandas_version = "1.0.5" _minimum_pyarrow_version = "1.0.0"