Commit 551e7f2

[SPARK-53619][PYTHON][DOCS][TESTS] Enable doctests for toArrow/toPandas/mapInArrow/mapInPandas
### What changes were proposed in this pull request?
Enable doctests for toArrow/toPandas/mapInArrow/mapInPandas.

### Why are the changes needed?
For test coverage, to make sure the examples are correct.

### Does this PR introduce _any_ user-facing change?
Doc-only changes.

### How was this patch tested?
CI.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #52366 from zhengruifeng/enable_doc_map_inxxx.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>

1 parent: 3080e61

3 files changed (+27, -12 lines)
python/pyspark/sql/classic/dataframe.py (8 additions, 0 deletions)

```diff
@@ -1938,10 +1938,18 @@ def _test() -> None:
     import doctest
     from pyspark.sql import SparkSession
     import pyspark.sql.dataframe
+    from pyspark.testing.utils import have_pandas, have_pyarrow

     # It inherits docstrings but doctests cannot detect them so we run
     # the parent classe's doctests here directly.
     globs = pyspark.sql.dataframe.__dict__.copy()
+
+    if not have_pandas or not have_pyarrow:
+        del pyspark.sql.dataframe.DataFrame.toArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.toPandas.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInPandas.__doc__
+
     spark = (
         SparkSession.builder.master("local[4]").appName("sql.classic.dataframe tests").getOrCreate()
     )
```
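The guard works because doctest only collects examples from a live `__doc__`; deleting the docstring removes its examples from the run without touching the method itself. A minimal sketch of that mechanism (the `Widget` class is hypothetical, not from this commit):

```python
import doctest

class Widget:
    def double(self, x):
        """
        >>> Widget().double(2)
        4
        """
        return 2 * x

# With the docstring present, doctest collects and runs the example.
print(doctest.testmod())  # TestResults(failed=0, attempted=1)

# After the PR-style deletion there is nothing left to collect, so the
# example is skipped instead of failing on a missing dependency.
del Widget.double.__doc__
print(doctest.testmod())  # TestResults(failed=0, attempted=0)
```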

python/pyspark/sql/connect/dataframe.py (7 additions, 0 deletions)

```diff
@@ -2304,6 +2304,7 @@ def _test() -> None:
     from pyspark.util import is_remote_only
     from pyspark.sql import SparkSession as PySparkSession
     import pyspark.sql.dataframe
+    from pyspark.testing.utils import have_pandas, have_pyarrow

     # It inherits docstrings but doctests cannot detect them so we run
     # the parent classe's doctests here directly.
@@ -2315,6 +2316,12 @@ def _test() -> None:
     del pyspark.sql.dataframe.DataFrame.toJSON.__doc__
     del pyspark.sql.dataframe.DataFrame.rdd.__doc__

+    if not have_pandas or not have_pyarrow:
+        del pyspark.sql.dataframe.DataFrame.toArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.toPandas.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInPandas.__doc__
+
     globs["spark"] = (
         PySparkSession.builder.appName("sql.connect.dataframe tests")
         .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[4]"))
```
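Both `_test()` hooks key off `have_pandas` and `have_pyarrow` from `pyspark.testing.utils`. Those flags are import probes; the sketch below shows the general pattern rather than Spark's verbatim definitions, which also enforce minimum versions:

```python
# Import-availability flags in the style of pyspark.testing.utils
# (assumed shape; the real module additionally checks minimum versions
# and keeps a human-readable requirement message for skip reports).
try:
    import pandas  # noqa: F401
    have_pandas = True
except ImportError:
    have_pandas = False

try:
    import pyarrow  # noqa: F401
    have_pyarrow = True
except ImportError:
    have_pyarrow = False
```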

python/pyspark/sql/dataframe.py (12 additions, 12 deletions)

```diff
@@ -6364,7 +6364,7 @@ def mapInPandas(
         ...     for pdf in iterator:
         ...         yield pdf[pdf.id == 1]
         ...
-        >>> df.mapInPandas(filter_func, df.schema).show()  # doctest: +SKIP
+        >>> df.mapInPandas(filter_func, df.schema).show()
         +---+---+
         | id|age|
         +---+---+
@@ -6377,7 +6377,7 @@ def mapInPandas(
         ...     for pdf in iterator:
         ...         yield pdf.groupby("id").mean().reset_index()
         ...
-        >>> df.mapInPandas(mean_age, "id: bigint, age: double").show()  # doctest: +SKIP
+        >>> df.mapInPandas(mean_age, "id: bigint, age: double").show()
         +---+----+
         | id| age|
         +---+----+
@@ -6393,7 +6393,7 @@ def mapInPandas(
         ...         yield pdf
         ...
         >>> df.mapInPandas(
-        ...     double_age, "id: bigint, age: bigint, double_age: bigint").show()  # doctest: +SKIP
+        ...     double_age, "id: bigint, age: bigint, double_age: bigint").show()
         +---+---+----------+
         | id|age|double_age|
         +---+---+----------+
@@ -6405,7 +6405,7 @@ def mapInPandas(
         barrier mode, it ensures all Python workers in the stage will be
         launched concurrently.

-        >>> df.mapInPandas(filter_func, df.schema, barrier=True).collect()  # doctest: +SKIP
+        >>> df.mapInPandas(filter_func, df.schema, barrier=True).collect()
         [Row(id=1, age=21)]

         See Also
```
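With `# doctest: +SKIP` removed, these examples now execute against the live `spark` session that `_test()` places in the doctest globals. A self-contained version of the first example, assuming the same two-row `df` the docstring sets up:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))

def filter_func(iterator):
    # mapInPandas feeds an iterator of pandas DataFrames, one per batch.
    for pdf in iterator:
        yield pdf[pdf.id == 1]

df.mapInPandas(filter_func, df.schema).show()
# +---+---+
# | id|age|
# +---+---+
# |  1| 21|
# +---+---+

spark.stop()
```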
```diff
@@ -6457,13 +6457,13 @@ def mapInArrow(

         Examples
         --------
-        >>> import pyarrow  # doctest: +SKIP
+        >>> import pyarrow as pa
         >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
         >>> def filter_func(iterator):
         ...     for batch in iterator:
         ...         pdf = batch.to_pandas()
-        ...         yield pyarrow.RecordBatch.from_pandas(pdf[pdf.id == 1])
-        >>> df.mapInArrow(filter_func, df.schema).show()  # doctest: +SKIP
+        ...         yield pa.RecordBatch.from_pandas(pdf[pdf.id == 1])
+        >>> df.mapInArrow(filter_func, df.schema).show()
         +---+---+
         | id|age|
         +---+---+
@@ -6474,7 +6474,7 @@ def mapInArrow(
         barrier mode, it ensures all Python workers in the stage will be
         launched concurrently.

-        >>> df.mapInArrow(filter_func, df.schema, barrier=True).collect()  # doctest: +SKIP
+        >>> df.mapInArrow(filter_func, df.schema, barrier=True).collect()
         [Row(id=1, age=21)]

         See Also
```
```diff
@@ -6503,13 +6503,13 @@ def toArrow(self) -> "pa.Table":
         Examples
         --------
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
-        >>> df.toArrow()  # doctest: +SKIP
+        >>> df.coalesce(1).toArrow()
         pyarrow.Table
         age: int64
         name: string
         ----
-        age: [[2],[5]]
-        name: [["Alice"],["Bob"]]
+        age: [[2,5]]
+        name: [["Alice","Bob"]]
         """
         ...

```
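The switch from `df.toArrow()` to `df.coalesce(1).toArrow()` is what makes the expected output stable: a `pyarrow.Table` assembled from collected record batches keeps one chunk per batch, so the printed chunk layout (`[[2],[5]]` vs `[[2,5]]`) tracks the number of partitions. A sketch of that inference (not spelled out in the PR itself):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

# Chunk count follows the collected batches, which track partitioning;
# coalesce(1) pins the doctest output to a single chunk.
print(df.repartition(2).toArrow().column("age").num_chunks)  # typically 2
print(df.coalesce(1).toArrow().column("age").num_chunks)     # 1

spark.stop()
```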
```diff
@@ -6534,7 +6534,7 @@ def toPandas(self) -> "PandasDataFrameLike":
         Examples
         --------
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
-        >>> df.toPandas()  # doctest: +SKIP
+        >>> df.toPandas()
            age   name
         0    2  Alice
         1    5    Bob
```
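No extra wiring is needed to get these examples into CI: each module's tail hands the (possibly pruned) docstrings to `doctest.testmod`, and a `__main__` guard triggers it. A sketch assembled from the diffs above plus the usual PySpark module tail; the option flags here are typical, not verbatim:

```python
import doctest
import sys

import pyspark.sql.dataframe
from pyspark.sql import SparkSession
from pyspark.testing.utils import have_pandas, have_pyarrow

def _test() -> None:
    globs = pyspark.sql.dataframe.__dict__.copy()

    # Prune the newly enabled examples when their dependencies are absent.
    if not have_pandas or not have_pyarrow:
        del pyspark.sql.dataframe.DataFrame.toArrow.__doc__
        del pyspark.sql.dataframe.DataFrame.toPandas.__doc__
        del pyspark.sql.dataframe.DataFrame.mapInArrow.__doc__
        del pyspark.sql.dataframe.DataFrame.mapInPandas.__doc__

    spark = SparkSession.builder.master("local[4]").getOrCreate()
    globs["spark"] = spark
    failure_count, test_count = doctest.testmod(
        pyspark.sql.dataframe,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)

if __name__ == "__main__":
    _test()
```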
