Commit 551e7f2

[SPARK-53619][PYTHON][DOCS][TESTS] Enable doctests for toArrow/toPandas/mapInArrow/mapInPandas
### What changes were proposed in this pull request?
Enable doctests for toArrow/toPandas/mapInArrow/mapInPandas.

### Why are the changes needed?
For test coverage, to make sure the examples are correct.

### Does this PR introduce _any_ user-facing change?
Doc-only changes.

### How was this patch tested?
CI.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #52366 from zhengruifeng/enable_doc_map_inxxx.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>

1 parent: 3080e61

3 files changed (+27, -12 lines)
python/pyspark/sql/classic/dataframe.py (8 additions, 0 deletions)

```diff
@@ -1938,10 +1938,18 @@ def _test() -> None:
     import doctest
     from pyspark.sql import SparkSession
     import pyspark.sql.dataframe
+    from pyspark.testing.utils import have_pandas, have_pyarrow

     # It inherits docstrings but doctests cannot detect them so we run
     # the parent classe's doctests here directly.
     globs = pyspark.sql.dataframe.__dict__.copy()
+
+    if not have_pandas or not have_pyarrow:
+        del pyspark.sql.dataframe.DataFrame.toArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.toPandas.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInPandas.__doc__
+
     spark = (
         SparkSession.builder.master("local[4]").appName("sql.classic.dataframe tests").getOrCreate()
     )
```
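The guard works because doctest only collects examples from a live `__doc__`; deleting the docstring removes its examples from the run without touching the method itself. A minimal sketch of that mechanism (the `Widget` class is hypothetical, not from this commit):

```python
import doctest

class Widget:
    def double(self, x):
        """
        >>> Widget().double(2)
        4
        """
        return 2 * x

# With the docstring present, doctest collects and runs the example.
print(doctest.testmod())  # TestResults(failed=0, attempted=1)

# After the PR-style deletion there is nothing left to collect, so the
# example is skipped instead of failing on a missing dependency.
del Widget.double.__doc__
print(doctest.testmod())  # TestResults(failed=0, attempted=0)
```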

python/pyspark/sql/connect/dataframe.py (7 additions, 0 deletions)

```diff
@@ -2304,6 +2304,7 @@ def _test() -> None:
     from pyspark.util import is_remote_only
     from pyspark.sql import SparkSession as PySparkSession
     import pyspark.sql.dataframe
+    from pyspark.testing.utils import have_pandas, have_pyarrow

     # It inherits docstrings but doctests cannot detect them so we run
     # the parent classe's doctests here directly.
@@ -2315,6 +2316,12 @@ def _test() -> None:
     del pyspark.sql.dataframe.DataFrame.toJSON.__doc__
     del pyspark.sql.dataframe.DataFrame.rdd.__doc__

+    if not have_pandas or not have_pyarrow:
+        del pyspark.sql.dataframe.DataFrame.toArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.toPandas.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInArrow.__doc__
+        del pyspark.sql.dataframe.DataFrame.mapInPandas.__doc__
+
     globs["spark"] = (
         PySparkSession.builder.appName("sql.connect.dataframe tests")
         .remote(os.environ.get("SPARK_CONNECT_TESTING_REMOTE", "local[4]"))
```
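Both `_test()` hooks key off `have_pandas` and `have_pyarrow` from `pyspark.testing.utils`. Those flags are import probes; the sketch below shows the general pattern rather than Spark's verbatim definitions, which also enforce minimum versions:

```python
# Import-availability flags in the style of pyspark.testing.utils
# (assumed shape; the real module additionally checks minimum versions
# and keeps a human-readable requirement message for skip reports).
try:
    import pandas  # noqa: F401
    have_pandas = True
except ImportError:
    have_pandas = False

try:
    import pyarrow  # noqa: F401
    have_pyarrow = True
except ImportError:
    have_pyarrow = False
```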

python/pyspark/sql/dataframe.py (12 additions, 12 deletions)

```diff
@@ -6364,7 +6364,7 @@ def mapInPandas(
         ...     for pdf in iterator:
         ...         yield pdf[pdf.id == 1]
         ...
-        >>> df.mapInPandas(filter_func, df.schema).show()  # doctest: +SKIP
+        >>> df.mapInPandas(filter_func, df.schema).show()
         +---+---+
         | id|age|
         +---+---+
@@ -6377,7 +6377,7 @@ def mapInPandas(
         ...     for pdf in iterator:
         ...         yield pdf.groupby("id").mean().reset_index()
         ...
-        >>> df.mapInPandas(mean_age, "id: bigint, age: double").show()  # doctest: +SKIP
+        >>> df.mapInPandas(mean_age, "id: bigint, age: double").show()
         +---+----+
         | id| age|
         +---+----+
@@ -6393,7 +6393,7 @@ def mapInPandas(
         ...         yield pdf
         ...
         >>> df.mapInPandas(
-        ...     double_age, "id: bigint, age: bigint, double_age: bigint").show()  # doctest: +SKIP
+        ...     double_age, "id: bigint, age: bigint, double_age: bigint").show()
         +---+---+----------+
         | id|age|double_age|
         +---+---+----------+
@@ -6405,7 +6405,7 @@ def mapInPandas(
         barrier mode, it ensures all Python workers in the stage will be
         launched concurrently.

-        >>> df.mapInPandas(filter_func, df.schema, barrier=True).collect()  # doctest: +SKIP
+        >>> df.mapInPandas(filter_func, df.schema, barrier=True).collect()
         [Row(id=1, age=21)]

         See Also
```
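With `# doctest: +SKIP` removed, these examples now execute against the live `spark` session that `_test()` places in the doctest globals. A self-contained version of the first example, assuming the same two-row `df` the docstring sets up:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))

def filter_func(iterator):
    # mapInPandas feeds an iterator of pandas DataFrames, one per batch.
    for pdf in iterator:
        yield pdf[pdf.id == 1]

df.mapInPandas(filter_func, df.schema).show()
# +---+---+
# | id|age|
# +---+---+
# |  1| 21|
# +---+---+

spark.stop()
```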
```diff
@@ -6457,13 +6457,13 @@ def mapInArrow(

         Examples
         --------
-        >>> import pyarrow  # doctest: +SKIP
+        >>> import pyarrow as pa
         >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
         >>> def filter_func(iterator):
         ...     for batch in iterator:
         ...         pdf = batch.to_pandas()
-        ...         yield pyarrow.RecordBatch.from_pandas(pdf[pdf.id == 1])
-        >>> df.mapInArrow(filter_func, df.schema).show()  # doctest: +SKIP
+        ...         yield pa.RecordBatch.from_pandas(pdf[pdf.id == 1])
+        >>> df.mapInArrow(filter_func, df.schema).show()
         +---+---+
         | id|age|
         +---+---+
@@ -6474,7 +6474,7 @@ def mapInArrow(
         barrier mode, it ensures all Python workers in the stage will be
         launched concurrently.

-        >>> df.mapInArrow(filter_func, df.schema, barrier=True).collect()  # doctest: +SKIP
+        >>> df.mapInArrow(filter_func, df.schema, barrier=True).collect()
         [Row(id=1, age=21)]

         See Also
```
```diff
@@ -6503,13 +6503,13 @@ def toArrow(self) -> "pa.Table":
         Examples
         --------
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
-        >>> df.toArrow()  # doctest: +SKIP
+        >>> df.coalesce(1).toArrow()
         pyarrow.Table
         age: int64
         name: string
         ----
-        age: [[2],[5]]
-        name: [["Alice"],["Bob"]]
+        age: [[2,5]]
+        name: [["Alice","Bob"]]
         """
         ...

```
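The switch from `df.toArrow()` to `df.coalesce(1).toArrow()` is what makes the expected output stable: a `pyarrow.Table` assembled from collected record batches keeps one chunk per batch, so the printed chunk layout (`[[2],[5]]` vs `[[2,5]]`) tracks the number of partitions. A sketch of that inference (not spelled out in the PR itself):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

# Chunk count follows the collected batches, which track partitioning;
# coalesce(1) pins the doctest output to a single chunk.
print(df.repartition(2).toArrow().column("age").num_chunks)  # typically 2
print(df.coalesce(1).toArrow().column("age").num_chunks)     # 1

spark.stop()
```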
```diff
@@ -6534,7 +6534,7 @@ def toPandas(self) -> "PandasDataFrameLike":
         Examples
         --------
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
-        >>> df.toPandas()  # doctest: +SKIP
+        >>> df.toPandas()
            age   name
         0    2  Alice
         1    5    Bob
```
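No extra wiring is needed to get these examples into CI: each module's tail hands the (possibly pruned) docstrings to `doctest.testmod`, and a `__main__` guard triggers it. A sketch assembled from the diffs above plus the usual PySpark module tail; the option flags here are typical, not verbatim:

```python
import doctest
import sys

import pyspark.sql.dataframe
from pyspark.sql import SparkSession
from pyspark.testing.utils import have_pandas, have_pyarrow

def _test() -> None:
    globs = pyspark.sql.dataframe.__dict__.copy()

    # Prune the newly enabled examples when their dependencies are absent.
    if not have_pandas or not have_pyarrow:
        del pyspark.sql.dataframe.DataFrame.toArrow.__doc__
        del pyspark.sql.dataframe.DataFrame.toPandas.__doc__
        del pyspark.sql.dataframe.DataFrame.mapInArrow.__doc__
        del pyspark.sql.dataframe.DataFrame.mapInPandas.__doc__

    spark = SparkSession.builder.master("local[4]").getOrCreate()
    globs["spark"] = spark
    failure_count, test_count = doctest.testmod(
        pyspark.sql.dataframe,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)

if __name__ == "__main__":
    _test()
```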
