
Commit cb8b3a7

1 parent 4a947b9

File tree: 2 files changed, +38 −0 lines changed

duckdb/experimental/spark/sql/dataframe.py
tests/fast/spark/test_spark_dataframe.py

duckdb/experimental/spark/sql/dataframe.py

Lines changed: 30 additions & 0 deletions
@@ -1403,5 +1403,35 @@ def construct_row(values, names) -> Row:
         rows = [construct_row(x, columns) for x in result]
         return rows
 
+    def cache(self) -> "DataFrame":
+        """Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK_DESER`).
+
+        .. versionadded:: 1.3.0
+
+        .. versionchanged:: 3.4.0
+            Supports Spark Connect.
+
+        Notes
+        -----
+        The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.
+
+        Returns
+        -------
+        :class:`DataFrame`
+            Cached DataFrame.
+
+        Examples
+        --------
+        >>> df = spark.range(1)
+        >>> df.cache()
+        DataFrame[id: bigint]
+
+        >>> df.explain()
+        == Physical Plan ==
+        InMemoryTableScan ...
+        """
+        cached_relation = self.relation.execute()
+        return DataFrame(cached_relation, self.session)
+
 
 __all__ = ["DataFrame"]
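
For context, a minimal sketch of how the new method could be exercised through duckdb's experimental Spark-compatible API. The import path, the builder call, and the sample data are assumptions for illustration and are not part of this commit:

# Minimal sketch, assuming duckdb's experimental Spark-compatible session API.
from duckdb.experimental.spark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # assumed builder entry point

# Hypothetical sample data, mirroring the style of the test added below.
df = spark.createDataFrame([(1, 2)], ["a", "b"])

# cache() runs self.relation.execute() and wraps the materialized result
# in a new DataFrame, so the returned object is distinct from `df`.
cached = df.cache()
print(cached.collect())  # expected: [Row(a=1, b=2)]

Judging from the diff, this implementation materializes the relation eagerly via relation.execute() rather than lazily marking the plan for persistence as PySpark's cache() does, which is why the accompanying test asserts that the cached DataFrame is a different object with identical contents.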

tests/fast/spark/test_spark_dataframe.py

Lines changed: 8 additions & 0 deletions
@@ -421,3 +421,11 @@ def test_drop(self, spark):
         assert df.drop("two", "three").columns == expected
         assert df.drop("two", col("three")).columns == expected
         assert df.drop("two", col("three"), col("missing")).columns == expected
+
+    def test_cache(self, spark):
+        data = [(1, 2, 3, 4)]
+        df = spark.createDataFrame(data, ["one", "two", "three", "four"])
+        cached = df.cache()
+        assert df is not cached
+        assert cached.collect() == df.collect()
+        assert cached.collect() == [Row(one=1, two=2, three=3, four=4)]
