Skip to content

Commit a6ae941

Browse files
authored
Merge pull request #1238 from astronomy-commons/to_ddf
Implement Catalog.to_dask_dataframe
2 parents 7c94386 + 03e22d2 commit a6ae941

File tree

3 files changed

+47
-1
lines changed

3 files changed

+47
-1
lines changed

docs/reference/catalog_dataframe.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ Dataframe Methods
1212
Catalog.compute
1313
Catalog.get_partition
1414
Catalog.get_partition_index
15-
Catalog.prune_empty_partitions
15+
Catalog.prune_empty_partitions
16+
Catalog.to_dask_dataframe

src/lsdb/catalog/dataset/healpix_dataset.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,44 @@ def to_delayed(self, optimize_graph: bool = True) -> list[Delayed]:
169169
"""
170170
return self._ddf.to_delayed(optimize_graph=optimize_graph)
171171

172+
def to_dask_dataframe(self) -> dd.DataFrame:
173+
"""Convert the dataset to a Dask DataFrame.
174+
175+
Returns
176+
-------
177+
dd.DataFrame
178+
The Dask DataFrame representation of the dataset.
179+
180+
Examples
181+
--------
182+
>>> import lsdb
183+
>>> catalog = lsdb.from_dataframe(pd.DataFrame({"ra":[0, 10], "dec":[5, 15],
184+
... "mag":[21, 22], "mag_err":[.1, .2]}))
185+
>>> ddf = catalog.to_dask_dataframe()
186+
>>> ddf # doctest: +NORMALIZE_WHITESPACE
187+
Dask DataFrame Structure:
188+
ra dec mag mag_err
189+
npartitions=1
190+
1369094286720630784 int64[pyarrow] int64[pyarrow] int64[pyarrow] double[pyarrow]
191+
1441151880758558720 ... ... ... ...
192+
Dask Name: nestedframe, 3 expressions
193+
Expr=Dask NestedFrame Structure:
194+
ra dec mag mag_err
195+
npartitions=1
196+
1369094286720630784 int64[pyarrow] int64[pyarrow] int64[pyarrow] double[pyarrow]
197+
1441151880758558720 ... ... ... ...
198+
Dask Name: nestedframe, 3 expressions
199+
Expr=MapPartitions(NestedFrame)
200+
201+
Notes
202+
-----
203+
This method returns a Dask DataFrame. However, be aware that
204+
the underlying in-memory DataFrame for each partition is still a
205+
nested-pandas NestedFrame, rather than a pandas DataFrame.
206+
"""
207+
# self._ddf is a NestedFrame, which is a subclass of dd.DataFrame
208+
return dd.DataFrame(self._ddf)
209+
172210
@property
173211
def name(self):
174212
"""The name of the catalog"""

tests/lsdb/catalog/test_catalog.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,3 +1028,10 @@ def test_estimate_size(small_sky_source_catalog, capsys):
10281028
total_uncompressed_size = sum(col.total_uncompressed_size for col in column_chunks)
10291029
assert pytest.approx(total_uncompressed_size / 1024, 0.01) == 30.4
10301030
assert pytest.approx(total_compressed_size / 1024, 0.01) == 25.2
1031+
1032+
1033+
def test_to_dask_dataframe(small_sky_order1_catalog):
1034+
ddf = small_sky_order1_catalog.to_dask_dataframe()
1035+
assert isinstance(ddf, dd.DataFrame)
1036+
pd.testing.assert_frame_equal(ddf.compute(), small_sky_order1_catalog.compute())
1037+
assert isinstance(ddf.compute(), npd.NestedFrame)

0 commit comments

Comments
 (0)