Merge pull request #1238 from astronomy-commons/to_ddf

dougbrn · web-flow · commit a6ae9411661e · 2026-01-30T11:51:26.000-08:00
Implement Catalog.to_dask_dataframe
diff --git a/docs/reference/catalog_dataframe.rst b/docs/reference/catalog_dataframe.rst
@@ -12,4 +12,5 @@ Dataframe Methods
     Catalog.compute
     Catalog.get_partition
     Catalog.get_partition_index
-    Catalog.prune_empty_partitions
+    Catalog.prune_empty_partitions
+    Catalog.to_dask_dataframe
diff --git a/src/lsdb/catalog/dataset/healpix_dataset.py b/src/lsdb/catalog/dataset/healpix_dataset.py
@@ -169,6 +169,44 @@ def to_delayed(self, optimize_graph: bool = True) -> list[Delayed]:
         """
         return self._ddf.to_delayed(optimize_graph=optimize_graph)
 
+    def to_dask_dataframe(self) -> dd.DataFrame:
+        """Convert the dataset to a Dask DataFrame.
+
+        Returns
+        -------
+        dd.DataFrame
+            The Dask DataFrame representation of the dataset.
+
+        Examples
+        --------
+        >>> import lsdb
+        >>> catalog = lsdb.from_dataframe(pd.DataFrame({"ra":[0, 10], "dec":[5, 15],
+        ...                                             "mag":[21, 22], "mag_err":[.1, .2]}))
+        >>> ddf = catalog.to_dask_dataframe()
+        >>> ddf  # doctest: +NORMALIZE_WHITESPACE
+        Dask DataFrame Structure:
+                                         ra             dec             mag          mag_err
+        npartitions=1
+        1369094286720630784  int64[pyarrow]  int64[pyarrow]  int64[pyarrow]  double[pyarrow]
+        1441151880758558720             ...             ...             ...              ...
+        Dask Name: nestedframe, 3 expressions
+        Expr=Dask NestedFrame Structure:
+                                         ra             dec             mag          mag_err
+        npartitions=1
+        1369094286720630784  int64[pyarrow]  int64[pyarrow]  int64[pyarrow]  double[pyarrow]
+        1441151880758558720             ...             ...             ...              ...
+        Dask Name: nestedframe, 3 expressions
+        Expr=MapPartitions(NestedFrame)
+
+        Notes
+        -----
+        This method returns a Dask DataFrame. However, be aware that
+        the underlying in-memory DataFrame for each partition is still a
+        nested-pandas NestedFrame, rather than a pandas DataFrame.
+        """
+        # self._ddf is a NestedFrame (a dd.DataFrame subclass); re-wrap it as a plain dd.DataFrame
+        return dd.DataFrame(self._ddf)
+
     @property
     def name(self):
         """The name of the catalog"""
diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py
@@ -1028,3 +1028,10 @@ def test_estimate_size(small_sky_source_catalog, capsys):
     total_uncompressed_size = sum(col.total_uncompressed_size for col in column_chunks)
     assert pytest.approx(total_uncompressed_size / 1024, 0.01) == 30.4
     assert pytest.approx(total_compressed_size / 1024, 0.01) == 25.2
+
+
+def test_to_dask_dataframe(small_sky_order1_catalog):
+    ddf = small_sky_order1_catalog.to_dask_dataframe()
+    assert isinstance(ddf, dd.DataFrame)  # lazy collection is a Dask DataFrame
+    pd.testing.assert_frame_equal(ddf.compute(), small_sky_order1_catalog.compute())
+    assert isinstance(ddf.compute(), npd.NestedFrame)  # computed result is still a NestedFrame