[python][RDF] Extend AsNumpy to convert nested collection data to numpy arrays

lobis · dpiparo · commit 6a6ed1afdd65 · 2025-05-10T10:08:28.000+02:00
diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py
@@ -96,14 +96,35 @@ def pypowarray(numpyvec, pow):
 
 Eventually, you probably would like to inspect the content of the RDataFrame or process the data further
 with Python libraries. For this purpose, we provide the `AsNumpy()` function, which returns the columns
-of your RDataFrame as a dictionary of NumPy arrays. See a simple example below or a full tutorial [here](df026__AsNumpyArrays_8py.html).
+of your RDataFrame as a dictionary of NumPy arrays. See a few simple examples below or a full tutorial [here](df026__AsNumpyArrays_8py.html).
 
+\anchor asnumpy_scalar_columns
+##### Scalar columns
+If your column contains scalar values of fundamental types (e.g., integers, floats), `AsNumpy()` produces NumPy arrays with the appropriate `dtype`:
 ~~~{.py}
-df = ROOT.RDataFrame("myTree", "myFile.root")
-cols = df.Filter("x > 10").AsNumpy(["x", "y"]) # retrieve columns "x" and "y" as NumPy arrays
-print(cols["x"], cols["y"]) # the values of the cols dictionary are NumPy arrays
+rdf = ROOT.RDataFrame(10).Define("int_col", "1").Define("float_col", "2.3")
+print(rdf.AsNumpy(["int_col", "float_col"]))
+# Output: {'int_col': array([...], dtype=int32), 'float_col': array([...], dtype=float64)}
+~~~
+
+Columns containing non-fundamental types (e.g., objects, strings) will result in NumPy arrays with `dtype=object`.
+
+##### Collection columns
+If your column contains collections of fundamental types (e.g., std::vector<int>), `AsNumpy()` produces a NumPy array with `dtype=object` where each 
+element is a NumPy array representing the collection for its corresponding entry in the column.
+
+If the collection at a certain entry contains values of fundamental types, or if it is a regularly shaped multi-dimensional array of a fundamental type,
+then the NumPy array representing the collection for that entry will have the `dtype` associated with the value type of the collection, for example:
+~~~{.py}
+rdf = rdf.Define("v_col", "std::vector<int>{{1, 2, 3}}")
+print(rdf.AsNumpy(["v_col", "int_col", "float_col"]))
+# Output: {'v_col': array([array([1, 2, 3], dtype=int32), ...], dtype=object), ...}
 ~~~
 
+If the collection at a certain entry contains values of a non-fundamental type, `AsNumpy()` will fall back on the [default behavior](\ref asnumpy_scalar_columns) and produce a NumPy array with `dtype=object` for that collection.
+
+For more complex collection types in your entries, e.g. when every entry has a jagged array value, refer to the section on [interoperability with AwkwardArray](\ref awkward_interop).
+
 #### Processing data stored in NumPy arrays
 
 In case you have data in NumPy arrays in Python and you want to process the data with ROOT, you can easily
@@ -124,6 +145,8 @@ def pypowarray(numpyvec, pow):
 df.Define("z", "x + y").Snapshot("tree", "file.root")
 ~~~
 
+
+\anchor awkward_interop
 ### Interoperability with [AwkwardArray](https://awkward-array.org/doc/main/user-guide/how-to-convert-rdataframe.html)
 
 The function for RDataFrame to Awkward conversion is ak.from_rdataframe(). The argument to this function accepts a tuple of strings that are the RDataFrame column names. By default this function returns ak.Array type.
@@ -204,11 +227,20 @@ def pypowarray(numpyvec, pow):
 \endpythondoc
 '''
 
+from __future__ import annotations
+
+from typing import Iterable, Optional
+
 from . import pythonization
 from ._pyz_utils import MethodTemplateGetter, MethodTemplateWrapper
 
 
-def RDataFrameAsNumpy(df, columns=None, exclude=None, lazy=False):
+def RDataFrameAsNumpy(
+    df: ROOT.RDataFrame,  # noqa: F821
+    columns: Optional[Iterable[str]] = None,
+    exclude: Optional[Iterable[str]] = None,
+    lazy: bool = False,
+):
     """Read-out the RDataFrame as a collection of numpy arrays.
 
     The values of the dataframe are read out as numpy array of the respective type
@@ -226,6 +258,7 @@ def RDataFrameAsNumpy(df, columns=None, exclude=None, lazy=False):
     event-loop.
 
     Parameters:
+        df: The RDataFrame to read out.
         columns: If None return all branches as columns, otherwise specify names in iterable.
         exclude: Exclude branches from selection.
         lazy: Determines whether this action is instant (False, default) or lazy (True).
@@ -240,9 +273,9 @@ def RDataFrameAsNumpy(df, columns=None, exclude=None, lazy=False):
 
     # Sanitize input arguments
     if isinstance(columns, str):
-        raise TypeError("The columns argument requires a list of strings")
+        raise TypeError("The columns argument requires an iterable of strings")
     if isinstance(exclude, str):
-        raise TypeError("The exclude argument requires a list of strings")
+        raise TypeError("The exclude argument requires an iterable of strings")
 
     # Early check for numpy
     try:
@@ -310,7 +343,7 @@ def __init__(self, result_ptrs, columns):
         self._columns = columns
         self._py_arrays = None
 
-    def GetValue(self):
+    def GetValue(self) -> dict:
         """Triggers, if necessary, the event loop to run the Take actions for
         the requested columns and produce the NumPy arrays as result.
 
@@ -334,7 +367,11 @@ def GetValue(self):
                 else:
                     tmp = numpy.empty(len(cpp_reference), dtype=object)
                     for i, x in enumerate(cpp_reference):
-                        tmp[i] = x  # This creates only the wrapping of the objects and does not copy.
+                        if hasattr(x, "__array_interface__"):
+                            tmp[i] = numpy.asarray(x)
+                        else:
+                            tmp[i] = x
+
                     self._py_arrays[column] = ndarray(tmp, self._result_ptrs[column])
 
         return self._py_arrays
diff --git a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py
@@ -1,8 +1,11 @@
-import unittest
-import ROOT
-import numpy as np
 import pickle
+import platform
+import tempfile
+import unittest
+from pathlib import Path
 
+import numpy as np
+import ROOT
 from ROOT._pythonization._rdataframe import _clone_asnumpyresult
 
 
@@ -38,8 +41,7 @@ def make_tree(*dtypes):
         elif "O" in dtype:
             var = np.empty(1, dtype=np.uint8)
         else:
-            raise Exception(
-                "Type {} not known to create branch.".format(dtype))
+            raise Exception("Type {} not known to create branch.".format(dtype))
         col_vars.append(var)
 
     for dtype, name, var in zip(dtypes, col_names, col_vars):
@@ -71,6 +73,7 @@ class RDataFrameAsNumpy(unittest.TestCase):
     """
     Testing of RDataFrame.AsNumpy pythonization
     """
+
     def test_branch_dtypes(self):
         """
         Test supported data-types for read-out
@@ -89,8 +92,8 @@ def test_branch_bool(self):
         """
         df = ROOT.RDataFrame(2).Define("x", "bool(rdfentry_)")
         npy = df.AsNumpy()
-        self.assertTrue(bool(npy["x"][0]) == False)
-        self.assertTrue(bool(npy["x"][1]) == True)
+        self.assertFalse(bool(npy["x"][0]))
+        self.assertTrue(bool(npy["x"][1]))
 
     def test_read_array(self):
         """
@@ -131,12 +134,11 @@ def test_read_vector_constantsize(self):
             return std::vector<unsigned int>({n, n, n});
         }
         """)
-        df = ROOT.ROOT.RDataFrame(5).Define("x",
-                                       "create_vector_constantsize(rdfentry_)")
+        df = ROOT.ROOT.RDataFrame(5).Define("x", "create_vector_constantsize(rdfentry_)")
         npy = df.AsNumpy()
         self.assertEqual(npy["x"].size, 5)
         self.assertEqual(list(npy["x"][0]), [0, 0, 0])
-        self.assertIn("vector<unsigned int>", str(type(npy["x"][0])))
+        self.assertTrue(isinstance(npy["x"], np.ndarray))
 
     def test_read_vector_variablesize(self):
         """
@@ -147,12 +149,11 @@ def test_read_vector_variablesize(self):
             return std::vector<unsigned int>(n);
         }
         """)
-        df = ROOT.ROOT.RDataFrame(5).Define("x",
-                                       "create_vector_variablesize(rdfentry_)")
+        df = ROOT.ROOT.RDataFrame(5).Define("x", "create_vector_variablesize(rdfentry_)")
         npy = df.AsNumpy()
         self.assertEqual(npy["x"].size, 5)
         self.assertEqual(list(npy["x"][3]), [0, 0, 0])
-        self.assertIn("vector<unsigned int>", str(type(npy["x"][0])))
+        self.assertTrue(isinstance(npy["x"], np.ndarray))
 
     def test_read_tlorentzvector(self):
         """
@@ -197,8 +198,7 @@ def test_define_columns(self):
         """
         Testing reading defined columns
         """
-        df = ROOT.ROOT.RDataFrame(4).Define("x", "1").Define("y", "2").Define(
-            "z", "3")
+        df = ROOT.ROOT.RDataFrame(4).Define("x", "1").Define("y", "2").Define("z", "3")
         npy = df.AsNumpy(columns=["x", "y"])
         ref = {"x": np.array([1] * 4), "y": np.array([2] * 4)}
         self.assertTrue(sorted(["x", "y"]) == sorted(npy.keys()))
@@ -209,16 +209,14 @@ def test_exclude_columns(self):
         """
         Testing excluding columns from read-out
         """
-        df = ROOT.ROOT.RDataFrame(4).Define("x", "1").Define("y", "2").Define(
-            "z", "3")
+        df = ROOT.ROOT.RDataFrame(4).Define("x", "1").Define("y", "2").Define("z", "3")
         npy = df.AsNumpy(exclude=["z"])
         ref = {"x": np.array([1] * 4), "y": np.array([2] * 4)}
         self.assertTrue(sorted(["x", "y"]) == sorted(npy.keys()))
         self.assertTrue(all(ref["x"] == npy["x"]))
         self.assertTrue(all(ref["y"] == npy["y"]))
 
-        df2 = ROOT.ROOT.RDataFrame(4).Define("x", "1").Define("y", "2").Define(
-            "z", "3")
+        df2 = ROOT.ROOT.RDataFrame(4).Define("x", "1").Define("y", "2").Define("z", "3")
         npy = df2.AsNumpy(columns=["x", "y"], exclude=["y"])
         ref = {"x": np.array([1] * 4)}
         self.assertTrue(["x"] == list(npy.keys()))
@@ -264,7 +262,7 @@ def test_empty_array(self):
         df = ROOT.ROOT.RDataFrame(1).Define("x", "std::vector<float>()")
         npy = df.AsNumpy(["x"])
         self.assertEqual(npy["x"].size, 1)
-        self.assertTrue(npy["x"][0].empty())
+        self.assertEqual(npy["x"][0].size, 0)
 
     def test_empty_selection(self):
         """
@@ -319,19 +317,15 @@ def test_cloning(self):
 
         # Get the result for the first range
         (begin, end) = ranges.pop(0)
-        ROOT.Internal.RDF.ChangeEmptyEntryRange(
-            ROOT.RDF.AsRNode(df), (begin, end))
+        ROOT.Internal.RDF.ChangeEmptyEntryRange(ROOT.RDF.AsRNode(df), (begin, end))
         asnumpyres = df.AsNumpy(["x"], lazy=True)  # To return an AsNumpyResult
-        self.assertSequenceEqual(
-            asnumpyres.GetValue()["x"].tolist(), np.arange(begin, end).tolist())
+        self.assertSequenceEqual(asnumpyres.GetValue()["x"].tolist(), np.arange(begin, end).tolist())
 
         # Clone the result for following ranges
-        for (begin, end) in ranges:
-            ROOT.Internal.RDF.ChangeEmptyEntryRange(
-                ROOT.RDF.AsRNode(df), (begin, end))
+        for begin, end in ranges:
+            ROOT.Internal.RDF.ChangeEmptyEntryRange(ROOT.RDF.AsRNode(df), (begin, end))
             asnumpyres = _clone_asnumpyresult(asnumpyres)
-            self.assertSequenceEqual(
-                asnumpyres.GetValue()["x"].tolist(), np.arange(begin, end).tolist())
+            self.assertSequenceEqual(asnumpyres.GetValue()["x"].tolist(), np.arange(begin, end).tolist())
 
     def test_bool_column(self):
         """
@@ -343,8 +337,57 @@ def test_bool_column(self):
         df = ROOT.RDataFrame(n_events).Define(name, f"(int)rdfentry_ > {cut}")
         arr = df.AsNumpy([name])[name]
         ref = np.arange(0, n_events) > cut
-        self.assertTrue(all(arr == ref)) # test values
-        self.assertEqual(arr.dtype, ref.dtype) # test type
-
-if __name__ == '__main__':
+        self.assertTrue(all(arr == ref))  # test values
+        self.assertEqual(arr.dtype, ref.dtype)  # test type
+
+    def test_rdataframe_as_numpy_array_regular(self):  # fixed-size vector columns, read in-memory and after a Snapshot round trip
+        column_name = "vector"
+        n = 10
+        for from_file in [False, True]:
+            for shape, declaration in [
+                ((n, 3), "std::vector<int>{1,2,3}"),
+                ((n, 3), "std::vector<float>{1,2,3}"),
+                ((n, 3), "std::vector<double>{1,2,3}"),
+            ]:
+                df = ROOT.RDataFrame(10).Define(column_name, declaration)  # NOTE(review): literal 10 must stay equal to n for the checks below
+                temp_file_path = None
+                if from_file:
+                    # write the column to a temporary file with Snapshot, then read it back through a new RDataFrame
+                    temp_file = tempfile.NamedTemporaryFile(delete=False)  # delete=False: only the path is used, the handle is closed right away
+                    temp_file_path = Path(temp_file.name)
+                    temp_file.close()
+
+                    df.Snapshot("tree", str(temp_file_path))
+                    df = ROOT.RDataFrame("tree", str(temp_file_path))
+
+                array = df.AsNumpy([column_name])[column_name]
+                self.assertTrue(isinstance(array, np.ndarray))
+                # TODO: also assert array.shape == shape once regular array handling is implemented
+                self.assertTrue(array.shape[0] == n)  # outer array: one element per entry
+                self.assertTrue(all(x.shape[0] == shape[1] for x in array))  # every element has the per-entry length 3
+
+                if from_file and platform.system() != "Windows":  # file is left behind on Windows; presumably it cannot be unlinked there yet - confirm
+                    temp_file_path.unlink()
+
+    def test_rdataframe_as_numpy_array_jagged(self):  # entries of different lengths must each come back as a NumPy array
+        jagged_array = ROOT.std.vector(float)()
+        column_name = "jagged_array"
+        tree = ROOT.TTree("tree", "Tree with Jagged Array")
+        tree.Branch(column_name, jagged_array)  # the same vector object is refilled before every Fill() below
+        n = 10
+        for i in range(n):
+            jagged_array.clear()
+            for j in range(i):  # entry i receives the i values 0..i-1, so entry 0 stays empty
+                jagged_array.push_back(j)
+            tree.Fill()
+
+        df = ROOT.RDataFrame(tree)
+        array = df.AsNumpy([column_name])[column_name]
+        self.assertTrue(isinstance(array, np.ndarray))
+        self.assertTrue(array.shape[0] == n)  # outer array: one element per tree entry
+        self.assertTrue(all(isinstance(x, np.ndarray) for x in array))
+        self.assertTrue(all(len(x) == i for i, x in enumerate(array)))  # entry i must have length i
+
+
+if __name__ == "__main__":
     unittest.main()