diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py index f9bb2771a73a8..c784c9ca1957c 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rdataframe.py @@ -296,10 +296,6 @@ def RDataFrameAsNumpy( result_ptrs = {} for column in columns: column_type = df.GetColumnType(column) - # bool columns should be taken as unsigned chars, because NumPy stores - # bools in bytes - different from the std::vector returned by the - # action, which might do some space optimization - column_type = "unsigned char" if column_type == "bool" else column_type # If the column type is a class, make sure cling knows about it tclass = ROOT.TClass.GetClass(column_type) @@ -307,8 +303,10 @@ def RDataFrameAsNumpy( raise RuntimeError( f'The column named "{column}" is of type "{column_type}", which is not known to the ROOT interpreter. Please load the corresponding header files or dictionaries.' ) - - result_ptrs[column] = df.Take[column_type](column) + # We take the values via ROOT::RVec to avoid having to deal with std::vector + # This uses one single data structure for all array types, which exposes the array interface + # allowing zero-copy conversion to numpy array + result_ptrs[column] = df.Take[f"{column_type}, ROOT::RVec<{column_type}>"](column) result = AsNumpyResult(result_ptrs, columns) diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rvec.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rvec.py index d92b0f2c3ab57..7e5f8c42e1ad6 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rvec.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_rvec.py @@ -62,16 +62,73 @@ from . 
import pythonization +# This map includes all relevant C++ fundamental types found at +# https://en.cppreference.com/w/cpp/language/types.html and the associated +# ROOT portable types when available. _array_interface_dtype_map = { - "Long64_t": "i", - "ULong64_t": "u", - "double": "f", - "float": "f", + # Integral types + # C++ standard integer types + "short": "i", + "short int": "i", + "signed short": "i", + "signed short int": "i", + "unsigned short": "u", + "unsigned short int": "u", "int": "i", - "long": "i", - "unsigned char": "b", + "signed": "i", + "signed int": "i", + "unsigned": "u", "unsigned int": "u", + "long": "i", + "long int": "i", + "signed long": "i", + "signed long int": "i", "unsigned long": "u", + "unsigned long int": "u", + "long long": "i", + "long long int": "i", + "signed long long": "i", + "signed long long int": "i", + "unsigned long long": "u", + "unsigned long long int": "u", + "std::size_t": "u", + # Extended standard integer types + "std::int8_t": "i", + "std::int16_t": "i", + "std::int32_t": "i", + "std::int64_t": "i", + "std::uint8_t": "u", + "std::uint16_t": "u", + "std::uint32_t": "u", + "std::uint64_t": "u", + # ROOT integer types + "Int_t": "i", + "UInt_t": "u", + "Short_t": "i", + "UShort_t": "u", + "Long_t": "i", + "ULong_t": "u", + "Long64_t": "i", + "ULong64_t": "u", + # Boolean type + "bool": "b", + "Bool_t": "b", + # Character types + "char": "i", + "Char_t": "i", + "signed char": "i", + "unsigned char": "u", + "UChar_t": "u", + "char16_t": "u", + "char32_t": "u", + # Floating-point types + # C++ standard floating-point types + "float": "f", + "double": "f", + "long double": "f", + # ROOT floating-point types + "Float_t": "f", + "Double_t": "f", } diff --git a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py index dd9dd87f64507..0bc84e84b5b41 100644 --- a/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py +++ 
b/bindings/pyroot/pythonizations/test/rdataframe_asnumpy.py @@ -7,7 +7,7 @@ import numpy as np import ROOT from ROOT._pythonization._rdataframe import _clone_asnumpyresult - +import os def make_tree(*dtypes): """ @@ -90,10 +90,22 @@ def test_branch_bool(self): Test bool data-type as a special case since we cannot adopt the std::vector with numpy arrays """ - df = ROOT.RDataFrame(2).Define("x", "bool(rdfentry_)") + treename = "test_branch_bool" + filename = "test_branch_bool.root" + # Snapshot a TTree so that column 'x' will be of type 'Bool_t' + ROOT.RDataFrame(2).Define("x", "bool(rdfentry_)").Snapshot(treename, filename) + # The column 'y' will instead have type 'bool' + df = ROOT.RDataFrame(treename, filename).Define("y", "bool(rdfentry_)") + self.assertEqual(df.GetColumnType("x"), "Bool_t") + self.assertEqual(df.GetColumnType("y"), "bool") npy = df.AsNumpy() - self.assertFalse(bool(npy["x"][0])) - self.assertTrue(bool(npy["x"][1])) + # Both numpy arrays should have dtype bool + self.assertEqual(npy["x"].dtype, bool) + self.assertEqual(npy["y"].dtype, bool) + self.assertFalse(npy["x"][0]) + self.assertTrue(npy["x"][1]) + self.assertFalse(npy["y"][0]) + self.assertTrue(npy["y"][1]) def test_read_array(self): """