feat: add RNTuple writing support for IndexedArray and fix IndexedOptionArray (#1493)

ariostas · web-flow · commit 1359273128f1 · 2025-09-05T17:16:26.000-04:00
* Added writing support for IndexedArray and fixed IndexedOptionArray

* Slightly better test

* Fixed issue with 32-bit indices

* Updated ROOT test
diff --git a/src/uproot/models/RNTuple.py b/src/uproot/models/RNTuple.py
@@ -544,8 +544,11 @@ def field_form(self, this_id, keys, ak_add_doc=False):
             if this_id in self._related_ids:
                 child_id = self._related_ids[this_id][0]
             inner = self.field_form(child_id, keys, ak_add_doc=ak_add_doc)
+            idx_type = (
+                "i32" if self._column_records_dict[cfid][0].nbits == 32 else "i64"
+            )
             return ak.forms.ListOffsetForm(
-                "i64", inner, form_key=keyname, parameters=parameters
+                idx_type, inner, form_key=keyname, parameters=parameters
             )
         elif structural_role == uproot.const.RNTupleFieldRole.RECORD:
             newids = []
diff --git a/src/uproot/writing/_cascadentuple.py b/src/uproot/writing/_cascadentuple.py
@@ -111,7 +111,7 @@ def _cpp_typename(akform, subcall=False):
     elif isinstance(akform, awkward.forms.UnionForm):
         field_typenames = [_cpp_typename(t, subcall=True) for t in akform.contents]
         typename = f"std::variant<{','.join(field_typenames)}>"
-    elif isinstance(akform, awkward.forms.UnmaskedForm):
+    elif isinstance(akform, (awkward.forms.UnmaskedForm, awkward.forms.IndexedForm)):
         return _cpp_typename(akform.content, subcall=True)
     else:
         raise NotImplementedError(f"Form type {type(akform)} cannot be written yet")
@@ -484,7 +484,10 @@ def _build_field_col_records(
                     field_name=subfield_name,
                     parent_fid=field_id,
                 )
-        elif isinstance(akform, awkward.forms.UnmaskedForm):
+        elif isinstance(
+            akform, (awkward.forms.UnmaskedForm, awkward.forms.IndexedForm)
+        ):
+            # IndexedForms just get rearranged, so they are transparent
             # Do nothing
             self._build_field_col_records(
                 akform.content,
@@ -930,18 +933,24 @@ def extend(self, file, sink, data):
 
         cluster_page_data = []  # list of list of (locator, len, offset)
         data_buffers = awkward.to_buffers(data)[2]
-        for idx, key in enumerate(self._header._column_keys):
-            if "switch" in key:
+
+        # We need to make a few modifications since not everything directly translates to RNTuples
+        for key in list(data_buffers.keys()):
+            barekey = key.split("-")[0]
+            if "offset" in key:
+                # RNTuples don't store the first offset
+                data_buffers[key] = data_buffers[key][1:]
+            elif "index" in key and barekey + "-tags" in data_buffers:
+                # We group indices and tags into a single array
                 dtype = numpy.dtype([("index", "int64"), ("tag", "int32")])
-                indices = data_buffers[key.split("-")[0] + "-index"]
-                tags = data_buffers[key.split("-")[0] + "-tags"]
+                indices = data_buffers[barekey + "-index"]
+                tags = data_buffers[barekey + "-tags"]
                 switches = numpy.zeros(len(indices), dtype=dtype)
                 switches["index"] = indices
                 switches["tag"] = tags + 1
-                col_data = switches
-            elif "startstop" in key:
+                data_buffers[barekey + "-switch"] = switches
+            elif "start" in key:
                 # ListArrays need to be converted to ListOffsetArrays
-                barekey = key.split("-")[0]
                 starts = awkward.index.Index(data_buffers[f"{barekey}-starts"])
                 stops = awkward.index.Index(data_buffers[f"{barekey}-stops"])
                 next_barekey = f"node{int(barekey[4:])+1}"
@@ -953,17 +962,20 @@ def extend(self, file, sink, data):
                         starts, stops, content
                     ).to_ListOffsetArray64()
                 )[2]
+                data_buffers[f"{barekey}-startstop"] = tmp_buffers["node0-offsets"][1:]
                 data_buffers[f"{next_barekey}-data"] = tmp_buffers["node1-data"]
-                col_data = tmp_buffers["node0-offsets"][1:]
-                # no longer need the temporary data
-                del starts, stops, content, tmp_buffers
-            else:
-                col_data = data_buffers[key]
-            if "offsets" in key:
-                col_data = col_data[1:]
             elif "index" in key:
-                deltas = numpy.array(col_data != -1, dtype=col_data.dtype)
-                col_data = numpy.cumsum(deltas)
+                # We need to rearrange the data
+                next_barekey = f"node{int(barekey[4:])+1}"
+                index = data_buffers[key]
+                content = data_buffers[f"{next_barekey}-data"]
+                content = content[index[index >= 0]]  # Rearrange data
+                deltas = numpy.array(index >= 0, dtype=index.dtype)
+                data_buffers[key] = numpy.cumsum(deltas, dtype=deltas.dtype)
+                data_buffers[f"{next_barekey}-data"] = content
+
+        for idx, key in enumerate(self._header._column_keys):
+            col_data = data_buffers[key]
             col_len = len(col_data.reshape(-1))
             raw_data = col_data.reshape(-1).view("uint8")
             if col_data.dtype == numpy.dtype("bool"):
diff --git a/tests/test_1395_rntuple_writing_lists_and_structs.py b/tests/test_1395_rntuple_writing_lists_and_structs.py
@@ -44,6 +44,18 @@
             ak.index.Index([1, 2, 4]),
             ak.contents.NumpyArray([0, 1, 2, 3, 4, 5]),
         ),
+        "indexed_option_array": ak.contents.IndexedOptionArray(
+            ak.index.Index([3, -1, 1]),
+            ak.contents.NumpyArray([0, 1, 2, 3, 4, 5]),
+        ),
+        "indexed_option_array32": ak.contents.IndexedOptionArray(
+            ak.index.Index32([3, -1, 1]),
+            ak.contents.NumpyArray([0, 1, 2, 3, 4, 5]),
+        ),
+        "indexed_array": ak.contents.IndexedArray(
+            ak.index.Index([3, 0, 1]),
+            ak.contents.NumpyArray([0, 1, 2, 3, 4, 5]),
+        ),
     }
 )
 
@@ -71,7 +83,7 @@ def test_writing_and_reading(tmp_path):
     arrays = obj.arrays()
 
     for f in data.fields:
-        if f == "optional":
+        if f in ("optional", "indexed_option_array", "indexed_option_array32"):
             assert [t[0] if len(t) > 0 else None for t in arrays[f][:3]] == data[
                 f
             ].tolist()
@@ -143,6 +155,15 @@ def test_writing_then_reading_with_ROOT(tmp_path, capfd):
         in out
     )
     assert "* Field 17           : list_array (std::vector<std::int64_t>)" in out
+    assert (
+        "* Field 18           : indexed_option_array (std::optional<std::int64_t>)"
+        in out
+    )
+    assert (
+        "* Field 19           : indexed_option_array32 (std::optional<std::int64_t>)"
+        in out
+    )
+    assert "* Field 20           : indexed_array (std::int64_t)" in out
 
 
 def test_field_descriptions(tmp_path):