Fix JaggedTensor single-element constructor unconditionally initializing CUDA via pinned_memory (openvdb#468)

swahtz · web-flow · commit b67b22735c37 · 2026-02-12T21:17:14.000-08:00
## Summary

- Make `pinned_memory` conditional on the tensor device being CUDA or PrivateUse1 in two locations where single-element `JaggedTensor` construction unconditionally allocated pinned (page-locked) memory via `cudaHostAlloc`, which forced CUDA runtime initialization even for CPU-only tensors.
- This caused crashes in forked `DataLoader` worker processes (where re-initializing CUDA after `fork()` is forbidden) and added unnecessary overhead for CPU-only workloads.
- Add a test verifying that CPU single-element `JaggedTensor` offsets are not pinned.

Fixes openvdb#467

## Changes

### `src/fvdb/JaggedTensor.cpp`

`.pinned_memory(true)` → `.pinned_memory(mData.device().is_cuda() || mData.device().is_privateuseone())` in the `JaggedTensor(const std::vector<torch::Tensor>&)` single-element branch.

### `src/fvdb/detail/ops/JOffsetsFromJIdx.cu`

`.pinned_memory(true)` → `.pinned_memory(jdata.device().is_cuda() || jdata.device().is_privateuseone())` in `joffsetsFromJIdx()`, which is the shared implementation called by CPU, CUDA, and PrivateUse1 dispatch paths.

### `tests/unit/test_jagged_tensor.py`

New `test_cpu_single_element_no_cuda_init` verifying both constructor paths produce non-pinned offsets for CPU tensors.

Signed-off-by: Jonathan Swartz <jonathan@jswartz.info>
diff --git a/src/fvdb/JaggedTensor.cpp b/src/fvdb/JaggedTensor.cpp
@@ -79,11 +79,12 @@ JaggedTensor::JaggedTensor(const std::vector<torch::Tensor> &tensors) {
                     "assigned data must have shape [N, ...], but got data.dim() = 0");
         mBatchIdx =
             torch::empty({0}, torch::TensorOptions().dtype(JIdxScalarType).device(mData.device()));
-        mOffsets = torch::tensor({JOffsetsType(0), mData.size(0)},
-                                 torch::TensorOptions()
-                                     .dtype(JOffsetsScalarType)
-                                     .device(mData.device())
-                                     .pinned_memory(true));
+        mOffsets = torch::tensor(
+            {JOffsetsType(0), mData.size(0)},
+            torch::TensorOptions()
+                .dtype(JOffsetsScalarType)
+                .device(mData.device())
+                .pinned_memory(mData.device().is_cuda() || mData.device().is_privateuseone()));
         mListIdx = torch::empty(
             {0, 1}, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device()));
         mNumOuterLists = 1;
diff --git a/src/fvdb/detail/ops/JOffsetsFromJIdx.cu b/src/fvdb/detail/ops/JOffsetsFromJIdx.cu
@@ -23,8 +23,11 @@ joffsetsFromJIdx(torch::Tensor jidx, torch::Tensor jdata, int64_t numTensors) {
     TORCH_CHECK_VALUE(jidx.dim() == 1, "jidx must be a 1D tensor");
 
     if (jidx.size(0) == 0 && numTensors == 1) {
-        torch::Tensor ret =
-            torch::empty({2}, torch::TensorOptions().dtype(JOffsetsScalarType).pinned_memory(true));
+        torch::Tensor ret = torch::empty(
+            {2},
+            torch::TensorOptions()
+                .dtype(JOffsetsScalarType)
+                .pinned_memory(jdata.device().is_cuda() || jdata.device().is_privateuseone()));
         auto acc = ret.accessor<JOffsetsType, 1>();
         acc[0]   = 0;
         acc[1]   = jdata.size(0);
diff --git a/tests/unit/test_jagged_tensor.py b/tests/unit/test_jagged_tensor.py
@@ -2738,6 +2738,28 @@ def test_from_data_indices_and_list_ids(self, device, dtype):
     #         self.assertTrue(torch.all(data_sorted == jt_s[i].jdata).item())
     #         self.assertTrue(torch.all(data_sorted == jt[i].jdata[idx[i].jdata]).item())
 
+    def test_cpu_single_element_no_cuda_init(self):
+        """Test that constructing a single-element CPU JaggedTensor does not use pinned memory.
+
+        Pinned memory allocation triggers CUDA runtime initialization, which causes crashes
+        in forked DataLoader worker processes. This test verifies the fix for issue #467.
+        """
+        # Test the list-of-tensors constructor (single element)
+        cpu_tensor = torch.randn(5, 3)
+        jt = fvdb.JaggedTensor([cpu_tensor])
+        self.assertFalse(jt.joffsets.is_pinned(), "CPU single-element JaggedTensor offsets should not be pinned")
+        self.assertEqual(jt.joffsets.device.type, "cpu")
+        self.assertEqual(jt.jdata.shape, torch.Size([5, 3]))
+        self.assertTrue(torch.equal(jt.jdata, cpu_tensor))
+
+        # Test the bare-tensor constructor (dispatches through joffsetsFromJIdx)
+        cpu_tensor2 = torch.randn(10)
+        jt2 = fvdb.JaggedTensor(cpu_tensor2)
+        self.assertFalse(jt2.joffsets.is_pinned(), "CPU bare-tensor JaggedTensor offsets should not be pinned")
+        self.assertEqual(jt2.joffsets.device.type, "cpu")
+        self.assertEqual(jt2.jdata.shape, torch.Size([10]))
+        self.assertTrue(torch.equal(jt2.jdata, cpu_tensor2))
+
 
 if __name__ == "__main__":
     unittest.main()