Skip to content

Commit 4bee915

Browse files
authored
Update Array2D accessing pattern & use CUDA_CALLABLE macro (#17)
(Closes MET-47) ## Summary of Changes This PR addresses some of the suggestions that @mugamma brought up in #14. In particular, it defines the `operator[]` on `Array2D` to return a 1D view of a row, so we can use patterns like `array2d[i][j]` instead of `array2d(i, j)` to access the element. Another nice thing about returning the 1D span is that we can use a range-based for loop to go over the elements in a row as well, e.g. ```cpp for (auto& val : array2d[row]) { /* do something with val */ } ``` You can find some example usages in the [included test file](https://github.com/probcomp/GenMetaBalls/pull/17/files#diff-92c53773082b537451d1c0e757c8ac6b5c6f85fd4bcd6e01bf82cade202575fb). Another minor change in this PR is the refactoring of `Array2D` methods to use the new `CUDA_CALLABLE` macro that Arijit introduced recently. ## Test Plan To run the included unit tests: ```bash pixi run test ```
1 parent 039ba52 commit 4bee915

File tree

2 files changed

+83
-19
lines changed

2 files changed

+83
-19
lines changed

genmetaballs/src/cuda/core/utils.cuh

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
#include <cmath>
44
#include <cstdint>
55
#include <cuda/std/mdspan>
6+
#include <cuda/std/span>
67
#include <cuda_runtime.h>
8+
#include <memory>
9+
#include <thrust/memory.h>
710

811
#define CUDA_CALLABLE __host__ __device__
912

@@ -28,28 +31,36 @@ private:
2831

2932
public:
3033
// constructor
31-
__host__ __device__ constexpr Array2D(T* data, uint32_t rows, uint32_t cols)
32-
: data_view_(data, rows, cols) {}
34+
template <typename Pointer>
35+
CUDA_CALLABLE constexpr Array2D(Pointer data_ptr, uint32_t rows, uint32_t cols)
36+
: data_view_(thrust::raw_pointer_cast(data_ptr), rows, cols) {}
3337

34-
// accessor methods
35-
__host__ __device__ constexpr T& operator()(uint32_t row, uint32_t col) {
36-
return data_view_(row, col);
37-
}
38-
__host__ __device__ constexpr T operator()(uint32_t row, uint32_t col) const {
39-
return data_view_(row, col);
38+
// getting a 1D view of a specific row
39+
// this supports array2d[row][col] access pattern and range-based for loops
40+
// e.g., for (auto val : array2d[row]) { ... }
41+
CUDA_CALLABLE constexpr auto operator[](uint32_t row) const {
42+
return cuda::std::span<T>(data_view_.data_handle() + row * num_cols(), num_cols());
4043
}
44+
4145
// size methods
42-
__host__ __device__ constexpr auto num_rows() const noexcept {
46+
CUDA_CALLABLE constexpr auto num_rows() const noexcept {
4347
return data_view_.extent(0);
4448
}
45-
__host__ __device__ constexpr auto num_cols() const noexcept {
49+
CUDA_CALLABLE constexpr auto num_cols() const noexcept {
4650
return data_view_.extent(1);
4751
}
4852

49-
__host__ __device__ constexpr auto rank() const noexcept {
53+
CUDA_CALLABLE constexpr auto ndim() const noexcept {
5054
return data_view_.rank();
5155
}
52-
__host__ __device__ constexpr auto size() const noexcept {
56+
CUDA_CALLABLE constexpr auto size() const noexcept {
5357
return data_view_.size();
5458
}
5559
}; // class Array2D
60+
61+
// Type deduction guide
62+
// if initialized with (Pointer, int, int), deduce T by looking at what raw_pointer_cast returns
63+
// so we can write Array2D(array_ptr, rows, cols) instead of Array2D<Type>(array_ptr, rows, cols)
64+
template <typename Pointer>
65+
Array2D(Pointer, uint32_t, uint32_t)
66+
-> Array2D<typename std::pointer_traits<Pointer>::element_type>;

tests/cpp_tests/test_utils.cu

Lines changed: 60 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,15 +92,21 @@ TEST(GpuSigmoidTest, SigmoidGPUWithinBounds) {
9292
}
9393
}
9494

95+
namespace test_utils_gpu {
9596
// CUDA kernel to fill Array2D with sequential values
9697
__global__ void fill_array2d_kernel(Array2D<float> array2d) {
9798
uint32_t i = threadIdx.x;
9899
uint32_t j = threadIdx.y;
99100

100101
if (i < array2d.num_rows() && j < array2d.num_cols()) {
101-
array2d(i, j) = i * array2d.num_cols() + j;
102+
if (i == array2d.num_rows() - 1) {
103+
array2d[i][j] = -1.0f; // last row set to -1
104+
} else {
105+
array2d[i][j] = i * array2d.num_cols() + j;
106+
}
102107
}
103108
}
109+
} // namespace test_utils_gpu
104110

105111
template <typename Container>
106112
class Array2DTestFixture : public ::testing::Test {};
@@ -115,33 +121,80 @@ TYPED_TEST(Array2DTestFixture, CreateAndAccessArray2D) {
115121

116122
auto data = TypeParam(rows * cols);
117123
// create 2D view into the underlying data on host or device
118-
auto array2d = Array2D(thrust::raw_pointer_cast(data.data()), rows, cols);
124+
auto array2d = Array2D(data.data(), rows, cols);
119125

120126
if constexpr (std::is_same_v<TypeParam, std::vector<float>>) {
121-
for (auto i = 0; i < rows; i++) {
127+
for (auto i = 0; i < rows - 1; i++) {
122128
for (auto j = 0; j < cols; j++) {
123-
array2d(i, j) = i * cols + j;
129+
array2d[i][j] = i * cols + j;
124130
}
125131
}
132+
for (auto& val : array2d[rows - 1]) {
133+
val = -1.0f; // setting last row to -1 with range-based for loop
134+
}
126135
} else {
127136
// Launch kernel to fill Array2D on device
128137
// Note: we could've simply use thrust::sequence to fill the device vector,
129138
// but this is a simple example to demonstrate how to pass an Array2D to a kernel.
130-
fill_array2d_kernel<<<1, dim3(rows, cols)>>>(array2d);
139+
test_utils_gpu::fill_array2d_kernel<<<1, dim3(rows, cols)>>>(array2d);
131140
CUDA_CHECK(cudaGetLastError());
132141
CUDA_CHECK(cudaDeviceSynchronize());
133142
}
134143

135144
EXPECT_EQ(array2d.size(), rows * cols);
136145
EXPECT_EQ(array2d.num_rows(), rows);
137146
EXPECT_EQ(array2d.num_cols(), cols);
138-
EXPECT_EQ(array2d.rank(), 2); // 2D array
147+
EXPECT_EQ(array2d.ndim(), 2); // 2D array
139148

140149
// create host vector to verify the data
141150
// for std::vector, this simply duplicates the vector.
142151
// for thrust::device_vector, it will copy the data to the host.
143152
thrust::host_vector<float> host_data = data;
144-
for (auto idx = 0; idx < rows * cols; idx++) {
153+
for (auto idx = 0; idx < (rows - 1) * cols; idx++) {
145154
EXPECT_FLOAT_EQ(host_data[idx], idx);
146155
}
156+
for (auto idx = (rows - 1) * cols; idx < rows * cols; idx++) {
157+
EXPECT_FLOAT_EQ(host_data[idx], -1.0f);
158+
}
159+
}
160+
161+
// Test that modifications through view affect underlying data
162+
TYPED_TEST(Array2DTestFixture, ViewModifiesUnderlyingData) {
163+
if constexpr (std::is_same_v<TypeParam, std::vector<float>>) {
164+
uint32_t rows = 3;
165+
uint32_t cols = 4;
166+
auto data = TypeParam(rows * cols, 0.0f);
167+
auto array2d = Array2D(data.data(), rows, cols);
168+
169+
// Modify through view
170+
array2d[1][2] = 42.5f;
171+
// Verify underlying data changed
172+
EXPECT_FLOAT_EQ(data[1 * cols + 2], 42.5f);
173+
174+
// Modify underlying data directly
175+
data[0 * cols + 1] = 99.9f;
176+
// Verify view reflects change
177+
EXPECT_FLOAT_EQ(array2d[0][1], 99.9f);
178+
}
179+
}
180+
181+
// Test multiple views of the same data
182+
TYPED_TEST(Array2DTestFixture, MultipleViewsOfSameData) {
183+
if constexpr (std::is_same_v<TypeParam, std::vector<float>>) {
184+
uint32_t rows = 2;
185+
uint32_t cols = 3;
186+
auto data = TypeParam(rows * cols, 0.0f);
187+
auto view1 = Array2D(data.data(), rows, cols);
188+
auto view2 = Array2D(data.data(), rows, cols);
189+
190+
// Modify through view1
191+
view1[0][0] = 100.0f;
192+
// Verify view2 sees the change
193+
EXPECT_FLOAT_EQ(view2[0][0], 100.0f);
194+
195+
// Modify through view2
196+
view2[1][2] = 200.0f;
197+
// Verify view1 sees the change
198+
EXPECT_FLOAT_EQ(view1[1][2], 200.0f);
199+
}
147200
}

0 commit comments

Comments
 (0)