diff --git a/.clang-format b/.clang-format
index 48bf157..e0d7241 100644
--- a/.clang-format
+++ b/.clang-format
@@ -115,5 +115,6 @@
 SpacesInSquareBrackets: false
 StatementMacros:
   - Q_UNUSED
   - QT_REQUIRE_VERSION
+InsertNewlineAtEOF: true
 ...
diff --git a/.clang-tidy b/.clang-tidy
index 8bf217c..db4c130 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -37,7 +37,8 @@ Checks: >
   -modernize-avoid-c-arrays,
   -readability-magic-numbers,
   -cert-dcl37-c,
-  -cert-dcl51-cpp
+  -cert-dcl51-cpp,
+  -cert-dcl58-cpp,
 
 WarningsAsErrors: ''
 HeaderFilterRegex: '.*'
@@ -51,4 +52,7 @@ CheckOptions:
     value: 'std::vector;.*Iterator'
   - key: modernize-use-nodiscard.Macros
     value: 'CUDA_CHECK'
+  - key: readability-function-cognitive-complexity.IgnoreMacros
+    # GTest macros can artificially increase the cognitive complexity, so we ignore them.
+    value: true
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 285c6dd..054ac28 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,9 +69,9 @@ find_package(GTest CONFIG REQUIRED)
 
 # C++/CUDA test executable
 file(GLOB CPP_CUDA_TEST_SOURCES tests/cpp_tests/*.cu tests/cpp_tests/*.cpp)
-add_executable(test_add ${CPP_CUDA_TEST_SOURCES})
+add_executable(cpp_tests ${CPP_CUDA_TEST_SOURCES})
 
-target_link_libraries(test_add
+target_link_libraries(cpp_tests
     PRIVATE
     genmetaballs_core
     GTest::gtest
@@ -79,9 +79,9 @@ target_link_libraries(test_add
 )
 
 # Enable CUDA for the test executable
-set_target_properties(test_add PROPERTIES
+set_target_properties(cpp_tests PROPERTIES
     CUDA_SEPARABLE_COMPILATION ON
 )
 
 include(GoogleTest)
-gtest_discover_tests(test_add)
+gtest_discover_tests(cpp_tests)
diff --git a/genmetaballs/src/cuda/bindings.cu b/genmetaballs/src/cuda/bindings.cu
index 9e29186..15cbcc0 100644
--- a/genmetaballs/src/cuda/bindings.cu
+++ b/genmetaballs/src/cuda/bindings.cu
@@ -1,5 +1,6 @@
 #include <cuda_runtime.h>
 #include <nanobind/nanobind.h>
+#include <nanobind/operators.h>
 #include <nanobind/stl/string.h>
 
 #include "core/add.cuh"
@@ -20,10 +21,10 @@ NB_MODULE(_genmetaballs_bindings, m) {
         .def_rw("x", &Vec3D::x)
         .def_rw("y", &Vec3D::y)
         .def_rw("z", &Vec3D::z)
-        .def("__add__", &operator+)
-        .def("__sub__", &operator-)
+        .def(nb::self + nb::self)
+        .def(nb::self - nb::self)
         .def("__repr__", [](const Vec3D& v) {
             nb::str s = nb::str("Vec3D({}, {}, {})").format(v.x, v.y, v.z);
             return s;
         });
-}
\ No newline at end of file
+}
diff --git a/genmetaballs/src/cuda/core/utils.cuh b/genmetaballs/src/cuda/core/utils.cuh
index cf114ae..108cd4d 100644
--- a/genmetaballs/src/cuda/core/utils.cuh
+++ b/genmetaballs/src/cuda/core/utils.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <cstdint>
+#include <cuda/std/mdspan>
 #include <cuda_runtime.h>
 
 #define CUDA_CHECK(x) \
@@ -10,27 +11,38 @@
 
 void cuda_check(cudaError_t code, const char* file, int line);
 
-// XXX container_t should be a thrust container type
-template <typename container_t>
+// Non-owning 2D view into a contiguous array in either host or device memory
+template <typename T>
 class Array2D {
 private:
-    // XXX TODO: make sure this works
-    container_t data_;
+    cuda::std::mdspan<
+        T, cuda::std::extents<uint32_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>>
+        data_view_;
 
 public:
-    __host__ __device__ __forceinline__ container_t& at(const uint32_t i, const uint32_t j) {
-        return data_;
-        // return data_[i * width + j];
-    }
+    // constructor
+    __host__ __device__ constexpr Array2D(T* data, uint32_t rows, uint32_t cols)
+        : data_view_(data, rows, cols) {}
 
-    __host__ __device__ __forceinline__ const container_t& at(const uint32_t i,
-                                                              const uint32_t j) const {
-        return data_;
-        // return data_[i * width + j];
+    // accessor methods
+    __host__ __device__ constexpr T& operator()(uint32_t row, uint32_t col) {
+        return data_view_(row, col);
+    }
+    __host__ __device__ constexpr T operator()(uint32_t row, uint32_t col) const {
+        return data_view_(row, col);
+    }
+    // size methods
+    __host__ __device__ constexpr auto num_rows() const noexcept {
+        return data_view_.extent(0);
+    }
+    __host__ __device__ constexpr auto num_cols() const noexcept {
+        return data_view_.extent(1);
     }
 
-    __host__ __device__ constexpr uint32_t size() const {
-        return 0;
-        // return width * height;
+    __host__ __device__ constexpr auto rank() const noexcept {
+        return data_view_.rank();
+    }
+    __host__ __device__ constexpr auto size() const noexcept {
+        return data_view_.size();
     }
 };
diff --git a/tests/cpp_tests/test_utils.cu b/tests/cpp_tests/test_utils.cu
new file mode 100644
index 0000000..9fb97e8
--- /dev/null
+++ b/tests/cpp_tests/test_utils.cu
@@ -0,0 +1,63 @@
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <type_traits>
+#include <vector>
+
+#include "core/utils.cuh"
+
+// CUDA kernel to fill Array2D with sequential values
+__global__ void fill_array2d_kernel(Array2D<float> array2d) {
+    uint32_t i = threadIdx.x;
+    uint32_t j = threadIdx.y;
+
+    if (i < array2d.num_rows() && j < array2d.num_cols()) {
+        array2d(i, j) = i * array2d.num_cols() + j;
+    }
+}
+
+template <typename T>
+class Array2DTestFixture : public ::testing::Test {};
+
+using ContainerTypes = ::testing::Types<std::vector<float>, thrust::device_vector<float>>;
+
+TYPED_TEST_SUITE(Array2DTestFixture, ContainerTypes);
+
+TYPED_TEST(Array2DTestFixture, CreateAndAccessArray2D) {
+    uint32_t rows = 4;
+    uint32_t cols = 6;
+
+    auto data = TypeParam(rows * cols);
+    // create 2D view into the underlying data on host or device
+    auto array2d = Array2D(thrust::raw_pointer_cast(data.data()), rows, cols);
+
+    if constexpr (std::is_same_v<TypeParam, std::vector<float>>) {
+        for (auto i = 0; i < rows; i++) {
+            for (auto j = 0; j < cols; j++) {
+                array2d(i, j) = i * cols + j;
+            }
+        }
+    } else {
+        // Launch kernel to fill Array2D on device
+        // Note: we could've simply use thrust::sequence to fill the device vector,
+        // but this is a simple example to demonstrate how to pass an Array2D to a kernel.
+        fill_array2d_kernel<<<1, dim3(rows, cols)>>>(array2d);
+        CUDA_CHECK(cudaGetLastError());
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    EXPECT_EQ(array2d.size(), rows * cols);
+    EXPECT_EQ(array2d.num_rows(), rows);
+    EXPECT_EQ(array2d.num_cols(), cols);
+    EXPECT_EQ(array2d.rank(), 2);  // 2D array
+
+    // create host vector to verify the data
+    // for std::vector, this simply duplicate the vector.
+    // for thrust::device_vector, it will copy the data to the host.
+    thrust::host_vector<float> host_data = data;
+    for (auto idx = 0; idx < rows * cols; idx++) {
+        EXPECT_FLOAT_EQ(host_data[idx], idx);
+    }
+}