From 795a1b9c0810567f59f265abe940381037aff800 Mon Sep 17 00:00:00 2001
From: Matin Ghavami <mghavami@mit.edu>
Date: Tue, 11 Nov 2025 16:19:29 -0500
Subject: [PATCH 1/5] add forward skeleton

---
 genmetaballs/src/cuda/core/forward.cu | 62 +++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 genmetaballs/src/cuda/core/forward.cu
diff --git a/genmetaballs/src/cuda/core/forward.cu b/genmetaballs/src/cuda/core/forward.cu
new file mode 100644
index 0000000..c5289a8
--- /dev/null
+++ b/genmetaballs/src/cuda/core/forward.cu
@@ -0,0 +1,62 @@
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+constexpr NUM_BLOCKS dim3(10); //XXX madeup
+constexpr THREADS_PER_BLOCK dim3(10);
+
+namespace FMB {
+
+__device__ __host__
+std::vector<std::pair<PixelCoord, Ray>>
+get_pixel_coords_and_rays(const dim3 thread_idx, const dim3 block_idx)
+{
+    std::vector<std::pair<PixelCoord, Ray>> res;
+
+    uint32_t i_beg = 0; // XXX TODO
+    uint32_t i_end = 0; // XXX TODO
+
+    for(int i = i_beg; i < i_end; i+=blockDim.x) {
+        //...
+    }
+
+    return res;
+}
+
+template<class Getter, class Intersector, class Blender, class Confidence>
+__global__
+render_kernel(
+    const typename Getter::Getter &fmb_getter,
+    const Intrinsics &intr,
+    const Pose &extr,
+    Image *img
+) {
+    // TODO how to find the relevant chunk of computation from threadIdx,
+    // blockIdx, etc
+    auto pixel_coords_and_rays = get_pixel_coords_and_rays(threadIdx, blockIdx, ...);
+
+    for(const auto &[pixel_coords, ray]: pixel_coords_and_rays) {
+        float w0 = 0.0f, tf = 0.0f, confidence = 0.0f;
+        for(const auto &fmb: fmb_getter.get_metaballs(ray)) {
+             t = Intersector::intersect(fmb, ray);
+             w = Blender::blend(t, fmb, ray);
+             confidence = Confidence::update(confidence, t, w);
+             tf += t;
+             w0 += w;
+        }
+        img.confidence.at(pixel_coords) = confidence;
+        img.depth.at(pixel_coords) = tf / w0;
+    }
+}
+
+template<class Getter, class Intersector, class Blender, class Confidence>
+void
+render_fmbs(const FMBs &fmbs, const Intrinsics &intr, const Pose &extr)
+{
+     // initialize the fmb_getter
+    typename Getter::Getter fmb_getter(fmbs, intr, extr);
+    auto kernel = render_kernel<Getter, Intersector, Blender, Confidence>;
+    kernel<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(fmb_getter, fmbs, intr, extr);
+}
+
+};

From 66e10b255e36aba90ff0cd5041fa2ec68054d6a7 Mon Sep 17 00:00:00 2001
From: Matin Ghavami <mghavami@mit.edu>
Date: Sun, 16 Nov 2025 18:03:26 -0500
Subject: [PATCH 2/5] wip scaffolding

---
 genmetaballs/src/cuda/core/blender.cuh     | 14 ++++++++
 genmetaballs/src/cuda/core/camera.cuh      | 14 ++++++++
 genmetaballs/src/cuda/core/confidence.cuh  | 12 +++++++
 genmetaballs/src/cuda/core/fmb.cuh         | 23 +++++++++++++
 genmetaballs/src/cuda/core/forward.cu      | 27 +++++++++------
 genmetaballs/src/cuda/core/geometry.cuh    | 33 ++++++++++++++++++
 genmetaballs/src/cuda/core/getter.cuh      |  0
 genmetaballs/src/cuda/core/image.cuh       | 12 +++++++
 genmetaballs/src/cuda/core/intersector.cuh | 14 ++++++++
 genmetaballs/src/cuda/core/utils.cu        |  2 +-
 genmetaballs/src/cuda/core/utils.cuh       | 40 ++++++++++++++++++++++
 genmetaballs/src/cuda/core/utils.h         | 10 ------
 12 files changed, 179 insertions(+), 22 deletions(-)
 create mode 100644 genmetaballs/src/cuda/core/blender.cuh
 create mode 100644 genmetaballs/src/cuda/core/camera.cuh
 create mode 100644 genmetaballs/src/cuda/core/confidence.cuh
 create mode 100644 genmetaballs/src/cuda/core/fmb.cuh
 create mode 100644 genmetaballs/src/cuda/core/geometry.cuh
 create mode 100644 genmetaballs/src/cuda/core/getter.cuh
 create mode 100644 genmetaballs/src/cuda/core/image.cuh
 create mode 100644 genmetaballs/src/cuda/core/intersector.cuh
 create mode 100644 genmetaballs/src/cuda/core/utils.cuh
 delete mode 100644 genmetaballs/src/cuda/core/utils.h

diff --git a/genmetaballs/src/cuda/core/blender.cuh b/genmetaballs/src/cuda/core/blender.cuh
new file mode 100644
index 0000000..2f56efd
--- /dev/null
+++ b/genmetaballs/src/cuda/core/blender.cuh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "fmb.h"
+#include "geometry.h"
+
+
+struct ThreeParameterBlender {
+    float beta1;
+    float beta2;
+    float eta;
+
+    __host__ __device__ __forceinline__ // TODO inline?
+    float blend(float t, float d, const FMB &fmb, const Ray &ray) const;
+};
diff --git a/genmetaballs/src/cuda/core/camera.cuh b/genmetaballs/src/cuda/core/camera.cuh
new file mode 100644
index 0000000..4b2d2b2
--- /dev/null
+++ b/genmetaballs/src/cuda/core/camera.cuh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstdint>
+
+struct Intrinsics {
+    uint32_t height;
+    uint32_t width;
+    float fx;
+    float fy;
+    float cx;
+    float cy;
+    float near;
+    float far;
+};
diff --git a/genmetaballs/src/cuda/core/confidence.cuh b/genmetaballs/src/cuda/core/confidence.cuh
new file mode 100644
index 0000000..a56d278
--- /dev/null
+++ b/genmetaballs/src/cuda/core/confidence.cuh
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <cmath>
+
+
+struct TwoParameterConfidence {
+    float beta4;
+    float beta5;
+
+    __host__ __device__ __forceinline__
+    float get_confidence(float sumexpd) { return 0; } // TODO
+};
diff --git a/genmetaballs/src/cuda/core/fmb.cuh b/genmetaballs/src/cuda/core/fmb.cuh
new file mode 100644
index 0000000..a6e6aa1
--- /dev/null
+++ b/genmetaballs/src/cuda/core/fmb.cuh
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "geometry.cuh"
+
+struct FMB {
+    Pose pose; // mean + orientation
+    float3 extent;
+};
+
+template<typename containter_template>
+class FMBs {
+private:
+    containter_template<FMB> fmbs_;
+    containter_template<float> log_weights_;
+
+public:
+    FMBs(uint32_t size):
+        fmbs_(size),
+        log_weights_(size),
+    {
+        // TODO: set all log_weights_ to 0
+    }
+};
diff --git a/genmetaballs/src/cuda/core/forward.cu b/genmetaballs/src/cuda/core/forward.cu
index c5289a8..79606f6 100644
--- a/genmetaballs/src/cuda/core/forward.cu
+++ b/genmetaballs/src/cuda/core/forward.cu
@@ -23,29 +23,34 @@ get_pixel_coords_and_rays(const dim3 thread_idx, const dim3 block_idx)
     return res;
 }
 
+
 template<class Getter, class Intersector, class Blender, class Confidence>
 __global__
 render_kernel(
-    const typename Getter::Getter &fmb_getter,
-    const Intrinsics &intr,
-    const Pose &extr,
+    const Getter fmb_getter,
+    const Blender blender,
+    Confidence const *confidence,
+    Intrinsics const *intr,
+    Pose const *extr,
     Image *img
 ) {
     // TODO how to find the relevant chunk of computation from threadIdx,
     // blockIdx, etc
-    auto pixel_coords_and_rays = get_pixel_coords_and_rays(threadIdx, blockIdx, ...);
+    auto pixel_coords_and_rays = get_pixel_coords_and_rays(
+        threadIdx, blockIdx, blockDim, gridDim, intr, extr
+    );
 
     for(const auto &[pixel_coords, ray]: pixel_coords_and_rays) {
-        float w0 = 0.0f, tf = 0.0f, confidence = 0.0f;
-        for(const auto &fmb: fmb_getter.get_metaballs(ray)) {
-             t = Intersector::intersect(fmb, ray);
-             w = Blender::blend(t, fmb, ray);
-             confidence = Confidence::update(confidence, t, w);
+        float w0 = 0.0f, tf = 0.0f, sumexpd = 0.0f;
+        for(const auto &fmb: fmb_getter->get_metaballs(ray)) {
+             const auto &[t, d] = Intersector::intersect(fmb, ray);
+             w = blender->blend(t, d, fmb, ray);
+             sumexpd += exp(d);
              tf += t;
              w0 += w;
         }
-        img.confidence.at(pixel_coords) = confidence;
-        img.depth.at(pixel_coords) = tf / w0;
+        img->confidence.at(pixel_coords) = confidence->get_confidence(sumexpd);
+        img->depth.at(pixel_coords) = tf / w0;
     }
 }
 
diff --git a/genmetaballs/src/cuda/core/geometry.cuh b/genmetaballs/src/cuda/core/geometry.cuh
new file mode 100644
index 0000000..b5914e1
--- /dev/null
+++ b/genmetaballs/src/cuda/core/geometry.cuh
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+
+typedef Vec3D = float3;
+
+
+class Rotation {
+
+    private:
+        // ...
+        float rotmat_[9];
+
+    public:
+        Vec3D apply(const Vec3D vec) const;
+        Rotation compose(const Rotation &rot) const;
+        Rotation inv() const;
+};
+
+struct Pose {
+    Rotation rot;
+    Vec3D tran;
+
+    Vec3D apply(const Vec3D vec) const;
+    Pose compose(const Pose &pose) const;
+    Pose inv() const;
+};
+
+struct Ray {
+    Vec3D start;
+    Vec3D direction;
+};
diff --git a/genmetaballs/src/cuda/core/getter.cuh b/genmetaballs/src/cuda/core/getter.cuh
new file mode 100644
index 0000000..e69de29
diff --git a/genmetaballs/src/cuda/core/image.cuh b/genmetaballs/src/cuda/core/image.cuh
new file mode 100644
index 0000000..f2eb1b0
--- /dev/null
+++ b/genmetaballs/src/cuda/core/image.cuh
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <cstdint>
+
+#include "utils.cuh"
+#include "geometry.cuh"
+
+template<uint32_t width, uint32_t height>
+struct Image {
+    Array2D<float, width, height> confidence;
+    Array2D<float, width, height> depth;
+};
diff --git a/genmetaballs/src/cuda/core/intersector.cuh b/genmetaballs/src/cuda/core/intersector.cuh
new file mode 100644
index 0000000..5dea2dc
--- /dev/null
+++ b/genmetaballs/src/cuda/core/intersector.cuh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <utility>
+
+#include "fmb.h"
+#include "geometry.h"
+
+// implement equation (6) in the paper
+class LinearIntersector {
+
+    static __device__ __host__
+    std::pair<float, float>
+    intersect(const FMB &fmb, const Ray &ray) const;
+};
diff --git a/genmetaballs/src/cuda/core/utils.cu b/genmetaballs/src/cuda/core/utils.cu
index 5643c32..7c91259 100644
--- a/genmetaballs/src/cuda/core/utils.cu
+++ b/genmetaballs/src/cuda/core/utils.cu
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include <iostream>
 
-#include "utils.h"
+#include "utils.cuh"
 
 void cuda_check(cudaError_t code, const char *file, int line) {
     if (code != cudaSuccess) {
diff --git a/genmetaballs/src/cuda/core/utils.cuh b/genmetaballs/src/cuda/core/utils.cuh
new file mode 100644
index 0000000..5fea960
--- /dev/null
+++ b/genmetaballs/src/cuda/core/utils.cuh
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#define CUDA_CHECK(x) \
+    do { \
+        cuda_check((x), __FILE__, __LINE__); \
+    } while (0)
+
+
+void cuda_check(cudaError_t code, const char *file, int line);
+
+// XXX container_t should be a thrust container type
+template<typename container_t>
+class Array2D {
+private:
+    //XXX TODO: make sure this works
+    container_t data_;
+
+public:
+
+    __host__ __device__ __forceinline__
+    T &at(const uint32_t i, const uint32_t j)
+    {
+        return data_[i * width + j];
+    }
+
+    __host__ __device__ __forceinline__
+    const T &at(const uint32_t i, const uint32_t j) const
+    {
+        return data_[i * width + j];
+    }
+
+    __host__ __device__ 
+    constexpr uint32_t size() const
+    {
+        return width * height;
+    }
+};
+
diff --git a/genmetaballs/src/cuda/core/utils.h b/genmetaballs/src/cuda/core/utils.h
deleted file mode 100644
index 2a22197..0000000
--- a/genmetaballs/src/cuda/core/utils.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#pragma once
-
-#include <cuda_runtime.h>
-
-#define CUDA_CHECK(x) \
-    do { \
-        cuda_check((x), __FILE__, __LINE__); \
-    } while (0)
-
-void cuda_check(cudaError_t code, const char *file, int line);

From 8822bfe10b0f799cfdaef49b7fabe9c997020bdf Mon Sep 17 00:00:00 2001
From: mugamma <matin@frogrocket.ai>
Date: Sun, 16 Nov 2025 23:33:47 +0000
Subject: [PATCH 3/5] add correct binding which compiles

---
 CMakeLists.txt                          |  4 +++-
 genmetaballs/src/cuda/bindings.cu       | 14 ++++++++++++++
 genmetaballs/src/cuda/core/add.cuh      |  2 +-
 genmetaballs/src/cuda/core/geometry.cu  |  6 ++++++
 genmetaballs/src/cuda/core/geometry.cuh |  4 +++-
 genmetaballs/src/cuda/core/utils.cuh    | 14 +++++++++-----
 tests/test_geometry.py                  | 13 +++++++++++++
 7 files changed, 49 insertions(+), 8 deletions(-)
 create mode 100644 genmetaballs/src/cuda/core/geometry.cu
 create mode 100644 tests/test_geometry.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3520ea8..6813b2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,8 +12,10 @@ set(CMAKE_CXX_STANDARD 20)
 
 add_library(genmetaballs_core
   genmetaballs/src/cuda/core/utils.cu
-  genmetaballs/src/cuda/core/utils.h
+  genmetaballs/src/cuda/core/utils.cuh
   genmetaballs/src/cuda/core/add.cuh
+  genmetaballs/src/cuda/core/geometry.cuh
+  genmetaballs/src/cuda/core/geometry.cu
 )
 
 # Set include directories for the core library
diff --git a/genmetaballs/src/cuda/bindings.cu b/genmetaballs/src/cuda/bindings.cu
index f32e6a6..a694bbb 100644
--- a/genmetaballs/src/cuda/bindings.cu
+++ b/genmetaballs/src/cuda/bindings.cu
@@ -4,6 +4,7 @@
 #include <nanobind/stl/vector.h>
 
 #include "core/add.cuh"
+#include "core/geometry.cuh"
 
 constexpr uint32_t GRID_DIM = 4096;
 constexpr uint32_t BLOCK_DIM = 1024;
@@ -17,4 +18,17 @@ NB_MODULE(_genmetaballs_bindings, m) {
         "Add two lists elementwise on the GPU",
         nb::arg("a"), nb::arg("b")
     );
+
+    nb::class_<Vec3D>(m, "Vec3D")
+        .def(nb::init<>())
+        .def(nb::init<float, float, float>())
+        .def_rw("x", &Vec3D::x)
+        .def_rw("y", &Vec3D::y)
+        .def_rw("z", &Vec3D::z)
+        .def("__add__", &operator+)
+        .def("__repr__", [](const Vec3D &v) {
+            return "Vec3D(" + std::to_string(v.x) + ", " +
+                   std::to_string(v.y) + ", " + std::to_string(v.z) + ")";
+        });
+
 }
diff --git a/genmetaballs/src/cuda/core/add.cuh b/genmetaballs/src/cuda/core/add.cuh
index 20fc703..bc8a09b 100644
--- a/genmetaballs/src/cuda/core/add.cuh
+++ b/genmetaballs/src/cuda/core/add.cuh
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include <vector>
 
-#include "utils.h"
+#include "utils.cuh"
 
 __global__ void add_kernel(
     float const *a,
diff --git a/genmetaballs/src/cuda/core/geometry.cu b/genmetaballs/src/cuda/core/geometry.cu
new file mode 100644
index 0000000..53b0161
--- /dev/null
+++ b/genmetaballs/src/cuda/core/geometry.cu
@@ -0,0 +1,6 @@
+#include "geometry.cuh"
+
+Vec3D operator+(const Vec3D a, const Vec3D b)
+{
+    return {1.0f, 2.0f, 3.0f};
+}
diff --git a/genmetaballs/src/cuda/core/geometry.cuh b/genmetaballs/src/cuda/core/geometry.cuh
index b5914e1..0e25185 100644
--- a/genmetaballs/src/cuda/core/geometry.cuh
+++ b/genmetaballs/src/cuda/core/geometry.cuh
@@ -3,7 +3,9 @@
 #include <cuda_runtime.h>
 
 
-typedef Vec3D = float3;
+using Vec3D = float3;
+
+Vec3D operator+(const Vec3D a, const Vec3D b);
 
 
 class Rotation {
diff --git a/genmetaballs/src/cuda/core/utils.cuh b/genmetaballs/src/cuda/core/utils.cuh
index 5fea960..04fc60f 100644
--- a/genmetaballs/src/cuda/core/utils.cuh
+++ b/genmetaballs/src/cuda/core/utils.cuh
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <cuda_runtime.h>
 
 #define CUDA_CHECK(x) \
@@ -20,21 +21,24 @@ private:
 public:
 
     __host__ __device__ __forceinline__
-    T &at(const uint32_t i, const uint32_t j)
+    container_t &at(const uint32_t i, const uint32_t j)
     {
-        return data_[i * width + j];
+        return data_;
+        //return data_[i * width + j];
     }
 
     __host__ __device__ __forceinline__
-    const T &at(const uint32_t i, const uint32_t j) const
+    const container_t &at(const uint32_t i, const uint32_t j) const
     {
-        return data_[i * width + j];
+        return data_;
+        //return data_[i * width + j];
     }
 
     __host__ __device__ 
     constexpr uint32_t size() const
     {
-        return width * height;
+        return 0;
+        //return width * height;
     }
 };
 
diff --git a/tests/test_geometry.py b/tests/test_geometry.py
new file mode 100644
index 0000000..75820cd
--- /dev/null
+++ b/tests/test_geometry.py
@@ -0,0 +1,13 @@
+import numpy as np
+import pytest
+
+from genmetaballs import _genmetaballs_bindings as _gmbb
+
+
+@pytest.fixture
+def rng() -> np.random.Generator:
+    return np.random.default_rng(0)
+
+
+def test_vec3d(rng: np.random.Generator) -> None:
+    _gmbb.Vec3D(0, 0, 0)

From ee36a7ee983fd267c3296eb21304c245d7cda3dc Mon Sep 17 00:00:00 2001
From: mugamma <matin@frogrocket.ai>
Date: Sun, 16 Nov 2025 23:40:58 +0000
Subject: [PATCH 4/5] vector addition works

---
 genmetaballs/src/cuda/core/geometry.cu |  2 +-
 tests/test_geometry.py                 | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/genmetaballs/src/cuda/core/geometry.cu b/genmetaballs/src/cuda/core/geometry.cu
index 53b0161..147a8cd 100644
--- a/genmetaballs/src/cuda/core/geometry.cu
+++ b/genmetaballs/src/cuda/core/geometry.cu
@@ -2,5 +2,5 @@
 
 Vec3D operator+(const Vec3D a, const Vec3D b)
 {
-    return {1.0f, 2.0f, 3.0f};
+    return {a.x + b.x, a.y + b.y, a.z + b.z};
 }
diff --git a/tests/test_geometry.py b/tests/test_geometry.py
index 75820cd..00a64c3 100644
--- a/tests/test_geometry.py
+++ b/tests/test_geometry.py
@@ -9,5 +9,14 @@ def rng() -> np.random.Generator:
     return np.random.default_rng(0)
 
 
-def test_vec3d(rng: np.random.Generator) -> None:
+def test_vec3d_smoke() -> None:
     _gmbb.Vec3D(0, 0, 0)
+
+def test_vec3d_smoke(rng: np.random.Generator) -> None:
+    _a, _b = rng.uniform(size=3), rng.uniform(size=3)
+    a, b = _gmbb.Vec3D(*_a), _gmbb.Vec3D(*_b)
+    c = a + b
+    _c = _a + _b
+    assert all(
+        [np.isclose(c.x, _c[0]), np.isclose(c.y, _c[1]), np.isclose(c.z, _c[2])]
+    )

From 1ce26bd5580f1bdee80d18ef724c34654051a9fc Mon Sep 17 00:00:00 2001
From: mugamma <matin@frogrocket.ai>
Date: Sun, 16 Nov 2025 23:54:19 +0000
Subject: [PATCH 5/5] vec3d subtraction

---
 genmetaballs/src/cuda/bindings.cu          | 15 +++---
 genmetaballs/src/cuda/core/blender.cuh     |  4 +-
 genmetaballs/src/cuda/core/confidence.cuh  |  6 +--
 genmetaballs/src/cuda/core/fmb.cuh         |  7 +--
 genmetaballs/src/cuda/core/forward.cu      | 56 +++++++++-------------
 genmetaballs/src/cuda/core/geometry.cu     |  7 ++-
 genmetaballs/src/cuda/core/geometry.cuh    | 19 ++++----
 genmetaballs/src/cuda/core/image.cuh       |  4 +-
 genmetaballs/src/cuda/core/intersector.cuh |  5 +-
 genmetaballs/src/cuda/core/utils.cuh       | 34 +++++--------
 tests/test_geometry.py                     | 15 ++++--
 11 files changed, 77 insertions(+), 95 deletions(-)

diff --git a/genmetaballs/src/cuda/bindings.cu b/genmetaballs/src/cuda/bindings.cu
index 0fafe2c..940bc02 100644
--- a/genmetaballs/src/cuda/bindings.cu
+++ b/genmetaballs/src/cuda/bindings.cu
@@ -11,12 +11,8 @@ constexpr uint32_t BLOCK_DIM = 1024;
 namespace nb = nanobind;
 
 NB_MODULE(_genmetaballs_bindings, m) {
-    m.def(
-        "gpu_add",
-        &gpu_add<GRID_DIM, BLOCK_DIM>,
-        "Add two lists elementwise on the GPU",
-        nb::arg("a"), nb::arg("b")
-    );
+    m.def("gpu_add", &gpu_add<GRID_DIM, BLOCK_DIM>, "Add two lists elementwise on the GPU",
+          nb::arg("a"), nb::arg("b"));
 
     nb::class_<Vec3D>(m, "Vec3D")
         .def(nb::init<>())
@@ -25,8 +21,9 @@ NB_MODULE(_genmetaballs_bindings, m) {
         .def_rw("y", &Vec3D::y)
         .def_rw("z", &Vec3D::z)
         .def("__add__", &operator+)
-        .def("__repr__", [](const Vec3D &v) {
-            return "Vec3D(" + std::to_string(v.x) + ", " +
-                   std::to_string(v.y) + ", " + std::to_string(v.z) + ")";
+        .def("__sub__", &operator-)
+        .def("__repr__", [](const Vec3D& v) {
+            return "Vec3D(" + std::to_string(v.x) + ", " + std::to_string(v.y) + ", " +
+                   std::to_string(v.z) + ")";
         });
 }
diff --git a/genmetaballs/src/cuda/core/blender.cuh b/genmetaballs/src/cuda/core/blender.cuh
index 2f56efd..8c4e21f 100644
--- a/genmetaballs/src/cuda/core/blender.cuh
+++ b/genmetaballs/src/cuda/core/blender.cuh
@@ -3,12 +3,12 @@
 #include "fmb.h"
 #include "geometry.h"
 
-
 struct ThreeParameterBlender {
     float beta1;
     float beta2;
     float eta;
 
     __host__ __device__ __forceinline__ // TODO inline?
-    float blend(float t, float d, const FMB &fmb, const Ray &ray) const;
+        float
+        blend(float t, float d, const FMB& fmb, const Ray& ray) const;
 };
diff --git a/genmetaballs/src/cuda/core/confidence.cuh b/genmetaballs/src/cuda/core/confidence.cuh
index a56d278..b79c3ad 100644
--- a/genmetaballs/src/cuda/core/confidence.cuh
+++ b/genmetaballs/src/cuda/core/confidence.cuh
@@ -2,11 +2,11 @@
 
 #include <cmath>
 
-
 struct TwoParameterConfidence {
     float beta4;
     float beta5;
 
-    __host__ __device__ __forceinline__
-    float get_confidence(float sumexpd) { return 0; } // TODO
+    __host__ __device__ __forceinline__ float get_confidence(float sumexpd) {
+        return 0;
+    } // TODO
 };
diff --git a/genmetaballs/src/cuda/core/fmb.cuh b/genmetaballs/src/cuda/core/fmb.cuh
index a6e6aa1..14b09f0 100644
--- a/genmetaballs/src/cuda/core/fmb.cuh
+++ b/genmetaballs/src/cuda/core/fmb.cuh
@@ -7,17 +7,14 @@ struct FMB {
     float3 extent;
 };
 
-template<typename containter_template>
+template <typename containter_template>
 class FMBs {
 private:
     containter_template<FMB> fmbs_;
     containter_template<float> log_weights_;
 
 public:
-    FMBs(uint32_t size):
-        fmbs_(size),
-        log_weights_(size),
-    {
+    FMBs(uint32_t size) : fmbs_(size), log_weights_(size), {
         // TODO: set all log_weights_ to 0
     }
 };
diff --git a/genmetaballs/src/cuda/core/forward.cu b/genmetaballs/src/cuda/core/forward.cu
index 79606f6..00b8b44 100644
--- a/genmetaballs/src/cuda/core/forward.cu
+++ b/genmetaballs/src/cuda/core/forward.cu
@@ -1,67 +1,55 @@
 #include <cstdint>
-
 #include <cuda_runtime.h>
 
-constexpr NUM_BLOCKS dim3(10); //XXX madeup
+constexpr NUM_BLOCKS dim3(10); // XXX madeup
 constexpr THREADS_PER_BLOCK dim3(10);
 
 namespace FMB {
 
-__device__ __host__
-std::vector<std::pair<PixelCoord, Ray>>
-get_pixel_coords_and_rays(const dim3 thread_idx, const dim3 block_idx)
-{
+__device__ __host__ std::vector<std::pair<PixelCoord, Ray>>
+get_pixel_coords_and_rays(
+    const dim3 thread_idx, const dim3 block_idx) {
     std::vector<std::pair<PixelCoord, Ray>> res;
 
     uint32_t i_beg = 0; // XXX TODO
     uint32_t i_end = 0; // XXX TODO
 
-    for(int i = i_beg; i < i_end; i+=blockDim.x) {
+    for (int i = i_beg; i < i_end; i += blockDim.x) {
         //...
     }
 
     return res;
 }
 
-
-template<class Getter, class Intersector, class Blender, class Confidence>
-__global__
-render_kernel(
-    const Getter fmb_getter,
-    const Blender blender,
-    Confidence const *confidence,
-    Intrinsics const *intr,
-    Pose const *extr,
-    Image *img
-) {
+template <class Getter, class Intersector, class Blender, class Confidence>
+__global__ render_kernel(const Getter fmb_getter, const Blender blender,
+                         Confidence const* confidence, Intrinsics const* intr, Pose const* extr,
+                         Image* img) {
     // TODO how to find the relevant chunk of computation from threadIdx,
     // blockIdx, etc
-    auto pixel_coords_and_rays = get_pixel_coords_and_rays(
-        threadIdx, blockIdx, blockDim, gridDim, intr, extr
-    );
+    auto pixel_coords_and_rays =
+        get_pixel_coords_and_rays(threadIdx, blockIdx, blockDim, gridDim, intr, extr);
 
-    for(const auto &[pixel_coords, ray]: pixel_coords_and_rays) {
+    for (const auto& [pixel_coords, ray] : pixel_coords_and_rays) {
         float w0 = 0.0f, tf = 0.0f, sumexpd = 0.0f;
-        for(const auto &fmb: fmb_getter->get_metaballs(ray)) {
-             const auto &[t, d] = Intersector::intersect(fmb, ray);
-             w = blender->blend(t, d, fmb, ray);
-             sumexpd += exp(d);
-             tf += t;
-             w0 += w;
+        for (const auto& fmb : fmb_getter->get_metaballs(ray)) {
+            const auto& [t, d] = Intersector::intersect(fmb, ray);
+            w = blender->blend(t, d, fmb, ray);
+            sumexpd += exp(d);
+            tf += t;
+            w0 += w;
         }
         img->confidence.at(pixel_coords) = confidence->get_confidence(sumexpd);
         img->depth.at(pixel_coords) = tf / w0;
     }
 }
 
-template<class Getter, class Intersector, class Blender, class Confidence>
-void
-render_fmbs(const FMBs &fmbs, const Intrinsics &intr, const Pose &extr)
-{
-     // initialize the fmb_getter
+template <class Getter, class Intersector, class Blender, class Confidence>
+void render_fmbs(const FMBs& fmbs, const Intrinsics& intr, const Pose& extr) {
+    // initialize the fmb_getter
     typename Getter::Getter fmb_getter(fmbs, intr, extr);
     auto kernel = render_kernel<Getter, Intersector, Blender, Confidence>;
     kernel<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(fmb_getter, fmbs, intr, extr);
 }
 
-};
+}; // namespace FMB
diff --git a/genmetaballs/src/cuda/core/geometry.cu b/genmetaballs/src/cuda/core/geometry.cu
index 147a8cd..9e5c886 100644
--- a/genmetaballs/src/cuda/core/geometry.cu
+++ b/genmetaballs/src/cuda/core/geometry.cu
@@ -1,6 +1,9 @@
 #include "geometry.cuh"
 
-Vec3D operator+(const Vec3D a, const Vec3D b)
-{
+Vec3D operator+(const Vec3D a, const Vec3D b) {
     return {a.x + b.x, a.y + b.y, a.z + b.z};
 }
+
+Vec3D operator-(const Vec3D a, const Vec3D b) {
+    return {a.x - b.x, a.y - b.y, a.z - b.z};
+}
diff --git a/genmetaballs/src/cuda/core/geometry.cuh b/genmetaballs/src/cuda/core/geometry.cuh
index 0e25185..e1f6334 100644
--- a/genmetaballs/src/cuda/core/geometry.cuh
+++ b/genmetaballs/src/cuda/core/geometry.cuh
@@ -2,22 +2,21 @@
 
 #include <cuda_runtime.h>
 
-
 using Vec3D = float3;
 
 Vec3D operator+(const Vec3D a, const Vec3D b);
-
+Vec3D operator-(const Vec3D a, const Vec3D b);
 
 class Rotation {
 
-    private:
-        // ...
-        float rotmat_[9];
+private:
+    // ...
+    float rotmat_[9];
 
-    public:
-        Vec3D apply(const Vec3D vec) const;
-        Rotation compose(const Rotation &rot) const;
-        Rotation inv() const;
+public:
+    Vec3D apply(const Vec3D vec) const;
+    Rotation compose(const Rotation& rot) const;
+    Rotation inv() const;
 };
 
 struct Pose {
@@ -25,7 +24,7 @@ struct Pose {
     Vec3D tran;
 
     Vec3D apply(const Vec3D vec) const;
-    Pose compose(const Pose &pose) const;
+    Pose compose(const Pose& pose) const;
     Pose inv() const;
 };
 
diff --git a/genmetaballs/src/cuda/core/image.cuh b/genmetaballs/src/cuda/core/image.cuh
index f2eb1b0..d8ac7f5 100644
--- a/genmetaballs/src/cuda/core/image.cuh
+++ b/genmetaballs/src/cuda/core/image.cuh
@@ -2,10 +2,10 @@
 
 #include <cstdint>
 
-#include "utils.cuh"
 #include "geometry.cuh"
+#include "utils.cuh"
 
-template<uint32_t width, uint32_t height>
+template <uint32_t width, uint32_t height>
 struct Image {
     Array2D<float, width, height> confidence;
     Array2D<float, width, height> depth;
diff --git a/genmetaballs/src/cuda/core/intersector.cuh b/genmetaballs/src/cuda/core/intersector.cuh
index 5dea2dc..acfb3a1 100644
--- a/genmetaballs/src/cuda/core/intersector.cuh
+++ b/genmetaballs/src/cuda/core/intersector.cuh
@@ -8,7 +8,6 @@
 // implement equation (6) in the paper
 class LinearIntersector {
 
-    static __device__ __host__
-    std::pair<float, float>
-    intersect(const FMB &fmb, const Ray &ray) const;
+    static __device__ __host__ std::pair<float, float> intersect(const FMB& fmb,
+                                                                 const Ray& ray) const;
 };
diff --git a/genmetaballs/src/cuda/core/utils.cuh b/genmetaballs/src/cuda/core/utils.cuh
index 04fc60f..cf114ae 100644
--- a/genmetaballs/src/cuda/core/utils.cuh
+++ b/genmetaballs/src/cuda/core/utils.cuh
@@ -3,42 +3,34 @@
 #include <cstdint>
 #include <cuda_runtime.h>
 
-#define CUDA_CHECK(x) \
-    do { \
-        cuda_check((x), __FILE__, __LINE__); \
+#define CUDA_CHECK(x)                                                                              \
+    do {                                                                                           \
+        cuda_check((x), __FILE__, __LINE__);                                                       \
     } while (0)
 
-
-void cuda_check(cudaError_t code, const char *file, int line);
+void cuda_check(cudaError_t code, const char* file, int line);
 
 // XXX container_t should be a thrust container type
-template<typename container_t>
+template <typename container_t>
 class Array2D {
 private:
-    //XXX TODO: make sure this works
+    // XXX TODO: make sure this works
     container_t data_;
 
 public:
-
-    __host__ __device__ __forceinline__
-    container_t &at(const uint32_t i, const uint32_t j)
-    {
+    __host__ __device__ __forceinline__ container_t& at(const uint32_t i, const uint32_t j) {
         return data_;
-        //return data_[i * width + j];
+        // return data_[i * width + j];
     }
 
-    __host__ __device__ __forceinline__
-    const container_t &at(const uint32_t i, const uint32_t j) const
-    {
+    __host__ __device__ __forceinline__ const container_t& at(const uint32_t i,
+                                                              const uint32_t j) const {
         return data_;
-        //return data_[i * width + j];
+        // return data_[i * width + j];
     }
 
-    __host__ __device__ 
-    constexpr uint32_t size() const
-    {
+    __host__ __device__ constexpr uint32_t size() const {
         return 0;
-        //return width * height;
+        // return width * height;
     }
 };
-
diff --git a/tests/test_geometry.py b/tests/test_geometry.py
index 00a64c3..18bce0e 100644
--- a/tests/test_geometry.py
+++ b/tests/test_geometry.py
@@ -12,11 +12,18 @@ def rng() -> np.random.Generator:
 def test_vec3d_smoke() -> None:
     _gmbb.Vec3D(0, 0, 0)
 
-def test_vec3d_smoke(rng: np.random.Generator) -> None:
+
+def test_vec3d_add(rng: np.random.Generator) -> None:
     _a, _b = rng.uniform(size=3), rng.uniform(size=3)
     a, b = _gmbb.Vec3D(*_a), _gmbb.Vec3D(*_b)
     c = a + b
     _c = _a + _b
-    assert all(
-        [np.isclose(c.x, _c[0]), np.isclose(c.y, _c[1]), np.isclose(c.z, _c[2])]
-    )
+    assert all([np.isclose(c.x, _c[0]), np.isclose(c.y, _c[1]), np.isclose(c.z, _c[2])])
+
+
+def test_vec3d_sub(rng: np.random.Generator) -> None:
+    _a, _b = rng.uniform(size=3), rng.uniform(size=3)
+    a, b = _gmbb.Vec3D(*_a), _gmbb.Vec3D(*_b)
+    c = a - b
+    _c = _a - _b
+    assert all([np.isclose(c.x, _c[0]), np.isclose(c.y, _c[1]), np.isclose(c.z, _c[2])])