diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6fa89b0..fe2d88f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,8 +15,10 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 add_library(genmetaballs_core
   genmetaballs/src/cuda/core/utils.cu
-  genmetaballs/src/cuda/core/utils.h
+  genmetaballs/src/cuda/core/utils.cuh
   genmetaballs/src/cuda/core/add.cuh
+  genmetaballs/src/cuda/core/geometry.cuh
+  genmetaballs/src/cuda/core/geometry.cu
 )
 
 # Set include directories for the core library
diff --git a/genmetaballs/src/cuda/bindings.cu b/genmetaballs/src/cuda/bindings.cu
index 6354aa2..940bc02 100644
--- a/genmetaballs/src/cuda/bindings.cu
+++ b/genmetaballs/src/cuda/bindings.cu
@@ -3,6 +3,7 @@
 #include <nanobind/stl/vector.h>
 
 #include "core/add.cuh"
+#include "core/geometry.cuh"
 
 constexpr uint32_t GRID_DIM = 4096;
 constexpr uint32_t BLOCK_DIM = 1024;
@@ -12,4 +13,17 @@ namespace nb = nanobind;
 NB_MODULE(_genmetaballs_bindings, m) {
     m.def("gpu_add", &gpu_add<GRID_DIM, BLOCK_DIM>, "Add two lists elementwise on the GPU",
           nb::arg("a"), nb::arg("b"));
+
+    nb::class_<Vec3D>(m, "Vec3D")
+        .def(nb::init<>())
+        .def(nb::init<float, float, float>())
+        .def_rw("x", &Vec3D::x)
+        .def_rw("y", &Vec3D::y)
+        .def_rw("z", &Vec3D::z)
+        .def("__add__", &operator+)
+        .def("__sub__", &operator-)
+        .def("__repr__", [](const Vec3D& v) {
+            return "Vec3D(" + std::to_string(v.x) + ", " + std::to_string(v.y) + ", " +
+                   std::to_string(v.z) + ")";
+        });
 }
diff --git a/genmetaballs/src/cuda/core/add.cuh b/genmetaballs/src/cuda/core/add.cuh
index 9a068b1..b96d92b 100644
--- a/genmetaballs/src/cuda/core/add.cuh
+++ b/genmetaballs/src/cuda/core/add.cuh
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include <vector>
 
-#include "utils.h"
+#include "utils.cuh"
 
 __global__ void add_kernel(float const* a, float const* b, const uint32_t n, float* sum) {
     const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
diff --git a/genmetaballs/src/cuda/core/blender.cuh b/genmetaballs/src/cuda/core/blender.cuh
new file mode 100644
index 0000000..8c4e21f
--- /dev/null
+++ b/genmetaballs/src/cuda/core/blender.cuh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "fmb.h"
+#include "geometry.h"
+
+struct ThreeParameterBlender {
+    float beta1;
+    float beta2;
+    float eta;
+
+    __host__ __device__ __forceinline__ // TODO inline?
+        float
+        blend(float t, float d, const FMB& fmb, const Ray& ray) const;
+};
diff --git a/genmetaballs/src/cuda/core/camera.cuh b/genmetaballs/src/cuda/core/camera.cuh
new file mode 100644
index 0000000..4b2d2b2
--- /dev/null
+++ b/genmetaballs/src/cuda/core/camera.cuh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstdint>
+
+struct Intrinsics {
+    uint32_t height;
+    uint32_t width;
+    float fx;
+    float fy;
+    float cx;
+    float cy;
+    float near;
+    float far;
+};
diff --git a/genmetaballs/src/cuda/core/confidence.cuh b/genmetaballs/src/cuda/core/confidence.cuh
new file mode 100644
index 0000000..b79c3ad
--- /dev/null
+++ b/genmetaballs/src/cuda/core/confidence.cuh
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <cmath>
+
+struct TwoParameterConfidence {
+    float beta4;
+    float beta5;
+
+    __host__ __device__ __forceinline__ float get_confidence(float sumexpd) {
+        return 0;
+    } // TODO
+};
diff --git a/genmetaballs/src/cuda/core/fmb.cuh b/genmetaballs/src/cuda/core/fmb.cuh
new file mode 100644
index 0000000..14b09f0
--- /dev/null
+++ b/genmetaballs/src/cuda/core/fmb.cuh
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "geometry.cuh"
+
+struct FMB {
+    Pose pose; // mean + orientation
+    float3 extent;
+};
+
+template <typename containter_template>
+class FMBs {
+private:
+    containter_template<FMB> fmbs_;
+    containter_template<float> log_weights_;
+
+public:
+    FMBs(uint32_t size) : fmbs_(size), log_weights_(size), {
+        // TODO: set all log_weights_ to 0
+    }
+};
diff --git a/genmetaballs/src/cuda/core/forward.cu b/genmetaballs/src/cuda/core/forward.cu
new file mode 100644
index 0000000..00b8b44
--- /dev/null
+++ b/genmetaballs/src/cuda/core/forward.cu
@@ -0,0 +1,55 @@
+#include <cstdint>
+#include <cuda_runtime.h>
+
+constexpr NUM_BLOCKS dim3(10); // XXX madeup
+constexpr THREADS_PER_BLOCK dim3(10);
+
+namespace FMB {
+
+__device__ __host__ std::vector<std::pair<PixelCoord, Ray>>
+get_pixel_coords_and_rays(
+    const dim3 thread_idx, const dim3 block_idx) {
+    std::vector<std::pair<PixelCoord, Ray>> res;
+
+    uint32_t i_beg = 0; // XXX TODO
+    uint32_t i_end = 0; // XXX TODO
+
+    for (int i = i_beg; i < i_end; i += blockDim.x) {
+        //...
+    }
+
+    return res;
+}
+
+template <class Getter, class Intersector, class Blender, class Confidence>
+__global__ render_kernel(const Getter fmb_getter, const Blender blender,
+                         Confidence const* confidence, Intrinsics const* intr, Pose const* extr,
+                         Image* img) {
+    // TODO how to find the relevant chunk of computation from threadIdx,
+    // blockIdx, etc
+    auto pixel_coords_and_rays =
+        get_pixel_coords_and_rays(threadIdx, blockIdx, blockDim, gridDim, intr, extr);
+
+    for (const auto& [pixel_coords, ray] : pixel_coords_and_rays) {
+        float w0 = 0.0f, tf = 0.0f, sumexpd = 0.0f;
+        for (const auto& fmb : fmb_getter->get_metaballs(ray)) {
+            const auto& [t, d] = Intersector::intersect(fmb, ray);
+            w = blender->blend(t, d, fmb, ray);
+            sumexpd += exp(d);
+            tf += t;
+            w0 += w;
+        }
+        img->confidence.at(pixel_coords) = confidence->get_confidence(sumexpd);
+        img->depth.at(pixel_coords) = tf / w0;
+    }
+}
+
+template <class Getter, class Intersector, class Blender, class Confidence>
+void render_fmbs(const FMBs& fmbs, const Intrinsics& intr, const Pose& extr) {
+    // initialize the fmb_getter
+    typename Getter::Getter fmb_getter(fmbs, intr, extr);
+    auto kernel = render_kernel<Getter, Intersector, Blender, Confidence>;
+    kernel<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(fmb_getter, fmbs, intr, extr);
+}
+
+}; // namespace FMB
diff --git a/genmetaballs/src/cuda/core/geometry.cu b/genmetaballs/src/cuda/core/geometry.cu
new file mode 100644
index 0000000..9e5c886
--- /dev/null
+++ b/genmetaballs/src/cuda/core/geometry.cu
@@ -0,0 +1,9 @@
+#include "geometry.cuh"
+
+Vec3D operator+(const Vec3D a, const Vec3D b) {
+    return {a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+Vec3D operator-(const Vec3D a, const Vec3D b) {
+    return {a.x - b.x, a.y - b.y, a.z - b.z};
+}
diff --git a/genmetaballs/src/cuda/core/geometry.cuh b/genmetaballs/src/cuda/core/geometry.cuh
new file mode 100644
index 0000000..e1f6334
--- /dev/null
+++ b/genmetaballs/src/cuda/core/geometry.cuh
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+using Vec3D = float3;
+
+Vec3D operator+(const Vec3D a, const Vec3D b);
+Vec3D operator-(const Vec3D a, const Vec3D b);
+
+class Rotation {
+
+private:
+    // ...
+    float rotmat_[9];
+
+public:
+    Vec3D apply(const Vec3D vec) const;
+    Rotation compose(const Rotation& rot) const;
+    Rotation inv() const;
+};
+
+struct Pose {
+    Rotation rot;
+    Vec3D tran;
+
+    Vec3D apply(const Vec3D vec) const;
+    Pose compose(const Pose& pose) const;
+    Pose inv() const;
+};
+
+struct Ray {
+    Vec3D start;
+    Vec3D direction;
+};
diff --git a/genmetaballs/src/cuda/core/getter.cuh b/genmetaballs/src/cuda/core/getter.cuh
new file mode 100644
index 0000000..e69de29
diff --git a/genmetaballs/src/cuda/core/image.cuh b/genmetaballs/src/cuda/core/image.cuh
new file mode 100644
index 0000000..d8ac7f5
--- /dev/null
+++ b/genmetaballs/src/cuda/core/image.cuh
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <cstdint>
+
+#include "geometry.cuh"
+#include "utils.cuh"
+
+template <uint32_t width, uint32_t height>
+struct Image {
+    Array2D<float, width, height> confidence;
+    Array2D<float, width, height> depth;
+};
diff --git a/genmetaballs/src/cuda/core/intersector.cuh b/genmetaballs/src/cuda/core/intersector.cuh
new file mode 100644
index 0000000..acfb3a1
--- /dev/null
+++ b/genmetaballs/src/cuda/core/intersector.cuh
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <utility>
+
+#include "fmb.h"
+#include "geometry.h"
+
+// implement equation (6) in the paper
+class LinearIntersector {
+
+    static __device__ __host__ std::pair<float, float> intersect(const FMB& fmb,
+                                                                 const Ray& ray) const;
+};
diff --git a/genmetaballs/src/cuda/core/utils.cu b/genmetaballs/src/cuda/core/utils.cu
index 022a023..c2df092 100644
--- a/genmetaballs/src/cuda/core/utils.cu
+++ b/genmetaballs/src/cuda/core/utils.cu
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include <iostream>
 
-#include "utils.h"
+#include "utils.cuh"
 
 void cuda_check(cudaError_t code, const char* file, int line) {
     if (code != cudaSuccess) {
diff --git a/genmetaballs/src/cuda/core/utils.cuh b/genmetaballs/src/cuda/core/utils.cuh
new file mode 100644
index 0000000..cf114ae
--- /dev/null
+++ b/genmetaballs/src/cuda/core/utils.cuh
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <cstdint>
+#include <cuda_runtime.h>
+
+#define CUDA_CHECK(x)                                                                              \
+    do {                                                                                           \
+        cuda_check((x), __FILE__, __LINE__);                                                       \
+    } while (0)
+
+void cuda_check(cudaError_t code, const char* file, int line);
+
+// XXX container_t should be a thrust container type
+template <typename container_t>
+class Array2D {
+private:
+    // XXX TODO: make sure this works
+    container_t data_;
+
+public:
+    __host__ __device__ __forceinline__ container_t& at(const uint32_t i, const uint32_t j) {
+        return data_;
+        // return data_[i * width + j];
+    }
+
+    __host__ __device__ __forceinline__ const container_t& at(const uint32_t i,
+                                                              const uint32_t j) const {
+        return data_;
+        // return data_[i * width + j];
+    }
+
+    __host__ __device__ constexpr uint32_t size() const {
+        return 0;
+        // return width * height;
+    }
+};
diff --git a/genmetaballs/src/cuda/core/utils.h b/genmetaballs/src/cuda/core/utils.h
deleted file mode 100644
index 2a22197..0000000
--- a/genmetaballs/src/cuda/core/utils.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#pragma once
-
-#include <cuda_runtime.h>
-
-#define CUDA_CHECK(x) \
-    do { \
-        cuda_check((x), __FILE__, __LINE__); \
-    } while (0)
-
-void cuda_check(cudaError_t code, const char *file, int line);
diff --git a/tests/test_geometry.py b/tests/test_geometry.py
new file mode 100644
index 0000000..18bce0e
--- /dev/null
+++ b/tests/test_geometry.py
@@ -0,0 +1,29 @@
+import numpy as np
+import pytest
+
+from genmetaballs import _genmetaballs_bindings as _gmbb
+
+
+@pytest.fixture
+def rng() -> np.random.Generator:
+    return np.random.default_rng(0)
+
+
+def test_vec3d_smoke() -> None:
+    _gmbb.Vec3D(0, 0, 0)
+
+
+def test_vec3d_add(rng: np.random.Generator) -> None:
+    _a, _b = rng.uniform(size=3), rng.uniform(size=3)
+    a, b = _gmbb.Vec3D(*_a), _gmbb.Vec3D(*_b)
+    c = a + b
+    _c = _a + _b
+    assert all([np.isclose(c.x, _c[0]), np.isclose(c.y, _c[1]), np.isclose(c.z, _c[2])])
+
+
+def test_vec3d_sub(rng: np.random.Generator) -> None:
+    _a, _b = rng.uniform(size=3), rng.uniform(size=3)
+    a, b = _gmbb.Vec3D(*_a), _gmbb.Vec3D(*_b)
+    c = a - b
+    _c = _a - _b
+    assert all([np.isclose(c.x, _c[0]), np.isclose(c.y, _c[1]), np.isclose(c.z, _c[2])])