Make KernelDescriptor bind to the current CUDA context.

stijnh · stijnh · commit 02d0bb96bad6 · 2023-11-23T13:36:48.000+01:00
A `KernelDescriptor` will now store a reference to the current CUDA context
and only compare equal to another `KernelDescriptor` if they have a matching
CUDA context. This prevents bugs when dealing with multiple CUDA contexts and
switching between these. Previously, it was assumed that there was only a single
CUDA context.
diff --git a/include/kernel_launcher/arg.h b/include/kernel_launcher/arg.h
@@ -3,6 +3,7 @@
 
 #include <cuda.h>
 
+#include <array>
 #include <cstring>
 #include <iostream>
 #include <utility>
diff --git a/include/kernel_launcher/cuda.h b/include/kernel_launcher/cuda.h
@@ -192,7 +192,7 @@ struct CudaDevice {
  */
 struct CudaContextHandle {
     CudaContextHandle() = default;
-    CudaContextHandle(CUcontext c) : context_(c) {};
+    CudaContextHandle(CUcontext c);
 
     /**
      * Returns the current CUDA context or throws an error if CUDA has not
@@ -205,17 +205,25 @@ struct CudaContextHandle {
      */
     CudaDevice device() const;
 
-    void with(std::function<void()> f) const;
-
     /**
      * Returns the underlying `CUcontext`.
      */
     CUcontext get() const {
         return context_;
     }
 
+    bool operator==(const CudaContextHandle& that) const;
+    bool operator!=(const CudaContextHandle& that) const;
+
   private:
     CUcontext context_ = nullptr;
+    unsigned long long id_ = ~0ULL;
+};
+
+struct CudaContextGuard {
+    CudaContextGuard(CudaContextHandle ctx) : CudaContextGuard(ctx.get()) {}
+    CudaContextGuard(CUcontext ctx);
+    ~CudaContextGuard();
 };
 
 /**
diff --git a/include/kernel_launcher/registry.h b/include/kernel_launcher/registry.h
@@ -53,14 +53,28 @@ struct KernelDescriptor {
     KernelDescriptor(KernelDescriptor&) noexcept = default;
     KernelDescriptor(const KernelDescriptor&) = default;
 
-    template<typename D>
-    KernelDescriptor(D&& descriptor) {
-        using T = typename std::decay<D>::type;
-        descriptor_type_ = type_of<T>();
-        descriptor_ = std::make_shared<T>(std::forward<D>(descriptor));
-        hash_ = hash_fields(descriptor_type_, descriptor_->hash());
+    KernelDescriptor(
+        std::shared_ptr<IKernelDescriptor> descriptor,
+        CudaContextHandle ctx = CudaContextHandle::current()) :
+        ctx_(ctx),
+        descriptor_(std::move(descriptor)) {
+        const IKernelDescriptor& inner = *descriptor_;
+        hash_ = hash_fields(
+            typeid(inner).hash_code(),
+            descriptor_->hash(),
+            ctx_.get());
     }
 
+    template<typename D>
+    KernelDescriptor(std::shared_ptr<D> descriptor) :
+        KernelDescriptor(
+            std::shared_ptr<IKernelDescriptor>(std::move(descriptor))) {}
+
+    template<typename D>
+    KernelDescriptor(D&& descriptor) :
+        KernelDescriptor(
+            std::make_shared<std::decay_t<D>>(std::forward<D>(descriptor))) {}
+
     const IKernelDescriptor& get() const {
         return *descriptor_;
     }
@@ -70,7 +84,7 @@ struct KernelDescriptor {
     }
 
     bool operator==(const KernelDescriptor& that) const {
-        return that.hash_ == hash_ && that.descriptor_type_ == descriptor_type_
+        return that.hash_ == hash_ && that.ctx_ == ctx_
             && that.descriptor_->equals(*descriptor_);
     }
 
@@ -80,7 +94,7 @@ struct KernelDescriptor {
 
   private:
     hash_t hash_;
-    TypeInfo descriptor_type_;
+    CudaContextHandle ctx_;
     std::shared_ptr<IKernelDescriptor> descriptor_;
 };
 }  // namespace kernel_launcher
diff --git a/src/cuda.cpp b/src/cuda.cpp
@@ -8,19 +8,23 @@
 
 namespace kernel_launcher {
 
-void cuda_check(CUresult result, const char* msg) {
-    if (result != CUDA_SUCCESS) {
-        const char* name = "???";
-        const char* description = "???";
+CudaException build_cuda_exception(CUresult& result, const char* msg) {
+    const char* name = "???";
+    const char* description = "???";
 
-        // Ignore error since we are already handling another error
-        cuGetErrorName(result, &name);
-        cuGetErrorString(result, &description);
+    // Ignore error since we are already handling another error
+    cuGetErrorName(result, &name);
+    cuGetErrorString(result, &description);
+
+    std::stringstream display;
+    display << "CUDA error: " << name << " (" << description << "): " << msg;
+    auto e = CudaException(result, display.str());
+    return e;
+}
 
-        std::stringstream display;
-        display << "CUDA error: " << name << " (" << description
-                << "): " << msg;
-        throw CudaException(result, display.str());
+void cuda_check(CUresult result, const char* msg) {
+    if (result != CUDA_SUCCESS) {
+        throw build_cuda_exception(result, msg);
     }
 }
 
@@ -129,7 +133,15 @@ std::string CudaDevice::uuid() const {
 CudaArch CudaDevice::arch() const {
     int minor = attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
     int major = attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
-    return CudaArch(major, minor);
+    return {major, minor};
+}
+
+CudaContextHandle::CudaContextHandle(CUcontext c) {
+    context_ = c;
+
+#if CUDA_VERSION >= 12000
+    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxGetId(context_, &id_));
+#endif
 }
 
 CudaContextHandle CudaContextHandle::current() {
@@ -146,21 +158,27 @@ CudaContextHandle CudaContextHandle::current() {
 }
 
 CudaDevice CudaContextHandle::device() const {
+    CudaContextGuard guard {context_};
     CUdevice d = -1;
-    with([&]() { KERNEL_LAUNCHER_CUDA_CHECK(cuCtxGetDevice(&d)); });
+    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxGetDevice(&d));
     return CudaDevice(d);
 }
 
-void CudaContextHandle::with(std::function<void()> f) const {
-    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxPushCurrent(context_));
-    try {
-        f();
-        KERNEL_LAUNCHER_CUDA_CHECK(cuCtxPopCurrent(nullptr));
-    } catch (...) {
-        // Ignore errors. There is not much we can do at this point.
-        cuCtxPopCurrent(nullptr);
-        throw;
-    }
+bool CudaContextHandle::operator==(const CudaContextHandle& that) const {
+    return id_ == that.id_ && context_ == that.context_;
+}
+
+bool CudaContextHandle::operator!=(const CudaContextHandle& that) const {
+    return !(*this == that);
+}
+
+CudaContextGuard::CudaContextGuard(CUcontext ctx) {
+    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxPushCurrent(ctx));
+}
+
+CudaContextGuard::~CudaContextGuard() {
+    CUcontext popped = nullptr;  // a throwing check here would terminate
+    (void)cuCtxPopCurrent(&popped);  // best effort: destructors are noexcept
 }
 
 void cuda_raw_copy(const void* src, void* dst, size_t num_bytes) {
diff --git a/tests/registry.cpp b/tests/registry.cpp
@@ -15,6 +15,110 @@ struct VectorAddDescriptor: IKernelDescriptor {
     }
 };
 
+struct MatrixMulDescriptor: IKernelDescriptor {
+    MatrixMulDescriptor(int size) : size_(size) {}
+
+    KernelBuilder build() const override {
+        return KernelBuilder("matrix_mul", "TODO");
+    }
+
+    bool equals(const IKernelDescriptor& that) const override {
+        if (auto ptr = dynamic_cast<const MatrixMulDescriptor*>(&that)) {
+            return ptr->size_ == size_;
+        } else {
+            return false;
+        }
+    }
+
+    hash_t hash() const override {
+        return size_;
+    }
+
+    int size_;
+};
+
+TEST_CASE("KernelDescriptor", "[CUDA]") {
+    CUcontext ctx, ctx2;
+    KERNEL_LAUNCHER_CUDA_CHECK(cuInit(0));
+
+    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxCreate(&ctx, 0, 0));
+    auto a = KernelDescriptor(VectorAddDescriptor());
+    auto b = KernelDescriptor(std::make_shared<MatrixMulDescriptor>(1));
+    auto c = KernelDescriptor(std::shared_ptr<IKernelDescriptor>(
+        std::make_shared<MatrixMulDescriptor>(1)));
+    auto d = KernelDescriptor(MatrixMulDescriptor(2));
+
+    // A KernelDescriptor is based on the current CUDA context.
+    // Creating a new CUDA context here will mean that new descriptors will be
+    // based on a different CUDA context than before.
+    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxCreate(&ctx2, 0, 0));
+    auto e = KernelDescriptor(VectorAddDescriptor());
+
+    CHECK(a == a);
+    CHECK_FALSE(a == b);
+    CHECK_FALSE(a == c);
+    CHECK_FALSE(a == d);
+    CHECK_FALSE(a == e);
+
+    CHECK_FALSE(b == a);
+    CHECK(b == b);
+    CHECK(b == c);
+    CHECK_FALSE(b == d);
+    CHECK_FALSE(b == e);
+
+    CHECK_FALSE(c == a);
+    CHECK(c == b);
+    CHECK(c == c);
+    CHECK_FALSE(c == d);
+    CHECK_FALSE(c == e);
+
+    CHECK_FALSE(d == a);
+    CHECK_FALSE(d == b);
+    CHECK_FALSE(d == c);
+    CHECK(d == d);
+    CHECK_FALSE(d == e);
+
+    CHECK_FALSE(e == a);
+    CHECK_FALSE(e == b);
+    CHECK_FALSE(e == c);
+    CHECK_FALSE(e == d);
+    CHECK(e == e);
+
+    // These match the ones above
+    CHECK(a.hash() == a.hash());
+    CHECK_FALSE(a.hash() == b.hash());
+    CHECK_FALSE(a.hash() == c.hash());
+    CHECK_FALSE(a.hash() == d.hash());
+    CHECK_FALSE(a.hash() == e.hash());
+
+    CHECK_FALSE(b.hash() == a.hash());
+    CHECK(b.hash() == b.hash());
+    CHECK(b.hash() == c.hash());
+    CHECK_FALSE(b.hash() == d.hash());
+    CHECK_FALSE(b.hash() == e.hash());
+
+    CHECK_FALSE(c.hash() == a.hash());
+    CHECK(c.hash() == b.hash());
+    CHECK(c.hash() == c.hash());
+    CHECK_FALSE(c.hash() == d.hash());
+    CHECK_FALSE(c.hash() == e.hash());
+
+    CHECK_FALSE(d.hash() == a.hash());
+    CHECK_FALSE(d.hash() == b.hash());
+    CHECK_FALSE(d.hash() == c.hash());
+    CHECK(d.hash() == d.hash());
+    CHECK_FALSE(d.hash() == e.hash());
+
+    CHECK_FALSE(e.hash() == a.hash());
+    CHECK_FALSE(e.hash() == b.hash());
+    CHECK_FALSE(e.hash() == c.hash());
+    CHECK_FALSE(e.hash() == d.hash());
+    CHECK(e.hash() == e.hash());
+
+    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxDestroy(ctx));
+    KERNEL_LAUNCHER_CUDA_CHECK(cuCtxDestroy(ctx2));
+}
+
 TEST_CASE("KernelRegistry", "[CUDA]") {
     CUcontext ctx;
     KERNEL_LAUNCHER_CUDA_CHECK(cuInit(0));