Use thread-safe gpu-lite

Luthaf · Luthaf · commit 9ecf27c6b0e9 · 2026-02-11T16:22:12.000+01:00
diff --git a/tox.ini b/tox.ini
@@ -125,7 +125,7 @@ deps = cmake
 commands =
     cmake -B {envtmpdir} -S vesin -DVESIN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug
     cmake --build {envtmpdir} --config Debug
-    ctest --test-dir {envtmpdir} --build-config Debug
+    ctest --test-dir {envtmpdir} --build-config Debug --output-on-failure
 
 
 [testenv:fortran-tests]
@@ -137,7 +137,7 @@ deps = cmake
 commands =
     cmake -B {envtmpdir} -S fortran -DVESIN_FORTRAN_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug
     cmake --build {envtmpdir} --config Debug
-    ctest --test-dir {envtmpdir} --build-config Debug
+    ctest --test-dir {envtmpdir} --build-config Debug --output-on-failure
 
 
 [testenv:lint]
diff --git a/vesin/CMakeLists.txt b/vesin/CMakeLists.txt
@@ -81,7 +81,7 @@ FetchContent_Declare(
     # GIT_REPOSITORY https://github.com/rubber-duck-debug/gpu-lite.git
     # GIT_TAG 78b4bad091e329b332def47ee9692e367a28ea85 # v1.0.0
     GIT_REPOSITORY https://github.com/Luthaf/gpu-lite.git
-    GIT_TAG f59550e6eafdf51a3fd266a50ec080f640c0991c
+    GIT_TAG e50589e9f78a154425917ebcfee56471f1d067fc
     EXCLUDE_FROM_ALL
 )
 FetchContent_MakeAvailable(gpulite)
diff --git a/vesin/src/vesin_cuda.cpp b/vesin/src/vesin_cuda.cpp
@@ -520,7 +520,7 @@ void vesin::cuda::neighbors(
     auto* d_overflow_flag = extras->overflow_flag;
     size_t max_pairs = extras->max_pairs;
 
-    auto& factory = KernelFactory::instance();
+    auto& factory = KernelFactory::instance(device_id);
 
     if (extras->box_diag == nullptr) {
         CUDART_SAFE_CALL(CUDART_INSTANCE.cudaMalloc((void**)&extras->box_diag, sizeof(double) * 3));
diff --git a/vesin/tests/CMakeLists.txt b/vesin/tests/CMakeLists.txt
@@ -23,6 +23,8 @@ else()
     set(TEST_COMMAND "")
 endif()
 
+find_package(CUDAToolkit)
+
 
 file(GLOB ALL_TESTS *.cpp)
 foreach(_file_ ${ALL_TESTS})
@@ -34,4 +36,9 @@ foreach(_file_ ${ALL_TESTS})
         NAME ${_name_}
         COMMAND ${TEST_COMMAND} $<TARGET_FILE:${_name_}>
     )
+
+    if (CUDAToolkit_FOUND)
+        target_compile_definitions(${_name_} PRIVATE VESIN_TESTS_WITH_CUDA)
+        target_link_libraries(${_name_} CUDA::cudart)
+    endif()
 endforeach()
diff --git a/vesin/tests/cuda.cpp b/vesin/tests/cuda.cpp
@@ -0,0 +1,163 @@
+#include <catch2/catch_test_macros.hpp>
+
+#ifdef VESIN_TESTS_WITH_CUDA
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include <vesin.h>
+
+void check_cuda(cudaError_t status) {
+    if (status != cudaSuccess) {
+        const char* message = cudaGetErrorString(status);
+        FAIL(message);
+    }
+}
+
+void run_cuda_test(int device_id) {
+    check_cuda(cudaSetDevice(device_id));
+
+    double points[][3] = {
+        {0.0, 0.0, 0.0},
+        {1.0, 1.0, 1.0},
+        {2.0, 2.0, 2.0},
+    };
+    size_t n_points = 3;
+    double (*d_points)[3] = nullptr;
+    check_cuda(cudaMalloc(&d_points, sizeof(double) * n_points * 3));
+    check_cuda(cudaMemcpy(d_points, points, sizeof(double) * n_points * 3, cudaMemcpyHostToDevice));
+
+    double box[3][3] = {
+        {0.0, 3.0, 3.0},
+        {3.0, 0.0, 3.0},
+        {3.0, 3.0, 0.0},
+    };
+    double (*d_box)[3] = nullptr;
+    check_cuda(cudaMalloc(&d_box, sizeof(double) * 9));
+    check_cuda(cudaMemcpy(d_box, box, sizeof(double) * 9, cudaMemcpyHostToDevice));
+
+    bool periodic[3] = {true, true, true};
+    bool* d_periodic = nullptr;
+    check_cuda(cudaMalloc(&d_periodic, sizeof(bool) * 3));
+    check_cuda(cudaMemcpy(d_periodic, periodic, sizeof(bool) * 3, cudaMemcpyHostToDevice));
+
+    VesinNeighborList neighbors;
+
+    auto options = VesinOptions();
+    options.cutoff = 3.0;
+    options.full = false;
+    options.sorted = false;
+    options.algorithm = VesinAutoAlgorithm;
+    options.return_shifts = true;
+    options.return_distances = true;
+    options.return_vectors = true;
+
+    const char* error_message = nullptr;
+    auto status = vesin_neighbors(
+        d_points,
+        n_points,
+        d_box,
+        d_periodic,
+        {VesinDeviceKind::VesinCUDA, device_id},
+        options,
+        &neighbors,
+        &error_message
+    );
+
+    REQUIRE(error_message == nullptr);
+    REQUIRE(status == EXIT_SUCCESS);
+
+    CHECK(neighbors.length == 5);
+    CHECK(neighbors.pairs != nullptr);
+    CHECK(neighbors.shifts != nullptr);
+    CHECK(neighbors.distances != nullptr);
+    CHECK(neighbors.vectors != nullptr);
+
+    auto* h_pairs = static_cast<size_t (*)[2]>(malloc(sizeof(size_t) * neighbors.length * 2));
+    check_cuda(cudaMemcpy(h_pairs, neighbors.pairs, sizeof(size_t) * neighbors.length * 2, cudaMemcpyDeviceToHost));
+
+    auto* h_shifts = static_cast<int32_t (*)[3]>(malloc(sizeof(int32_t) * neighbors.length * 3));
+    check_cuda(cudaMemcpy(h_shifts, neighbors.shifts, sizeof(int32_t) * neighbors.length * 3, cudaMemcpyDeviceToHost));
+
+    auto* h_distances = static_cast<double*>(malloc(sizeof(double) * neighbors.length));
+    check_cuda(cudaMemcpy(h_distances, neighbors.distances, sizeof(double) * neighbors.length, cudaMemcpyDeviceToHost));
+
+    auto* h_vectors = static_cast<double (*)[3]>(malloc(sizeof(double) * neighbors.length * 3));
+    check_cuda(cudaMemcpy(h_vectors, neighbors.vectors, sizeof(double) * neighbors.length * 3, cudaMemcpyDeviceToHost));
+
+    for (size_t i = 0; i < neighbors.length; ++i) {
+        if (h_pairs[i][0] == 0 && h_pairs[i][1] == 2) {
+            // we have three pairs between 0 and 2 with shifts (-1, 0, 0),
+            // (0, -1, 0), and (0, 0, -1)
+            CHECK(h_distances[i] == std::sqrt(6.0));
+
+            if (h_shifts[i][0] == -1 && h_shifts[i][1] == 0 && h_shifts[i][2] == 0) {
+                CHECK(h_vectors[i][0] == 2.0);
+                CHECK(h_vectors[i][1] == -1.0);
+                CHECK(h_vectors[i][2] == -1.0);
+            } else if (h_shifts[i][0] == 0 && h_shifts[i][1] == -1 && h_shifts[i][2] == 0) {
+                CHECK(h_vectors[i][0] == -1.0);
+                CHECK(h_vectors[i][1] == 2.0);
+                CHECK(h_vectors[i][2] == -1.0);
+            } else if (h_shifts[i][0] == 0 && h_shifts[i][1] == 0 && h_shifts[i][2] == -1) {
+                CHECK(h_vectors[i][0] == -1.0);
+                CHECK(h_vectors[i][1] == -1.0);
+                CHECK(h_vectors[i][2] == 2.0);
+            } else {
+                FAIL("Unexpected shift for pair (0, 2): (" + std::to_string(h_shifts[i][0]) + ", " + std::to_string(h_shifts[i][1]) + ", " + std::to_string(h_shifts[i][2]) + ")");
+            }
+
+        } else if ((h_pairs[i][0] == 0 && h_pairs[i][1] == 1) || (h_pairs[i][0] == 1 && h_pairs[i][1] == 2)) {
+            // pairs between 0-1 or 1-2 should have zero shifts, distance
+            // sqrt(3), and vector (1, 1, 1)
+            CHECK(h_shifts[i][0] == 0);
+            CHECK(h_shifts[i][1] == 0);
+            CHECK(h_shifts[i][2] == 0);
+
+            CHECK(h_distances[i] == std::sqrt(3.0));
+            CHECK(h_vectors[i][0] == 1.0);
+            CHECK(h_vectors[i][1] == 1.0);
+            CHECK(h_vectors[i][2] == 1.0);
+        } else {
+            FAIL("Unexpected pair: (" + std::to_string(h_pairs[i][0]) + ", " + std::to_string(h_pairs[i][1]) + ")");
+        }
+    }
+
+    // Clean up
+    vesin_free(&neighbors);
+
+    free(h_pairs);
+    free(h_shifts);
+    free(h_distances);
+    free(h_vectors);
+
+    check_cuda(cudaFree(d_points));
+    check_cuda(cudaFree(d_box));
+    check_cuda(cudaFree(d_periodic));
+}
+
+TEST_CASE("Test CUDA") {
+    // get the number of CUDA devices
+    int n_devices = 0;
+    check_cuda(cudaGetDeviceCount(&n_devices));
+    REQUIRE(n_devices > 0);
+
+    // start multiple threads to test concurrent execution
+    auto threads = std::vector<std::thread>();
+    for (int thread_id = 0; thread_id < 10; ++thread_id) {
+        std::thread t(run_cuda_test, thread_id % n_devices);
+        threads.push_back(std::move(t));
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+}
+
+#else
+
+// Empty test case compiled when VESIN_TESTS_WITH_CUDA is not defined, so this
+// file still registers one test case in that configuration.
+TEST_CASE("CUDA tests are disabled") {}
+
+#endif

Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,7 @@ FetchContent_Declare(`
`81`	`81`	`# GIT_REPOSITORY https://github.com/rubber-duck-debug/gpu-lite.git`
`82`	`82`	`# GIT_TAG 78b4bad091e329b332def47ee9692e367a28ea85 # v1.0.0`
`83`	`83`	`GIT_REPOSITORY https://github.com/Luthaf/gpu-lite.git`
`84`		`- GIT_TAG f59550e6eafdf51a3fd266a50ec080f640c0991c`
	`84`	`+ GIT_TAG e50589e9f78a154425917ebcfee56471f1d067fc`
`85`	`85`	`EXCLUDE_FROM_ALL`
`86`	`86`	`)`
`87`	`87`	`FetchContent_MakeAvailable(gpulite)`