Merge pull request #288 from chillenzer/native-cuda-example

psychocoderHPC · web-flow · commit 65c15d7cc46a · 2025-02-12T11:22:36.000+01:00
Add infrastructure and example for native CUDA
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -17,6 +17,17 @@ if(PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
   )
 endif()
 
+# ---- Options ----
+
+option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
+option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
+if (mallocMC_BUILD_TESTING OR mallocMC_BUILD_EXAMPLES)
+  enable_testing()
+endif()
+if (mallocMC_BUILD_TESTING)
+  set(alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE ON CACHE BOOL "" FORCE)
+endif()
+
 # ---- Add dependencies via CPM ----
 # see https://github.com/TheLartians/CPM.cmake for more info
 
@@ -39,6 +50,12 @@ set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
 if(alpaka_ACC_GPU_CUDA_ENABLE)
   add_controlled("Gallatin")
 
+  if (TARGET gallatin::gallatin)
+    set(mallocMC_HAS_Gallatin_AVAILABLE YES)
+  else()
+    set(mallocMC_HAS_Gallatin_AVAILABLE NO)
+  endif()
+
   # Gallatin needs some fairly recent compute capability from CUDA.
   # CMake defaults to taking the oldest supported by the device
   # (https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html)
@@ -56,9 +73,13 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
         "If the architecture set is too old, this can lead to compilation errors with Gallatin. "
         "If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70."
       )
+    set(mallocMC_HAS_Gallatin_AVAILABLE NO)
   endif()
 
-  target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
+  if (mallocMC_HAS_Gallatin_AVAILABLE)
+    target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
+    target_compile_definitions(${PROJECT_NAME} INTERFACE mallocMC_HAS_Gallatin_AVAILABLE)
+  endif()
 endif()
 
 # being a cross-platform target, we enforce standards conformance on MSVC
@@ -68,15 +89,14 @@ target_include_directories(
   ${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
                          $<INSTALL_INTERFACE:include/${PROJECT_NAME}-${PROJECT_VERSION}>
 )
+target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka)
+
 
-option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
 if(mallocMC_BUILD_TESTING)
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
-  enable_testing()
   add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/test ${CMAKE_BINARY_DIR}/test)
 endif()
 
-option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
 if(mallocMC_BUILD_EXAMPLES)
   include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
   add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples ${CMAKE_BINARY_DIR}/examples)
diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake
@@ -11,10 +11,11 @@ CPMDeclarePackage(PackageProject.cmake
 # alpaka
 CPMDeclarePackage(alpaka
   NAME alpaka
-  GIT_TAG 1.2.0
+  # This is a development version slightly after 1.2.0 because we needed a patch
+  GIT_TAG 95c0bf2397255a89467bb5c151a96367ad1d1f93
   GITHUB_REPOSITORY alpaka-group/alpaka
   OPTIONS
-    "alpaka_CXX_STANDARD 20"
+    "alpaka_CXX_STANDARD 20;alpaka_INSTALL ON"
   # It is recommended to let CPM cache dependencies in order to reduce redundant downloads.
   # However, we might in the foreseeable future turn to unstable references like the `dev` branch here.
   # Setting the following option tells CPM to not use the cache.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -12,8 +12,22 @@ add_subdirectory(
     ${CMAKE_BINARY_DIR}/examples/getAvailableSlots
 )
 
-add_custom_target(
+check_language(CUDA)
+if (CMAKE_CUDA_COMPILER AND alpaka_ACC_GPU_CUDA_ENABLE)
+  add_subdirectory(
+    ${CMAKE_CURRENT_LIST_DIR}/native-cuda
+    ${CMAKE_BINARY_DIR}/examples/native-cuda
+  )
+
+  add_custom_target(
+    mallocMCExamples
+    DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda
+    COMMENT "Shortcut for building all examples."
+  )
+else()
+  add_custom_target(
     mallocMCExamples
     DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots
     COMMENT "Shortcut for building all examples."
-)
+  )
+endif()
diff --git a/examples/getAvailableSlots/CMakeLists.txt b/examples/getAvailableSlots/CMakeLists.txt
@@ -32,3 +32,4 @@ set_target_properties(${PROJECT_NAME}
 )
 
 target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/getAvailableSlots/source/main.cpp b/examples/getAvailableSlots/source/main.cpp
@@ -136,14 +136,17 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
     example03<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
     example03<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    ifdef mallocMC_HAS_Gallatin_AVAILABLE
     example03<
         mallocMC::CreationPolicies::GallatinCuda<>,
         mallocMC::ReservePoolPolicies::Noop,
         mallocMC::AlignmentPolicies::Noop>();
     // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
     example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+#    else
     // This should normally be:
-    //    example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+    example03<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+#    endif
 #else
     example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
 #endif
diff --git a/examples/native-cuda/CMakeLists.txt b/examples/native-cuda/CMakeLists.txt
@@ -0,0 +1,31 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA)
+
+# --- Import tools ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)
+
+# ---- Dependencies ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
+CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)
+
+if(NOT TARGET mallocMC)
+  CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+endif()
+
+# ---- Create standalone executable ----
+
+add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu)
+
+set_target_properties(${PROJECT_NAME}
+    PROPERTIES
+        CXX_STANDARD 20
+        OUTPUT_NAME ${PROJECT_NAME}
+        CXX_STANDARD_REQUIRED ON
+        CXX_EXTENSIONS OFF
+  )
+
+target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES})
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/native-cuda/source/main.cu b/examples/native-cuda/source/main.cu
@@ -0,0 +1,104 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2025 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include <mallocMC/mallocMC.cuh>
+
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <span>
+
+/**
+ * @brief Computes the sum of squares of the first `n` natural numbers.
+ *
+ * This function calculates the sum of squares of the first `n` natural numbers using the formula:
+ * \[
+ * \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6}
+ * \]
+ * It's used to check the computed value in the kernel.
+ *
+ * @param n The number of natural numbers to consider.
+ * @return The sum of squares of the first `n` natural numbers.
+ */
+__device__ auto sumOfSquares(auto const n)
+{
+    return (n * (n + 1) * (2 * n + 1)) / 6;
+}
+
+/**
+ * @brief Computes the dot product of two vectors for each thread.
+ *
+ * This kernel computes the dot product of two vectors, `a` and `b`, for each thread.
+ * Each thread allocates memory for its own vectors, initializes them with consecutive values,
+ * computes the dot product, and checks if the result matches the expected value.
+ * If the result does not match, the thread prints an error message and halts execution.
+ *
+ * @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
+ * @param numValues The number of elements in each vector.
+ *
+ * @note This kernnel is, of course, not very realistic as a workload but it fulfills its purpose of showcasing a
+ * native CUDA application.
+ */
+__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
+{
+    uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Not very realistic, all threads are doing this on their own:
+    auto a = std::span<uint64_t>(
+        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
+        numValues);
+    auto b = std::span<uint64_t>(
+        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
+        numValues);
+
+    std::iota(std::begin(a), std::end(a), tid);
+    std::iota(std::begin(b), std::end(b), tid);
+
+    uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), 0U);
+
+    auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
+    if(result != expected)
+    {
+        printf("Thread %lu: Result %lu != Expected %lu. \n", tid, result, expected);
+        __trap();
+    }
+
+    memoryManager.free(a.data());
+    memoryManager.free(b.data());
+}
+
+int main()
+{
+    size_t const heapSize = 1024U * 1024U * 1024U;
+    uint64_t const numValues = 32U;
+    mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
+    auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};
+
+    std::cout << "Running native CUDA kernel." << std::endl;
+    oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);
+}
diff --git a/examples/vectorAdd/CMakeLists.txt b/examples/vectorAdd/CMakeLists.txt
@@ -32,3 +32,5 @@ set_target_properties(${PROJECT_NAME}
 )
 
 target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
+
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/vectorAdd/source/main.cpp b/examples/vectorAdd/source/main.cpp
@@ -229,15 +229,19 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
 {
     example01<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
     example01<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    ifdef mallocMC_HAS_Gallatin_AVAILABLE
     example01<
         mallocMC::CreationPolicies::GallatinCuda<>,
         mallocMC::ReservePoolPolicies::Noop,
         mallocMC::AlignmentPolicies::Noop>();
     // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
     example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+#    else
     // This should normally be:
-    //    example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+    example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+#    endif
 #else
     example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
 #endif
diff --git a/include/mallocMC/allocator.hpp b/include/mallocMC/allocator.hpp
@@ -198,7 +198,7 @@ namespace mallocMC
         }
 
         ALPAKA_FN_HOST
-        auto getAllocatorHandle() -> AllocatorHandle
+        auto getAllocatorHandle() const -> AllocatorHandle
         {
             return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};
         }
diff --git a/include/mallocMC/creationPolicies/GallatinCuda.hpp b/include/mallocMC/creationPolicies/GallatinCuda.hpp
@@ -30,7 +30,7 @@
 
 #include <alpaka/alpaka.hpp>
 
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef mallocMC_HAS_Gallatin_AVAILABLE
 #    include <gallatin/allocators/gallatin.cuh>
 #else
 
@@ -45,6 +45,23 @@ namespace gallatin::allocators
         {
             return nullptr;
         }
+
+        template<typename... T>
+        auto malloc(T... /*unused*/) -> void*
+        {
+            // This always triggers but it depends on the template parameter, so it's only instantiated if we actually
+            // use it.
+            static_assert(sizeof...(T) < 0, "Attempt to use malloc of unavailable gallatin prototype.");
+            return nullptr;
+        }
+
+        template<typename... T>
+        auto free(T... /*unused*/)
+        {
+            // This always triggers but it depends on the template parameter, so it's only instantiated if we actually
+            // use it.
+            static_assert(sizeof...(T) < 0, "Attempt to use free of unavailable gallatin prototype.");
+        }
     };
 } // namespace gallatin::allocators
 
@@ -89,7 +106,7 @@ namespace mallocMC
             static constexpr auto providesAvailableSlots = false;
 
             template<typename AlpakaAcc>
-            ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32_t bytes) const -> void*
+            ALPAKA_FN_ACC auto create(AlpakaAcc const& /*acc*/, uint32_t bytes) const -> void*
             {
                 return heap->malloc(static_cast<size_t>(bytes));
             }
@@ -107,7 +124,7 @@ namespace mallocMC
 
             template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
             static void initHeap(
-                AlpakaDevice& dev,
+                AlpakaDevice& /*dev*/,
                 AlpakaQueue& queue,
                 T_DeviceAllocator* devAllocator,
                 void*,
diff --git a/include/mallocMC/mallocMC.cuh b/include/mallocMC/mallocMC.cuh
diff --git a/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
diff --git a/test/unit/source/Allocator.cpp b/test/unit/source/Allocator.cpp

Original file line number	Diff line number	Diff line change
`@@ -32,3 +32,4 @@ set_target_properties(${PROJECT_NAME}`
`32`	`32`	`)`
`33`	`33`
`34`	`34`	`target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)`
	`35`	`+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})`
Original file line number	Diff line number	Diff line change
`@@ -32,3 +32,5 @@ set_target_properties(${PROJECT_NAME}`
`32`	`32`	`)`
`33`	`33`
`34`	`34`	`target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)`
	`35`	`+`
	`36`	`+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})`
Original file line number	Diff line number	Diff line change
`@@ -198,7 +198,7 @@ namespace mallocMC`
`198`	`198`	`}`
`199`	`199`
`200`	`200`	`ALPAKA_FN_HOST`
`201`		`- auto getAllocatorHandle() -> AllocatorHandle`
	`201`	`+ auto getAllocatorHandle() const -> AllocatorHandle`
`202`	`202`	`{`
`203`	`203`	`return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};`
`204`	`204`	`}`