Skip to content

Commit 65c15d7

Browse files
Merge pull request #288 from chillenzer/native-cuda-example
Add infrastructure and example for native CUDA
2 parents 510520d + 9b6f17c commit 65c15d7

File tree

14 files changed

+397
-16
lines changed

14 files changed

+397
-16
lines changed

CMakeLists.txt

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@ if(PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
1717
)
1818
endif()
1919

20+
# ---- Options ----
21+
22+
option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
23+
option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
24+
if (mallocMC_BUILD_TESTING OR mallocMC_BUILD_EXAMPLES)
25+
enable_testing()
26+
endif()
27+
if (mallocMC_BUILD_TESTING)
28+
set(alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE ON CACHE BOOL "" FORCE)
29+
endif()
30+
2031
# ---- Add dependencies via CPM ----
2132
# see https://github.com/TheLartians/CPM.cmake for more info
2233

@@ -39,6 +50,12 @@ set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
3950
if(alpaka_ACC_GPU_CUDA_ENABLE)
4051
add_controlled("Gallatin")
4152

53+
if (TARGET gallatin::gallatin)
54+
set(mallocMC_HAS_Gallatin_AVAILABLE YES)
55+
else()
56+
set(mallocMC_HAS_Gallatin_AVAILABLE NO)
57+
endif()
58+
4259
# Gallatin needs some fairly recent compute capability from CUDA.
4360
# CMake defaults to taking the oldest supported by the device
4461
# (https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html)
@@ -56,9 +73,13 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
5673
"If the architecture set is too old, this can lead to compilation errors with Gallatin. "
5774
"If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70."
5875
)
76+
set(mallocMC_HAS_Gallatin_AVAILABLE NO)
5977
endif()
6078

61-
target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
79+
if (mallocMC_HAS_Gallatin_AVAILABLE)
80+
target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
81+
target_compile_definitions(${PROJECT_NAME} INTERFACE mallocMC_HAS_Gallatin_AVAILABLE)
82+
endif()
6283
endif()
6384

6485
# being a cross-platform target, we enforce standards conformance on MSVC
@@ -68,15 +89,14 @@ target_include_directories(
6889
${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
6990
$<INSTALL_INTERFACE:include/${PROJECT_NAME}-${PROJECT_VERSION}>
7091
)
92+
target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka)
93+
7194

72-
option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
7395
if(mallocMC_BUILD_TESTING)
7496
include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
75-
enable_testing()
7697
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/test ${CMAKE_BINARY_DIR}/test)
7798
endif()
7899

79-
option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
80100
if(mallocMC_BUILD_EXAMPLES)
81101
include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
82102
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples ${CMAKE_BINARY_DIR}/examples)

cmake/package-lock.cmake

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@ CPMDeclarePackage(PackageProject.cmake
1111
# alpaka
1212
CPMDeclarePackage(alpaka
1313
NAME alpaka
14-
GIT_TAG 1.2.0
14+
# This is a development version slightly after 1.2.0 because we needed a patch
15+
GIT_TAG 95c0bf2397255a89467bb5c151a96367ad1d1f93
1516
GITHUB_REPOSITORY alpaka-group/alpaka
1617
OPTIONS
17-
"alpaka_CXX_STANDARD 20"
18+
"alpaka_CXX_STANDARD 20;alpaka_INSTALL ON"
1819
# It is recommended to let CPM cache dependencies in order to reduce redundant downloads.
1920
# However, we might in the foreseeable future turn to unstable references like the `dev` branch here.
2021
# Setting the following option tells CPM to not use the cache.

examples/CMakeLists.txt

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,22 @@ add_subdirectory(
1212
${CMAKE_BINARY_DIR}/examples/getAvailableSlots
1313
)
1414

15-
add_custom_target(
15+
check_language(CUDA)
16+
if (CMAKE_CUDA_COMPILER AND alpaka_ACC_GPU_CUDA_ENABLE)
17+
add_subdirectory(
18+
${CMAKE_CURRENT_LIST_DIR}/native-cuda
19+
${CMAKE_BINARY_DIR}/examples/native-cuda
20+
)
21+
22+
add_custom_target(
23+
mallocMCExamples
24+
DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda
25+
COMMENT "Shortcut for building all examples."
26+
)
27+
else()
28+
add_custom_target(
1629
mallocMCExamples
1730
DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots
1831
COMMENT "Shortcut for building all examples."
19-
)
32+
)
33+
endif()

examples/getAvailableSlots/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,4 @@ set_target_properties(${PROJECT_NAME}
3232
)
3333

3434
target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
35+
add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})

examples/getAvailableSlots/source/main.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,14 +136,17 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
136136
example03<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
137137
example03<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
138138
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
139+
# ifdef mallocMC_HAS_Gallatin_AVAILABLE
139140
example03<
140141
mallocMC::CreationPolicies::GallatinCuda<>,
141142
mallocMC::ReservePoolPolicies::Noop,
142143
mallocMC::AlignmentPolicies::Noop>();
143144
// GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
144145
example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
146+
# else
145147
// This should normally be:
146-
// example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
148+
example03<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
149+
# endif
147150
#else
148151
example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
149152
#endif
examples/native-cuda/CMakeLists.txt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
cmake_minimum_required(VERSION 3.14...3.22)
2+
3+
project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA)
4+
5+
# --- Import tools ----
6+
7+
include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)
8+
9+
# ---- Dependencies ----
10+
11+
include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
12+
CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)
13+
14+
if(NOT TARGET mallocMC)
15+
CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
16+
endif()
17+
18+
# ---- Create standalone executable ----
19+
20+
add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu)
21+
22+
set_target_properties(${PROJECT_NAME}
23+
PROPERTIES
24+
CXX_STANDARD 20
25+
OUTPUT_NAME ${PROJECT_NAME}
26+
CXX_STANDARD_REQUIRED ON
27+
CXX_EXTENSIONS OFF
28+
)
29+
30+
target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES})
31+
add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
examples/native-cuda/source/main.cu

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
mallocMC: Memory Allocator for Many Core Architectures.
3+
https://www.hzdr.de/crp
4+
5+
Copyright 2025 Institute of Radiation Physics,
6+
Helmholtz-Zentrum Dresden - Rossendorf
7+
8+
Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
9+
10+
Permission is hereby granted, free of charge, to any person obtaining a copy
11+
of this software and associated documentation files (the "Software"), to deal
12+
in the Software without restriction, including without limitation the rights
13+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14+
copies of the Software, and to permit persons to whom the Software is
15+
furnished to do so, subject to the following conditions:
16+
17+
The above copyright notice and this permission notice shall be included in
18+
all copies or substantial portions of the Software.
19+
20+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26+
THE SOFTWARE.
27+
*/
28+
29+
#include <mallocMC/mallocMC.cuh>
30+
31+
#include <cstdint>
32+
#include <cstdlib>
33+
#include <functional>
34+
#include <span>
35+
36+
/**
37+
* @brief Computes the sum of squares of the first `n` natural numbers.
38+
*
39+
* This function calculates the sum of squares of the first `n` natural numbers using the formula:
40+
* \[
41+
* \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6}
42+
* \]
43+
* It's used to check the computed value in the kernel.
44+
*
45+
* @param n The number of natural numbers to consider.
46+
* @return The sum of squares of the first `n` natural numbers.
47+
*/
48+
__device__ auto sumOfSquares(auto const n)
49+
{
50+
return (n * (n + 1) * (2 * n + 1)) / 6;
51+
}
52+
53+
/**
54+
* @brief Computes the dot product of two vectors for each thread.
55+
*
56+
* This kernel computes the dot product of two vectors, `a` and `b`, for each thread.
57+
* Each thread allocates memory for its own vectors, initializes them with consecutive values,
58+
* computes the dot product, and checks if the result matches the expected value.
59+
* If the result does not match, the thread prints an error message and halts execution.
60+
*
61+
* @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
62+
* @param numValues The number of elements in each vector.
63+
*
64+
* @note This kernel is, of course, not very realistic as a workload but it fulfills its purpose of showcasing a
65+
* native CUDA application.
66+
*/
67+
__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
68+
{
69+
uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
70+
71+
// Not very realistic, all threads are doing this on their own:
72+
auto a = std::span<uint64_t>(
73+
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
74+
numValues);
75+
auto b = std::span<uint64_t>(
76+
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
77+
numValues);
78+
79+
std::iota(std::begin(a), std::end(a), tid);
80+
std::iota(std::begin(b), std::end(b), tid);
81+
82+
uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), 0U);
83+
84+
auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
85+
if(result != expected)
86+
{
87+
printf("Thread %lu: Result %lu != Expected %lu. \n", tid, result, expected);
88+
__trap();
89+
}
90+
91+
memoryManager.free(a.data());
92+
memoryManager.free(b.data());
93+
}
94+
95+
int main()
96+
{
97+
size_t const heapSize = 1024U * 1024U * 1024U;
98+
uint64_t const numValues = 32U;
99+
mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
100+
auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};
101+
102+
std::cout << "Running native CUDA kernel." << std::endl;
103+
oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);
104+
}

examples/vectorAdd/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,5 @@ set_target_properties(${PROJECT_NAME}
3232
)
3333

3434
target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
35+
36+
add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})

examples/vectorAdd/source/main.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,15 +229,19 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
229229
{
230230
example01<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
231231
example01<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
232+
232233
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
234+
# ifdef mallocMC_HAS_Gallatin_AVAILABLE
233235
example01<
234236
mallocMC::CreationPolicies::GallatinCuda<>,
235237
mallocMC::ReservePoolPolicies::Noop,
236238
mallocMC::AlignmentPolicies::Noop>();
237239
// GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
238240
example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
241+
# else
239242
// This should normally be:
240-
// example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
243+
example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
244+
# endif
241245
#else
242246
example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
243247
#endif

include/mallocMC/allocator.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ namespace mallocMC
198198
}
199199

200200
ALPAKA_FN_HOST
201-
auto getAllocatorHandle() -> AllocatorHandle
201+
auto getAllocatorHandle() const -> AllocatorHandle
202202
{
203203
return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};
204204
}

0 commit comments

Comments
 (0)