|
/*
  mallocMC: Memory Allocator for Many Core Architectures.
  https://www.hzdr.de/crp

  Copyright 2025 Institute of Radiation Physics,
                 Helmholtz-Zentrum Dresden - Rossendorf

  Author(s):  Julian Lenz - j.lenz ( at ) hzdr.de

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/
| 28 | + |
#include <mallocMC/mallocMC.cuh>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <numeric>
#include <span>
| 35 | + |
/**
 * @brief Computes the sum of squares of the first `n` natural numbers.
 *
 * This function evaluates the closed-form expression
 * \f[
 *   \sum_{k=1}^{n} k^2 = \frac{n \, (n + 1) \, (2n + 1)}{6}
 * \f]
 * It's used to check the computed value in the kernel.
 *
 * The function is `constexpr` and callable from both host and device code, so the reference value can also be
 * computed on the host or verified at compile time.
 *
 * @note All arithmetic is carried out in the (integer-promoted) type of `n`. The intermediate product
 * `n * (n + 1) * (2 * n + 1)` is six times larger than the result, so it can overflow before the result itself
 * would; callers must pick an argument type that is wide enough.
 *
 * @param n The number of natural numbers to consider.
 * @return The sum of squares of the first `n` natural numbers.
 */
__host__ __device__ constexpr auto sumOfSquares(auto const n)
{
    return (n * (n + 1) * (2 * n + 1)) / 6;
}
| 52 | + |
/**
 * @brief Computes the dot product of two vectors for each thread.
 *
 * This kernel computes the dot product of two vectors, `a` and `b`, for each thread.
 * Each thread allocates memory for its own vectors, initializes both with the consecutive values
 * `tid, tid + 1, ..., tid + numValues - 1`, computes the dot product, and checks if the result matches the
 * expected value (a difference of two `sumOfSquares` terms).
 * If an allocation fails or the result does not match, the thread prints an error message and halts execution
 * via `__trap()`.
 *
 * The flat thread index is derived from the x-dimension only, i.e. a 1D grid of 1D blocks is expected.
 *
 * @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
 * @param numValues The number of elements in each vector.
 *
 * @note This kernel is, of course, not very realistic as a workload but it fulfills its purpose of showcasing a
 * native CUDA application.
 * @note NOTE(review): `std::span`, `std::iota` and `std::transform_reduce` are host `constexpr` functions; calling
 * them from device code presumably relies on the build enabling nvcc's `--expt-relaxed-constexpr` - confirm
 * against the build configuration.
 */
__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
{
    // Widen before multiplying so the flat index is computed in 64 bits rather than wrapping in 32 bits.
    uint64_t const tid = threadIdx.x + static_cast<uint64_t>(blockIdx.x) * blockDim.x;

    // Not very realistic, all threads are doing this on their own:
    auto* const rawA = reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t)));
    auto* const rawB = reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t)));

    // Presumably the allocator signals an exhausted heap with a null pointer (TODO confirm); never write
    // through it. Zero-sized requests are exempt because nothing is dereferenced for them.
    if(numValues != 0U && (rawA == nullptr || rawB == nullptr))
    {
        printf(
            "Thread %llu: Allocation of %llu values failed. \n",
            static_cast<unsigned long long>(tid),
            static_cast<unsigned long long>(numValues));
        __trap();
    }

    auto a = std::span<uint64_t>(rawA, numValues);
    auto b = std::span<uint64_t>(rawB, numValues);

    std::iota(std::begin(a), std::end(a), tid);
    std::iota(std::begin(b), std::end(b), tid);

    // The type of the initial value is the accumulator type: it must be 64 bits wide, otherwise the sum is
    // silently reduced modulo 2^32.
    uint64_t const result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), uint64_t{0});

    // sum_{k=tid}^{tid+numValues-1} k^2 = S(tid + numValues - 1) - S(tid - 1), with S(-1) taken as 0.
    auto const expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
    if(result != expected)
    {
        // %llu with explicit casts is correct on every platform, unlike %lu for uint64_t.
        printf(
            "Thread %llu: Result %llu != Expected %llu. \n",
            static_cast<unsigned long long>(tid),
            static_cast<unsigned long long>(result),
            static_cast<unsigned long long>(expected));
        __trap();
    }

    memoryManager.free(a.data());
    memoryManager.free(b.data());
}
| 94 | + |
/**
 * @brief Entry point: sets up the allocator, launches the example kernel and reports its outcome.
 *
 * Constructs the host infrastructure with a heap size of 1 GiB, creates the memory manager from it and launches
 * `oneDotProductPerThread` with 8 blocks of 256 threads and 32 values per vector.
 *
 * @return `EXIT_SUCCESS` if the kernel was launched and ran to completion without a CUDA error,
 *         `EXIT_FAILURE` otherwise.
 */
int main()
{
    size_t const heapSize = 1024U * 1024U * 1024U;
    uint64_t const numValues = 32U;
    mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
    auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};

    std::cout << "Running native CUDA kernel." << std::endl;
    oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);

    // A kernel launch is asynchronous and returns no status: launch errors are reported by cudaGetLastError(),
    // execution errors (e.g. the kernel's __trap()) only by the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if(status == cudaSuccess)
    {
        // Wait for the kernel so that it has actually finished before the program exits.
        status = cudaDeviceSynchronize();
    }
    if(status != cudaSuccess)
    {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
0 commit comments