|
#include <kitgenbench/DeviceClock.h>
#include <kitgenbench/kitgenbench.h>
#include <kitgenbench/setup.h>
#include <kitgenbench/version.h>

#include <alpaka/workdiv/WorkDivMembers.hpp>
#include <cstdint>
#include <limits>
#include <mallocMC/mallocMC.hpp>
#include <tuple>
#include <utility>
#include <variant>
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
#  include <cuda_runtime.h>
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#include <alpaka/acc/AccCpuSerial.hpp>
#include <alpaka/acc/Tag.hpp>
#include <alpaka/atomic/Traits.hpp>
#include <alpaka/core/Common.hpp>
#include <alpaka/mem/buf/Traits.hpp>
#include <alpaka/mem/view/Traits.hpp>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <span>
#include <type_traits>
#include <vector>

#include "nlohmann/json.hpp"
#include "nlohmann/json_fwd.hpp"
| 28 | + |
using nlohmann::json;
using namespace kitgenbench;

// One-dimensional kernels indexed with 32-bit integers.
using Dim = alpaka::DimInt<1>;
using Idx = std::uint32_t;
// Pick the first accelerator tag that was enabled at configure time.
using AccTag = std::remove_cvref_t<decltype(std::get<0>(alpaka::EnabledAccTags{}))>;
using Acc = alpaka::TagToAcc<AccTag, Dim, Idx>;

using namespace mallocMC;
// mallocMC allocator configuration: FlatterScatter creation, no distribution,
// nullptr on out-of-memory, pool reserved via an alpaka buffer, shrink alignment.
using MyAllocator
    = mallocMC::Allocator<Acc, CreationPolicies::FlatterScatter<>, DistributionPolicies::Noop,
                          OOMPolicies::ReturnNull, ReservePoolPolicies::AlpakaBuf<Acc>,
                          AlignmentPolicies::Shrink<>>;
| 42 | + |
namespace kitgenbench::Actions {
  // Local action ids for malloc/free steps. NOTE(review): Actions::CHECK and
  // Actions::STOP are referenced below but declared elsewhere (presumably the
  // kitgenbench headers) — confirm these values do not collide with them.
  [[maybe_unused]] static constexpr int MALLOC = 1;
  [[maybe_unused]] static constexpr int FREE = 2;
}  // namespace kitgenbench::Actions
| 47 | + |
| 48 | +auto makeExecutionDetails() { |
| 49 | + auto const platformAcc = alpaka::Platform<Acc>{}; |
| 50 | + auto const dev = alpaka::getDevByIdx(platformAcc, 0); |
| 51 | +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED |
| 52 | + cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024U * 1024U * 1024U); |
| 53 | +#endif |
| 54 | + uint32_t const numThreadsPerBlock = 256U; |
| 55 | + uint32_t const numThreads = 4U * numThreadsPerBlock; |
| 56 | + auto workdiv = [numThreads, numThreadsPerBlock]() -> alpaka::WorkDivMembers<Dim, Idx> { |
| 57 | + if constexpr (std::is_same_v<alpaka::AccToTag<Acc>, alpaka::TagCpuSerial>) { |
| 58 | + return {{1U}, {1U}, {numThreads}}; |
| 59 | + } else { |
| 60 | + return alpaka::WorkDivMembers<Dim, Idx>{ |
| 61 | + {numThreads / numThreadsPerBlock}, {numThreadsPerBlock}, {1U}}; |
| 62 | + } |
| 63 | + }(); |
| 64 | + return kitgenbench::ExecutionDetails<Acc, decltype(dev)>{workdiv, dev}; |
| 65 | +} |
| 66 | + |
// Size in bytes of every single allocation performed by the benchmark recipe.
static constexpr std::uint32_t ALLOCATION_SIZE = 16U;

// Reasons for the check to yield the result it yielded.
// `completed` means that the check completed. The result can still be true/false depending on
// whether the obtained value was actually correct. `notApplicable` means that the checks were
// skipped. `nullpointer` means that a nullpointer was given, so the checks couldn't run at all.
enum class Reason { completed, notApplicable, nullpointer };
// A step's payload is either the freshly allocated memory (a fixed-size span of
// bytes) or a check outcome (pass/fail flag plus the reason for it).
using Payload = std::variant<std::span<std::byte, ALLOCATION_SIZE>, std::pair<bool, Reason>>;
| 75 | + |
| 76 | +template <typename TAccTag> struct SimpleSumLogger { |
| 77 | + using Clock = DeviceClock<TAccTag>; |
| 78 | + |
| 79 | + DeviceClock<TAccTag>::DurationType mallocDuration; |
| 80 | + std::uint32_t mallocCounter{0U}; |
| 81 | + |
| 82 | + DeviceClock<TAccTag>::DurationType freeDuration; |
| 83 | + std::uint32_t freeCounter{0U}; |
| 84 | + |
| 85 | + std::uint32_t nullpointersObtained{0U}; |
| 86 | + std::uint32_t failedChecksCounter{0U}; |
| 87 | + std::uint32_t invalidCheckResults{0U}; |
| 88 | + |
| 89 | + template <typename TAcc> ALPAKA_FN_INLINE ALPAKA_FN_ACC auto call(TAcc const& acc, auto func) { |
| 90 | + static_assert( |
| 91 | + std::is_same_v<alpaka::TagToAcc<TAccTag, alpaka::Dim<Acc>, alpaka::Idx<Acc>>, TAcc>); |
| 92 | + auto start = Clock::clock(); |
| 93 | + auto result = func(acc); |
| 94 | + auto end = Clock::clock(); |
| 95 | + |
| 96 | + if (std::get<0>(result) == Actions::MALLOC) { |
| 97 | + mallocDuration += Clock::duration(start, end); |
| 98 | + mallocCounter++; |
| 99 | + } |
| 100 | + |
| 101 | + if (std::get<0>(result) == Actions::FREE) { |
| 102 | + freeDuration += Clock::duration(start, end); |
| 103 | + freeCounter++; |
| 104 | + } |
| 105 | + |
| 106 | + if (std::get<0>(result) == Actions::CHECK) { |
| 107 | + if (std::holds_alternative<std::pair<bool, Reason>>(std::get<1>(result))) { |
| 108 | + auto [passed, reason] = std::get<std::pair<bool, Reason>>(std::get<1>(result)); |
| 109 | + if (not passed) { |
| 110 | + if (reason == Reason::nullpointer) { |
| 111 | + nullpointersObtained++; |
| 112 | + } |
| 113 | + if (reason == Reason::completed) { |
| 114 | + failedChecksCounter++; |
| 115 | + } |
| 116 | + } |
| 117 | + } else { |
| 118 | + invalidCheckResults++; |
| 119 | + } |
| 120 | + } |
| 121 | + |
| 122 | + return result; |
| 123 | + } |
| 124 | + |
| 125 | + ALPAKA_FN_ACC void accumulate(const auto& acc, const SimpleSumLogger& other) { |
| 126 | + alpaka::atomicAdd(acc, &mallocDuration, other.mallocDuration); |
| 127 | + alpaka::atomicAdd(acc, &mallocCounter, other.mallocCounter); |
| 128 | + alpaka::atomicAdd(acc, &freeDuration, other.freeDuration); |
| 129 | + alpaka::atomicAdd(acc, &freeCounter, other.freeCounter); |
| 130 | + alpaka::atomicAdd(acc, &nullpointersObtained, other.nullpointersObtained); |
| 131 | + alpaka::atomicAdd(acc, &failedChecksCounter, other.failedChecksCounter); |
| 132 | + alpaka::atomicAdd(acc, &invalidCheckResults, other.invalidCheckResults); |
| 133 | + } |
| 134 | + |
| 135 | + nlohmann::json generateReport() { |
| 136 | +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED |
| 137 | + cudaDeviceProp prop; |
| 138 | + cudaGetDeviceProperties(&prop, 0); |
| 139 | + auto clockRate = prop.clockRate; |
| 140 | +#else |
| 141 | + auto clockRate = 1; |
| 142 | +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED |
| 143 | + return { |
| 144 | +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED |
| 145 | + {"clock rate [1/ms]", clockRate}, |
| 146 | +#endif |
| 147 | + {"allocation total time [ms]", mallocDuration / clockRate}, |
| 148 | + {"allocation average time [ms]", |
| 149 | + mallocDuration / clockRate / (mallocCounter > 0 ? mallocCounter : 1U)}, |
| 150 | + {"allocation count", mallocCounter}, |
| 151 | + {"deallocation total time [ms]", freeDuration / clockRate}, |
| 152 | + {"deallocation average time [ms]", |
| 153 | + freeDuration / clockRate / (freeCounter > 0 ? freeCounter : 1U)}, |
| 154 | + {"deallocation count ", freeCounter}, |
| 155 | + {"failed checks count", failedChecksCounter}, |
| 156 | + {"nullpointers count", nullpointersObtained}, |
| 157 | + {"invalid check results count", invalidCheckResults}, |
| 158 | + }; |
| 159 | + } |
| 160 | +}; |
| 161 | + |
| 162 | +template <template <typename, size_t> typename T, typename TType, size_t TExtent> struct IsSpan { |
| 163 | + static constexpr bool value = std::is_same_v<T<TType, TExtent>, std::span<TType, TExtent>>; |
| 164 | +}; |
| 165 | + |
| 166 | +template <template <typename, size_t> typename T, typename TType, size_t TExtent> |
| 167 | +constexpr auto isSpan(T<TType, TExtent>) { |
| 168 | + return IsSpan<T, TType, TExtent>{}; |
| 169 | +} |
| 170 | + |
| 171 | +template <typename TNew, typename TOld, std::size_t TExtent> |
| 172 | +constexpr auto convertDataType(std::span<TOld, TExtent>& range) { |
| 173 | + return std::span<TNew, TExtent * sizeof(TOld) / sizeof(TNew)>( |
| 174 | + reinterpret_cast<TNew*>(range.data()), range.size()); |
| 175 | +} |
| 176 | + |
// Verifies freshly allocated memory by writing an iota sequence into it and
// comparing std::reduce over the data against the sequence's closed-form sum.
// `currentValue` doubles as a rolling xor fingerprint across invocations.
struct IotaReductionChecker {
  uint32_t currentValue{};

  // Returns a CHECK action whose payload is (passed, reason).
  ALPAKA_FN_ACC auto check([[maybe_unused]] const auto& acc, const auto& result) {
    // Only freshly malloc'ed ranges can be checked; everything else is skipped.
    if (std::get<0>(result) != Actions::MALLOC) {
      return std::make_tuple(Actions::CHECK, Payload(std::make_pair(true, Reason::notApplicable)));
    }
    auto range = std::get<0>(std::get<1>(result));
    if (range.data() == nullptr) {
      return std::make_tuple(Actions::CHECK, Payload(std::make_pair(false, Reason::nullpointer)));
    }
    // Reinterpret the byte range as uint32_t and fill it with consecutive values.
    auto uintRange = convertDataType<uint32_t>(range);
    std::iota(std::begin(uintRange), std::end(uintRange), currentValue);
    size_t n = uintRange.size();
    // The exact formula is using size_t because n is size_t. Casting it down will oftentimes run
    // into an overflow that the reduction encounters, too.
    auto expected = static_cast<uint32_t>(n * currentValue + n * (n - 1) / 2) ^ currentValue;
    currentValue ^= std::reduce(std::cbegin(uintRange), std::cend(uintRange));
    // After the xor-update, `expected == currentValue` iff the reduction
    // reproduced the analytic sum n*start + n*(n-1)/2 (mod 2^32).
    return std::make_tuple(+Actions::CHECK,
                           Payload(std::make_pair(expected == currentValue, Reason::completed)));
  }

  // Merge per-thread fingerprints into this one (xor is order-independent).
  ALPAKA_FN_ACC auto accumulate(const auto& acc, const auto& other) {
    alpaka::atomicXor(acc, &currentValue, other.currentValue);
  }

  nlohmann::json generateReport() { return {{"final value", currentValue}}; }
};
| 205 | + |
// Provider that hands out default-constructed instances and discards whatever
// is stored back; used when per-thread state need not survive the kernel.
template <typename T> struct NoStoreProvider {
  ALPAKA_FN_ACC T load(auto const) { return {}; }
  ALPAKA_FN_ACC void store(auto const&, T&&, auto const) {}
  nlohmann::json generateReport() { return {}; }
};
| 211 | + |
// Provider that folds every stored instance into a single `result` via the
// instance's own (atomic) `accumulate` and reports from that aggregate.
template <typename T> struct AccumulateResultsProvider {
  T result{};
  // Each thread starts from a default-constructed T.
  ALPAKA_FN_ACC T load(auto const) { return {}; }
  ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
    result.accumulate(acc, instance);
  }
  nlohmann::json generateReport() { return result.generateReport(); }
};
| 220 | + |
// Like AccumulateResultsProvider, but seeds each thread's instance with its
// thread index so per-thread check streams differ.
// NOTE(review): "Acumulate" is a typo, kept because the name is referenced
// elsewhere in this file.
template <typename T> struct AcumulateChecksProvider {
  T result{};
  // Brace-initialise T from the thread index (e.g. a checker's start value).
  ALPAKA_FN_ACC T load(auto const threadIndex) { return {threadIndex}; }
  ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
    result.accumulate(acc, instance);
  }
  nlohmann::json generateReport() { return result.generateReport(); }
};
| 229 | + |
| 230 | +namespace setups { |
  // Recipe that issues `numAllocations` device-side mallocs of `allocationSize`
  // bytes each and then asks the benchmark to stop.
  struct SingleSizeMallocRecipe {
    static constexpr std::uint32_t allocationSize{ALLOCATION_SIZE};
    static constexpr std::uint32_t numAllocations{256U};
    // Obtained pointers, kept so the allocations stay reachable during the run.
    // NOTE(review): they are never freed here — confirm cleanup happens elsewhere.
    std::array<std::byte*, numAllocations> pointers{{}};
    std::uint32_t counter{0U};

    // Perform the next step: another MALLOC (payload = the new range, which may
    // wrap a nullptr if malloc failed) or STOP with a null range once done.
    ALPAKA_FN_ACC auto next([[maybe_unused]] const auto& acc) {
      if (counter >= numAllocations)
        return std::make_tuple(+kitgenbench::Actions::STOP,
                               Payload(std::span<std::byte, allocationSize>{
                                   static_cast<std::byte*>(nullptr), allocationSize}));
      pointers[counter] = static_cast<std::byte*>(malloc(allocationSize));
      auto result = std::make_tuple(
          +kitgenbench::Actions::MALLOC,
          Payload(std::span<std::byte, allocationSize>(pointers[counter], allocationSize)));
      counter++;
      return result;
    }

    nlohmann::json generateReport() { return {}; }
  };
| 252 | + |
  // Bundles the per-run device data (recipe, logger and checker providers)
  // with the device buffer it is mirrored into, plus transfer helpers.
  template <typename TAcc, typename TDev> struct InstructionDetails {
    struct DevicePackage {
      NoStoreProvider<SingleSizeMallocRecipe> recipes{};
      AccumulateResultsProvider<SimpleSumLogger<AccTag>> loggers{};
      AcumulateChecksProvider<IotaReductionChecker> checkers{};
    };

    // Host-side copy of the package; filled by retrieveFrom().
    DevicePackage hostData{};
    alpaka::Buf<TDev, DevicePackage, alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> devicePackageBuffer;

    // Allocates a one-element device buffer for the package on `device`.
    InstructionDetails(TDev const& device)
        : devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U)) {};

    // Zero the device-side package and return a raw pointer for the kernel.
    auto sendTo([[maybe_unused]] TDev const& device, auto& queue) {
      alpaka::memset(queue, devicePackageBuffer, 0U);
      return reinterpret_cast<DevicePackage*>(alpaka::getPtrNative(devicePackageBuffer));
    }
    // Copy the device package back into hostData (enqueued on `queue`).
    auto retrieveFrom([[maybe_unused]] TDev const& device, auto& queue) {
      auto const platformHost = alpaka::PlatformCpu{};
      auto const devHost = getDevByIdx(platformHost, 0);
      auto view = alpaka::createView(devHost, &hostData, 1U);
      alpaka::memcpy(queue, view, devicePackageBuffer);
    }

    // Combine the three providers' sub-reports into one json object.
    nlohmann::json generateReport() {
      return {{"recipes", hostData.recipes.generateReport()},
              {"logs", hostData.loggers.generateReport()},
              {"checks", hostData.checkers.generateReport()}};
    }
  };
| 283 | + |
| 284 | + template <typename TAcc, typename TDev> auto makeInstructionDetails(TDev const& device) { |
| 285 | + return InstructionDetails<TAcc, TDev>(device); |
| 286 | + } |
| 287 | + |
| 288 | + auto composeSetup() { |
| 289 | + auto execution = makeExecutionDetails(); |
| 290 | + return setup::composeSetup("Non trivial", execution, |
| 291 | + makeInstructionDetails<Acc>(execution.device), {}); |
| 292 | + } |
| 293 | +} // namespace setups |
| 294 | + |
/**
 * @brief Compose a report from the provided metadata and benchmark reports.
 *
 * This function takes a json object representing the metadata and a json
 * object representing the benchmark reports, and merges them into a single
 * json object under the keys "metadata" and "benchmarks".
 *
 * @param metadata The json object representing the metadata.
 * @param benchmarkReports The json object holding the individual benchmark reports.
 * @return json The composed report.
 */
| 308 | +json composeReport(json const& metadata, json const& benchmarkReports) { |
| 309 | + json report{}; |
| 310 | + report["metadata"] = metadata; |
| 311 | + report["benchmarks"] = benchmarkReports; |
| 312 | + return report; |
| 313 | +} |
| 314 | + |
| 315 | +void output(json const& report) { std::cout << report << std::endl; } |
| 316 | + |
| 317 | +auto main() -> int { |
| 318 | + auto metadata = gatherMetadata(); |
| 319 | + auto setup = setups::composeSetup(); |
| 320 | + auto benchmarkReports = runBenchmarks(setup); |
| 321 | + auto report = composeReport(metadata, benchmarkReports); |
| 322 | + output(report); |
| 323 | + return EXIT_SUCCESS; |
| 324 | +} |
0 commit comments