Skip to content

Commit a644b5a

Browse files
committed
Add mallocMC
1 parent 6cbb7a2 commit a644b5a

File tree

3 files changed

+391
-0
lines changed

3 files changed

+391
-0
lines changed

examples/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,8 @@ add_subdirectory(
66
${CMAKE_CURRENT_LIST_DIR}/plain-malloc
77
${CMAKE_BINARY_DIR}/examples/plain-malloc
88
)
9+
10+
# Build the mallocMC-based example into its own directory of the build tree.
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/simple-mallocMC ${CMAKE_BINARY_DIR}/examples/simple-mallocMC)
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
cmake_minimum_required(VERSION 3.14...3.22)

# Opt into the NEW behaviour of policy CMP0167 whenever the running CMake knows it.
if(POLICY CMP0167)
  cmake_policy(SET CMP0167 NEW)
endif()

project(KitGenBenchExampleSimpleMallocMC LANGUAGES CXX)

# ---- Import tools ----

include(../../cmake/tools.cmake)

# ---- Dependencies ----

include(../../cmake/CPM.cmake)

# JSON library used for the benchmark reports.
CPMAddPackage(NAME nlohmann_json GITHUB_REPOSITORY nlohmann/json VERSION 3.11.3 NO_TESTS)

# alpaka accelerator abstraction, pinned to tag 1.2.0.
CPMAddPackage(NAME alpaka GITHUB_REPOSITORY alpaka-group/alpaka GIT_TAG 1.2.0)
27+
28+
# mallocMC does not have a modern cmake yet.
# We use the approach from here:
# https://github.com/cpm-cmake/CPM.cmake?tab=readme-ov-file#lua
#
# The source location used to be hard-coded to a path on one developer's machine.
# It is now a cache variable with the same default, so other machines can configure
# with -DKitGenBench_MALLOCMC_URL=<url>.
set(KitGenBench_MALLOCMC_URL
    "file:///home/lenz/workspace/src/mallocMC-modern"
    CACHE STRING "URL of the mallocMC sources (must provide a CMake build exporting mallocMC::mallocMC)"
)

# NOTE(review): GIT_TAG is kept from the original call, but combining it with URL
# looks unintended -- confirm which of the two download methods is meant.
cpmaddpackage(
  NAME mallocMC
  URL "${KitGenBench_MALLOCMC_URL}"
  GIT_TAG main
)
36+
37+
# The benchmark library itself lives two directories above this example.
CPMAddPackage(
  NAME KitGenBench
  SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..
)

# ---- Create standalone executable ----

file(GLOB sources CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cpp)

alpaka_add_executable(${PROJECT_NAME} ${sources})

# Strict C++20 without compiler extensions; the binary is named after the project.
set_target_properties(
  ${PROJECT_NAME}
  PROPERTIES
    OUTPUT_NAME ${PROJECT_NAME}
    CXX_STANDARD 20
    CXX_STANDARD_REQUIRED ON
    CXX_EXTENSIONS OFF
)

target_link_libraries(
  ${PROJECT_NAME}
  KitGenBench::KitGenBench
  nlohmann_json::nlohmann_json
  alpaka::alpaka
  mallocMC::mallocMC
)
Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
#include <kitgenbench/DeviceClock.h>
#include <kitgenbench/kitgenbench.h>
#include <kitgenbench/setup.h>
#include <kitgenbench/version.h>

#include <alpaka/workdiv/WorkDivMembers.hpp>
#include <alpaka/acc/AccCpuSerial.hpp>
#include <alpaka/acc/Tag.hpp>
#include <alpaka/atomic/Traits.hpp>
#include <alpaka/core/Common.hpp>
#include <alpaka/mem/buf/Traits.hpp>
#include <alpaka/mem/view/Traits.hpp>
#include <mallocMC/mallocMC.hpp>

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <numeric>
#include <span>
#include <tuple>
#include <type_traits>
#include <utility>
#include <variant>
#include <vector>

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
#  include <cuda_runtime.h>
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#include "nlohmann/json_fwd.hpp"
#include "nlohmann/json.hpp"
28+
29+
using nlohmann::json;
30+
using namespace kitgenbench;
31+
32+
using Dim = alpaka::DimInt<1>;
33+
using Idx = std::uint32_t;
34+
using AccTag = std::remove_cvref_t<decltype(std::get<0>(alpaka::EnabledAccTags{}))>;
35+
using Acc = alpaka::TagToAcc<AccTag, Dim, Idx>;
36+
37+
using namespace mallocMC;
38+
using MyAllocator
39+
= mallocMC::Allocator<Acc, CreationPolicies::FlatterScatter<>, DistributionPolicies::Noop,
40+
OOMPolicies::ReturnNull, ReservePoolPolicies::AlpakaBuf<Acc>,
41+
AlignmentPolicies::Shrink<>>;
42+
43+
namespace kitgenbench::Actions {
  // Action identifiers this example adds to the kitgenbench::Actions namespace.
  // They are the first tuple element of the results that recipes produce and
  // that loggers and checkers compare against.
  [[maybe_unused]] constexpr static int MALLOC{1};
  [[maybe_unused]] constexpr static int FREE{2};
}  // namespace kitgenbench::Actions
47+
48+
auto makeExecutionDetails() {
49+
auto const platformAcc = alpaka::Platform<Acc>{};
50+
auto const dev = alpaka::getDevByIdx(platformAcc, 0);
51+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
52+
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024U * 1024U * 1024U);
53+
#endif
54+
uint32_t const numThreadsPerBlock = 256U;
55+
uint32_t const numThreads = 4U * numThreadsPerBlock;
56+
auto workdiv = [numThreads, numThreadsPerBlock]() -> alpaka::WorkDivMembers<Dim, Idx> {
57+
if constexpr (std::is_same_v<alpaka::AccToTag<Acc>, alpaka::TagCpuSerial>) {
58+
return {{1U}, {1U}, {numThreads}};
59+
} else {
60+
return alpaka::WorkDivMembers<Dim, Idx>{
61+
{numThreads / numThreadsPerBlock}, {numThreadsPerBlock}, {1U}};
62+
}
63+
}();
64+
auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
65+
auto alloc = MyAllocator(dev, queue);
66+
return kitgenbench::ExecutionDetails<Acc, decltype(dev)>{workdiv, dev};
67+
}
68+
69+
static constexpr std::uint32_t ALLOCATION_SIZE = 16U;
70+
71+
// Reasons for the check to yield the result it yielded.
72+
// `completed` means that the check completed. The result can still be true/false depending on
73+
// whether the obtained value was actually correct. `notApplicable` means that the checks were
74+
// skipped. `nullpointer` means that a nullpointer was given, so the checks couldn't run at all.
75+
enum class Reason { completed, notApplicable, nullpointer };
76+
using Payload = std::variant<std::span<std::byte, ALLOCATION_SIZE>, std::pair<bool, Reason>>;
77+
78+
template <typename TAccTag> struct SimpleSumLogger {
79+
using Clock = DeviceClock<TAccTag>;
80+
81+
DeviceClock<TAccTag>::DurationType mallocDuration;
82+
std::uint32_t mallocCounter{0U};
83+
84+
DeviceClock<TAccTag>::DurationType freeDuration;
85+
std::uint32_t freeCounter{0U};
86+
87+
std::uint32_t nullpointersObtained{0U};
88+
std::uint32_t failedChecksCounter{0U};
89+
std::uint32_t invalidCheckResults{0U};
90+
91+
template <typename TAcc> ALPAKA_FN_INLINE ALPAKA_FN_ACC auto call(TAcc const& acc, auto func) {
92+
static_assert(
93+
std::is_same_v<alpaka::TagToAcc<TAccTag, alpaka::Dim<Acc>, alpaka::Idx<Acc>>, TAcc>);
94+
auto start = Clock::clock();
95+
auto result = func(acc);
96+
auto end = Clock::clock();
97+
98+
if (std::get<0>(result) == Actions::MALLOC) {
99+
mallocDuration += Clock::duration(start, end);
100+
mallocCounter++;
101+
}
102+
103+
if (std::get<0>(result) == Actions::FREE) {
104+
freeDuration += Clock::duration(start, end);
105+
freeCounter++;
106+
}
107+
108+
if (std::get<0>(result) == Actions::CHECK) {
109+
if (std::holds_alternative<std::pair<bool, Reason>>(std::get<1>(result))) {
110+
auto [passed, reason] = std::get<std::pair<bool, Reason>>(std::get<1>(result));
111+
if (not passed) {
112+
if (reason == Reason::nullpointer) {
113+
nullpointersObtained++;
114+
}
115+
if (reason == Reason::completed) {
116+
failedChecksCounter++;
117+
}
118+
}
119+
} else {
120+
invalidCheckResults++;
121+
}
122+
}
123+
124+
return result;
125+
}
126+
127+
ALPAKA_FN_ACC void accumulate(const auto& acc, const SimpleSumLogger& other) {
128+
alpaka::atomicAdd(acc, &mallocDuration, other.mallocDuration);
129+
alpaka::atomicAdd(acc, &mallocCounter, other.mallocCounter);
130+
alpaka::atomicAdd(acc, &freeDuration, other.freeDuration);
131+
alpaka::atomicAdd(acc, &freeCounter, other.freeCounter);
132+
alpaka::atomicAdd(acc, &nullpointersObtained, other.nullpointersObtained);
133+
alpaka::atomicAdd(acc, &failedChecksCounter, other.failedChecksCounter);
134+
alpaka::atomicAdd(acc, &invalidCheckResults, other.invalidCheckResults);
135+
}
136+
137+
nlohmann::json generateReport() {
138+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
139+
cudaDeviceProp prop;
140+
cudaGetDeviceProperties(&prop, 0);
141+
auto clockRate = prop.clockRate;
142+
#else
143+
auto clockRate = 1;
144+
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
145+
return {
146+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
147+
{"clock rate [1/ms]", clockRate},
148+
#endif
149+
{"allocation total time [ms]", mallocDuration / clockRate},
150+
{"allocation average time [ms]",
151+
mallocDuration / clockRate / (mallocCounter > 0 ? mallocCounter : 1U)},
152+
{"allocation count", mallocCounter},
153+
{"deallocation total time [ms]", freeDuration / clockRate},
154+
{"deallocation average time [ms]",
155+
freeDuration / clockRate / (freeCounter > 0 ? freeCounter : 1U)},
156+
{"deallocation count ", freeCounter},
157+
{"failed checks count", failedChecksCounter},
158+
{"nullpointers count", nullpointersObtained},
159+
{"invalid check results count", invalidCheckResults},
160+
};
161+
}
162+
};
163+
164+
template <template <typename, size_t> typename T, typename TType, size_t TExtent> struct IsSpan {
165+
static constexpr bool value = std::is_same_v<T<TType, TExtent>, std::span<TType, TExtent>>;
166+
};
167+
168+
template <template <typename, size_t> typename T, typename TType, size_t TExtent>
169+
constexpr auto isSpan(T<TType, TExtent>) {
170+
return IsSpan<T, TType, TExtent>{};
171+
}
172+
173+
template <typename TNew, typename TOld, std::size_t TExtent>
174+
constexpr auto convertDataType(std::span<TOld, TExtent>& range) {
175+
return std::span<TNew, TExtent * sizeof(TOld) / sizeof(TNew)>(
176+
reinterpret_cast<TNew*>(range.data()), range.size());
177+
}
178+
179+
struct IotaReductionChecker {
180+
uint32_t currentValue{};
181+
182+
ALPAKA_FN_ACC auto check([[maybe_unused]] const auto& acc, const auto& result) {
183+
if (std::get<0>(result) != Actions::MALLOC) {
184+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(true, Reason::notApplicable)));
185+
}
186+
auto range = std::get<0>(std::get<1>(result));
187+
if (range.data() == nullptr) {
188+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(false, Reason::nullpointer)));
189+
}
190+
auto uintRange = convertDataType<uint32_t>(range);
191+
std::iota(std::begin(uintRange), std::end(uintRange), currentValue);
192+
size_t n = uintRange.size();
193+
// The exact formula is using size_t because n is size_t. Casting it down will oftentimes run
194+
// into an overflow that the reduction encounters, too.
195+
auto expected = static_cast<uint32_t>(n * currentValue + n * (n - 1) / 2) ^ currentValue;
196+
currentValue ^= std::reduce(std::cbegin(uintRange), std::cend(uintRange));
197+
return std::make_tuple(+Actions::CHECK,
198+
Payload(std::make_pair(expected == currentValue, Reason::completed)));
199+
}
200+
201+
ALPAKA_FN_ACC auto accumulate(const auto& acc, const auto& other) {
202+
alpaka::atomicXor(acc, &currentValue, other.currentValue);
203+
}
204+
205+
nlohmann::json generateReport() { return {{"final value", currentValue}}; }
206+
};
207+
208+
template <typename T> struct NoStoreProvider {
209+
ALPAKA_FN_ACC T load(auto const) { return {}; }
210+
ALPAKA_FN_ACC void store(auto const&, T&&, auto const) {}
211+
nlohmann::json generateReport() { return {}; }
212+
};
213+
214+
template <typename T> struct AccumulateResultsProvider {
215+
T result{};
216+
ALPAKA_FN_ACC T load(auto const) { return {}; }
217+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
218+
result.accumulate(acc, instance);
219+
}
220+
nlohmann::json generateReport() { return result.generateReport(); }
221+
};
222+
223+
template <typename T> struct AcumulateChecksProvider {
224+
T result{};
225+
ALPAKA_FN_ACC T load(auto const threadIndex) { return {threadIndex}; }
226+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
227+
result.accumulate(acc, instance);
228+
}
229+
nlohmann::json generateReport() { return result.generateReport(); }
230+
};
231+
232+
namespace setups {
233+
struct SingleSizeMallocRecipe {
234+
static constexpr std::uint32_t allocationSize{ALLOCATION_SIZE};
235+
static constexpr std::uint32_t numAllocations{256U};
236+
std::array<std::byte*, numAllocations> pointers{{}};
237+
std::uint32_t counter{0U};
238+
239+
ALPAKA_FN_ACC auto next([[maybe_unused]] const auto& acc) {
240+
if (counter >= numAllocations)
241+
return std::make_tuple(+kitgenbench::Actions::STOP,
242+
Payload(std::span<std::byte, allocationSize>{
243+
static_cast<std::byte*>(nullptr), allocationSize}));
244+
pointers[counter] = static_cast<std::byte*>(malloc(allocationSize));
245+
auto result = std::make_tuple(
246+
+kitgenbench::Actions::MALLOC,
247+
Payload(std::span<std::byte, allocationSize>(pointers[counter], allocationSize)));
248+
counter++;
249+
return result;
250+
}
251+
252+
nlohmann::json generateReport() { return {}; }
253+
};
254+
255+
template <typename TAcc, typename TDev> struct InstructionDetails {
256+
struct DevicePackage {
257+
NoStoreProvider<SingleSizeMallocRecipe> recipes{};
258+
AccumulateResultsProvider<SimpleSumLogger<AccTag>> loggers{};
259+
AcumulateChecksProvider<IotaReductionChecker> checkers{};
260+
};
261+
262+
DevicePackage hostData{};
263+
alpaka::Buf<TDev, DevicePackage, alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> devicePackageBuffer;
264+
265+
InstructionDetails(TDev const& device)
266+
: devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U)) {};
267+
268+
auto sendTo([[maybe_unused]] TDev const& device, auto& queue) {
269+
alpaka::memset(queue, devicePackageBuffer, 0U);
270+
return reinterpret_cast<DevicePackage*>(alpaka::getPtrNative(devicePackageBuffer));
271+
}
272+
auto retrieveFrom([[maybe_unused]] TDev const& device, auto& queue) {
273+
auto const platformHost = alpaka::PlatformCpu{};
274+
auto const devHost = getDevByIdx(platformHost, 0);
275+
auto view = alpaka::createView(devHost, &hostData, 1U);
276+
alpaka::memcpy(queue, view, devicePackageBuffer);
277+
}
278+
279+
nlohmann::json generateReport() {
280+
return {{"recipes", hostData.recipes.generateReport()},
281+
{"logs", hostData.loggers.generateReport()},
282+
{"checks", hostData.checkers.generateReport()}};
283+
}
284+
};
285+
286+
template <typename TAcc, typename TDev> auto makeInstructionDetails(TDev const& device) {
287+
return InstructionDetails<TAcc, TDev>(device);
288+
}
289+
290+
auto composeSetup() {
291+
auto execution = makeExecutionDetails();
292+
return setup::composeSetup("Non trivial", execution,
293+
makeInstructionDetails<Acc>(execution.device), {});
294+
}
295+
} // namespace setups
296+
297+
/**
298+
* @brief Compose a report from the provided metadata, configuration, and individual reports.
299+
*
300+
* This function takes a json object representing the metadata, a json object
301+
* representing the configuration, and a json object representing the individual
302+
* reports, and composes a report by merging them into a single json object.
303+
* The resulting json object is returned.
304+
*
305+
* @param metadata The json object representing the metadata.
306+
* @param config The json object representing the configuration.
307+
* @param individualReports The json object representing the individual reports.
308+
* @return json The json object representing the composed report.
309+
*/
310+
json composeReport(json const& metadata, json const& benchmarkReports) {
311+
json report{};
312+
report["metadata"] = metadata;
313+
report["benchmarks"] = benchmarkReports;
314+
return report;
315+
}
316+
317+
// Print the report to standard output, followed by a newline, and flush.
void output(json const& report) {
  std::cout << report << '\n' << std::flush;
}
318+
319+
auto main() -> int {
320+
auto metadata = gatherMetadata();
321+
auto setup = setups::composeSetup();
322+
auto benchmarkReports = runBenchmarks(setup);
323+
auto report = composeReport(metadata, benchmarkReports);
324+
output(report);
325+
return EXIT_SUCCESS;
326+
}

0 commit comments

Comments
 (0)