Skip to content

Commit b09ce70

Browse files
Merge pull request #2 from chillenzer/add-mallocMC-example
Add malloc mc example
2 parents 6cbb7a2 + 3b58ac8 commit b09ce70

File tree

3 files changed

+386
-0
lines changed

3 files changed

+386
-0
lines changed

examples/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,8 @@ add_subdirectory(
66
${CMAKE_CURRENT_LIST_DIR}/plain-malloc
77
${CMAKE_BINARY_DIR}/examples/plain-malloc
88
)
9+
10+
# Add the simple-mallocMC example; its build tree is placed under examples/ in the binary dir.
add_subdirectory(
  ${CMAKE_CURRENT_LIST_DIR}/simple-mallocMC ${CMAKE_BINARY_DIR}/examples/simple-mallocMC
)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Standalone build script for the simple-mallocMC example executable.
cmake_minimum_required(VERSION 3.14...3.22)

# Only touch the policy when the running CMake version knows about it.
if(POLICY CMP0167)
  cmake_policy(SET CMP0167 NEW)
endif()
project(KitGenBenchExampleSimpleMallocMC LANGUAGES CXX)

# ---- Import tools ----

include(../../cmake/tools.cmake)

# ---- Dependencies ----

include(../../cmake/CPM.cmake)

# NOTE(review): NO_TESTS is passed through verbatim; confirm that CPM (or the package) actually
# interprets this keyword.
cpmaddpackage(
  NAME nlohmann_json
  GITHUB_REPOSITORY nlohmann/json
  VERSION 3.11.3 NO_TESTS
)

cpmaddpackage(
  NAME alpaka
  GITHUB_REPOSITORY alpaka-group/alpaka
  GIT_TAG 1.2.0
)

# NOTE(review): `update-cmake` looks like a branch name rather than a release tag; consider
# pinning a tag or commit so the example builds reproducibly -- confirm with the fork's owner.
cpmaddpackage(
  NAME mallocMC
  GITHUB_REPOSITORY chillenzer/mallocMC
  GIT_TAG update-cmake
)

# The benchmark library itself is taken from the enclosing source tree, two directories up.
cpmaddpackage(NAME KitGenBench SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)

# ---- Create standalone executable ----

# The example consists of exactly one translation unit, so list it explicitly instead of
# globbing for a single literal file name.
set(sources ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cpp)

alpaka_add_executable(${PROJECT_NAME} ${sources})

set_target_properties(
  ${PROJECT_NAME}
  PROPERTIES
    CXX_STANDARD 20
    OUTPUT_NAME ${PROJECT_NAME}
    CXX_STANDARD_REQUIRED ON
    CXX_EXTENSIONS OFF
)

target_link_libraries(
  ${PROJECT_NAME}
  KitGenBench::KitGenBench
  nlohmann_json::nlohmann_json
  alpaka::alpaka
  mallocMC::mallocMC
)
Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
#include <kitgenbench/DeviceClock.h>
#include <kitgenbench/kitgenbench.h>
#include <kitgenbench/setup.h>
#include <kitgenbench/version.h>

#include <alpaka/workdiv/WorkDivMembers.hpp>
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <mallocMC/mallocMC.hpp>
#include <numeric>
#include <span>
#include <tuple>
#include <utility>
#include <variant>
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
#  include <cuda_runtime.h>
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#include <alpaka/acc/AccCpuSerial.hpp>
#include <alpaka/acc/Tag.hpp>
#include <alpaka/atomic/Traits.hpp>
#include <alpaka/core/Common.hpp>
#include <alpaka/mem/buf/Traits.hpp>
#include <alpaka/mem/view/Traits.hpp>
#include <cstdlib>
#include <type_traits>
#include <vector>

#include "nlohmann/json.hpp"
#include "nlohmann/json_fwd.hpp"
28+
29+
using nlohmann::json;
30+
using namespace kitgenbench;
31+
32+
using Dim = alpaka::DimInt<1>;
33+
using Idx = std::uint32_t;
34+
using AccTag = std::remove_cvref_t<decltype(std::get<0>(alpaka::EnabledAccTags{}))>;
35+
using Acc = alpaka::TagToAcc<AccTag, Dim, Idx>;
36+
37+
using namespace mallocMC;
38+
using MyAllocator
39+
= mallocMC::Allocator<Acc, CreationPolicies::FlatterScatter<>, DistributionPolicies::Noop,
40+
OOMPolicies::ReturnNull, ReservePoolPolicies::AlpakaBuf<Acc>,
41+
AlignmentPolicies::Shrink<>>;
42+
43+
// Action identifiers this example adds to the kitgenbench::Actions namespace.
// NOTE(review): Actions::CHECK and Actions::STOP are used further down but not defined here;
// presumably they come from the kitgenbench headers -- make sure the values below do not clash.
namespace kitgenbench::Actions {
  // Identifier returned by recipe steps that performed an allocation.
  [[maybe_unused]] constexpr int MALLOC{1};
  // Identifier reserved for recipe steps that perform a deallocation.
  [[maybe_unused]] constexpr int FREE{2};
}  // namespace kitgenbench::Actions
47+
48+
auto makeExecutionDetails() {
49+
auto const platformAcc = alpaka::Platform<Acc>{};
50+
auto const dev = alpaka::getDevByIdx(platformAcc, 0);
51+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
52+
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024U * 1024U * 1024U);
53+
#endif
54+
uint32_t const numThreadsPerBlock = 256U;
55+
uint32_t const numThreads = 4U * numThreadsPerBlock;
56+
auto workdiv = [numThreads, numThreadsPerBlock]() -> alpaka::WorkDivMembers<Dim, Idx> {
57+
if constexpr (std::is_same_v<alpaka::AccToTag<Acc>, alpaka::TagCpuSerial>) {
58+
return {{1U}, {1U}, {numThreads}};
59+
} else {
60+
return alpaka::WorkDivMembers<Dim, Idx>{
61+
{numThreads / numThreadsPerBlock}, {numThreadsPerBlock}, {1U}};
62+
}
63+
}();
64+
return kitgenbench::ExecutionDetails<Acc, decltype(dev)>{workdiv, dev};
65+
}
66+
67+
static constexpr std::uint32_t ALLOCATION_SIZE = 16U;
68+
69+
// Reasons for the check to yield the result it yielded.
70+
// `completed` means that the check completed. The result can still be true/false depending on
71+
// whether the obtained value was actually correct. `notApplicable` means that the checks were
72+
// skipped. `nullpointer` means that a nullpointer was given, so the checks couldn't run at all.
73+
enum class Reason { completed, notApplicable, nullpointer };
74+
using Payload = std::variant<std::span<std::byte, ALLOCATION_SIZE>, std::pair<bool, Reason>>;
75+
76+
template <typename TAccTag> struct SimpleSumLogger {
77+
using Clock = DeviceClock<TAccTag>;
78+
79+
DeviceClock<TAccTag>::DurationType mallocDuration;
80+
std::uint32_t mallocCounter{0U};
81+
82+
DeviceClock<TAccTag>::DurationType freeDuration;
83+
std::uint32_t freeCounter{0U};
84+
85+
std::uint32_t nullpointersObtained{0U};
86+
std::uint32_t failedChecksCounter{0U};
87+
std::uint32_t invalidCheckResults{0U};
88+
89+
template <typename TAcc> ALPAKA_FN_INLINE ALPAKA_FN_ACC auto call(TAcc const& acc, auto func) {
90+
static_assert(
91+
std::is_same_v<alpaka::TagToAcc<TAccTag, alpaka::Dim<Acc>, alpaka::Idx<Acc>>, TAcc>);
92+
auto start = Clock::clock();
93+
auto result = func(acc);
94+
auto end = Clock::clock();
95+
96+
if (std::get<0>(result) == Actions::MALLOC) {
97+
mallocDuration += Clock::duration(start, end);
98+
mallocCounter++;
99+
}
100+
101+
if (std::get<0>(result) == Actions::FREE) {
102+
freeDuration += Clock::duration(start, end);
103+
freeCounter++;
104+
}
105+
106+
if (std::get<0>(result) == Actions::CHECK) {
107+
if (std::holds_alternative<std::pair<bool, Reason>>(std::get<1>(result))) {
108+
auto [passed, reason] = std::get<std::pair<bool, Reason>>(std::get<1>(result));
109+
if (not passed) {
110+
if (reason == Reason::nullpointer) {
111+
nullpointersObtained++;
112+
}
113+
if (reason == Reason::completed) {
114+
failedChecksCounter++;
115+
}
116+
}
117+
} else {
118+
invalidCheckResults++;
119+
}
120+
}
121+
122+
return result;
123+
}
124+
125+
ALPAKA_FN_ACC void accumulate(const auto& acc, const SimpleSumLogger& other) {
126+
alpaka::atomicAdd(acc, &mallocDuration, other.mallocDuration);
127+
alpaka::atomicAdd(acc, &mallocCounter, other.mallocCounter);
128+
alpaka::atomicAdd(acc, &freeDuration, other.freeDuration);
129+
alpaka::atomicAdd(acc, &freeCounter, other.freeCounter);
130+
alpaka::atomicAdd(acc, &nullpointersObtained, other.nullpointersObtained);
131+
alpaka::atomicAdd(acc, &failedChecksCounter, other.failedChecksCounter);
132+
alpaka::atomicAdd(acc, &invalidCheckResults, other.invalidCheckResults);
133+
}
134+
135+
nlohmann::json generateReport() {
136+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
137+
cudaDeviceProp prop;
138+
cudaGetDeviceProperties(&prop, 0);
139+
auto clockRate = prop.clockRate;
140+
#else
141+
auto clockRate = 1;
142+
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
143+
return {
144+
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
145+
{"clock rate [1/ms]", clockRate},
146+
#endif
147+
{"allocation total time [ms]", mallocDuration / clockRate},
148+
{"allocation average time [ms]",
149+
mallocDuration / clockRate / (mallocCounter > 0 ? mallocCounter : 1U)},
150+
{"allocation count", mallocCounter},
151+
{"deallocation total time [ms]", freeDuration / clockRate},
152+
{"deallocation average time [ms]",
153+
freeDuration / clockRate / (freeCounter > 0 ? freeCounter : 1U)},
154+
{"deallocation count ", freeCounter},
155+
{"failed checks count", failedChecksCounter},
156+
{"nullpointers count", nullpointersObtained},
157+
{"invalid check results count", invalidCheckResults},
158+
};
159+
}
160+
};
161+
162+
template <template <typename, size_t> typename T, typename TType, size_t TExtent> struct IsSpan {
163+
static constexpr bool value = std::is_same_v<T<TType, TExtent>, std::span<TType, TExtent>>;
164+
};
165+
166+
template <template <typename, size_t> typename T, typename TType, size_t TExtent>
167+
constexpr auto isSpan(T<TType, TExtent>) {
168+
return IsSpan<T, TType, TExtent>{};
169+
}
170+
171+
template <typename TNew, typename TOld, std::size_t TExtent>
172+
constexpr auto convertDataType(std::span<TOld, TExtent>& range) {
173+
return std::span<TNew, TExtent * sizeof(TOld) / sizeof(TNew)>(
174+
reinterpret_cast<TNew*>(range.data()), range.size());
175+
}
176+
177+
struct IotaReductionChecker {
178+
uint32_t currentValue{};
179+
180+
ALPAKA_FN_ACC auto check([[maybe_unused]] const auto& acc, const auto& result) {
181+
if (std::get<0>(result) != Actions::MALLOC) {
182+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(true, Reason::notApplicable)));
183+
}
184+
auto range = std::get<0>(std::get<1>(result));
185+
if (range.data() == nullptr) {
186+
return std::make_tuple(Actions::CHECK, Payload(std::make_pair(false, Reason::nullpointer)));
187+
}
188+
auto uintRange = convertDataType<uint32_t>(range);
189+
std::iota(std::begin(uintRange), std::end(uintRange), currentValue);
190+
size_t n = uintRange.size();
191+
// The exact formula is using size_t because n is size_t. Casting it down will oftentimes run
192+
// into an overflow that the reduction encounters, too.
193+
auto expected = static_cast<uint32_t>(n * currentValue + n * (n - 1) / 2) ^ currentValue;
194+
currentValue ^= std::reduce(std::cbegin(uintRange), std::cend(uintRange));
195+
return std::make_tuple(+Actions::CHECK,
196+
Payload(std::make_pair(expected == currentValue, Reason::completed)));
197+
}
198+
199+
ALPAKA_FN_ACC auto accumulate(const auto& acc, const auto& other) {
200+
alpaka::atomicXor(acc, &currentValue, other.currentValue);
201+
}
202+
203+
nlohmann::json generateReport() { return {{"final value", currentValue}}; }
204+
};
205+
206+
template <typename T> struct NoStoreProvider {
207+
ALPAKA_FN_ACC T load(auto const) { return {}; }
208+
ALPAKA_FN_ACC void store(auto const&, T&&, auto const) {}
209+
nlohmann::json generateReport() { return {}; }
210+
};
211+
212+
template <typename T> struct AccumulateResultsProvider {
213+
T result{};
214+
ALPAKA_FN_ACC T load(auto const) { return {}; }
215+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
216+
result.accumulate(acc, instance);
217+
}
218+
nlohmann::json generateReport() { return result.generateReport(); }
219+
};
220+
221+
template <typename T> struct AcumulateChecksProvider {
222+
T result{};
223+
ALPAKA_FN_ACC T load(auto const threadIndex) { return {threadIndex}; }
224+
ALPAKA_FN_ACC void store(const auto& acc, T&& instance, auto const) {
225+
result.accumulate(acc, instance);
226+
}
227+
nlohmann::json generateReport() { return result.generateReport(); }
228+
};
229+
230+
namespace setups {
231+
struct SingleSizeMallocRecipe {
232+
static constexpr std::uint32_t allocationSize{ALLOCATION_SIZE};
233+
static constexpr std::uint32_t numAllocations{256U};
234+
std::array<std::byte*, numAllocations> pointers{{}};
235+
std::uint32_t counter{0U};
236+
237+
ALPAKA_FN_ACC auto next([[maybe_unused]] const auto& acc) {
238+
if (counter >= numAllocations)
239+
return std::make_tuple(+kitgenbench::Actions::STOP,
240+
Payload(std::span<std::byte, allocationSize>{
241+
static_cast<std::byte*>(nullptr), allocationSize}));
242+
pointers[counter] = static_cast<std::byte*>(malloc(allocationSize));
243+
auto result = std::make_tuple(
244+
+kitgenbench::Actions::MALLOC,
245+
Payload(std::span<std::byte, allocationSize>(pointers[counter], allocationSize)));
246+
counter++;
247+
return result;
248+
}
249+
250+
nlohmann::json generateReport() { return {}; }
251+
};
252+
253+
template <typename TAcc, typename TDev> struct InstructionDetails {
254+
struct DevicePackage {
255+
NoStoreProvider<SingleSizeMallocRecipe> recipes{};
256+
AccumulateResultsProvider<SimpleSumLogger<AccTag>> loggers{};
257+
AcumulateChecksProvider<IotaReductionChecker> checkers{};
258+
};
259+
260+
DevicePackage hostData{};
261+
alpaka::Buf<TDev, DevicePackage, alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> devicePackageBuffer;
262+
263+
InstructionDetails(TDev const& device)
264+
: devicePackageBuffer(alpaka::allocBuf<DevicePackage, Idx>(device, 1U)) {};
265+
266+
auto sendTo([[maybe_unused]] TDev const& device, auto& queue) {
267+
alpaka::memset(queue, devicePackageBuffer, 0U);
268+
return reinterpret_cast<DevicePackage*>(alpaka::getPtrNative(devicePackageBuffer));
269+
}
270+
auto retrieveFrom([[maybe_unused]] TDev const& device, auto& queue) {
271+
auto const platformHost = alpaka::PlatformCpu{};
272+
auto const devHost = getDevByIdx(platformHost, 0);
273+
auto view = alpaka::createView(devHost, &hostData, 1U);
274+
alpaka::memcpy(queue, view, devicePackageBuffer);
275+
}
276+
277+
nlohmann::json generateReport() {
278+
return {{"recipes", hostData.recipes.generateReport()},
279+
{"logs", hostData.loggers.generateReport()},
280+
{"checks", hostData.checkers.generateReport()}};
281+
}
282+
};
283+
284+
template <typename TAcc, typename TDev> auto makeInstructionDetails(TDev const& device) {
285+
return InstructionDetails<TAcc, TDev>(device);
286+
}
287+
288+
auto composeSetup() {
289+
auto execution = makeExecutionDetails();
290+
return setup::composeSetup("Non trivial", execution,
291+
makeInstructionDetails<Acc>(execution.device), {});
292+
}
293+
} // namespace setups
294+
295+
/**
296+
* @brief Compose a report from the provided metadata, configuration, and individual reports.
297+
*
298+
* This function takes a json object representing the metadata, a json object
299+
* representing the configuration, and a json object representing the individual
300+
* reports, and composes a report by merging them into a single json object.
301+
* The resulting json object is returned.
302+
*
303+
* @param metadata The json object representing the metadata.
304+
* @param config The json object representing the configuration.
305+
* @param individualReports The json object representing the individual reports.
306+
* @return json The json object representing the composed report.
307+
*/
308+
json composeReport(json const& metadata, json const& benchmarkReports) {
309+
json report{};
310+
report["metadata"] = metadata;
311+
report["benchmarks"] = benchmarkReports;
312+
return report;
313+
}
314+
315+
// Print the report to standard output, followed by a newline and a flush.
void output(json const& report) {
  std::cout << report;
  std::cout << std::endl;
}
316+
317+
auto main() -> int {
318+
auto metadata = gatherMetadata();
319+
auto setup = setups::composeSetup();
320+
auto benchmarkReports = runBenchmarks(setup);
321+
auto report = composeReport(metadata, benchmarkReports);
322+
output(report);
323+
return EXIT_SUCCESS;
324+
}

0 commit comments

Comments
 (0)