forked from alpaka-group/alpaka3
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathactivemask.cpp
More file actions
100 lines (81 loc) · 3.76 KB
/
activemask.cpp
File metadata and controls
100 lines (81 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/* Copyright 2025 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego, Mehmet
* Yusufoglu, René Widera SPDX-License-Identifier: MPL-2.0
*/
/** @file
 * Tests that the warp activemask helper reports exactly the lanes participating in execution.
 * Warp operations are SIMT collectives that act on the threads executing in lockstep on a GPU warp.
 */
#include "utils.hpp"
#include <alpaka/onAcc/warp.hpp>
#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <cstdint>
using namespace alpaka;
using alpaka::test::warp::fullMask;
using alpaka::test::warp::singleBit;
using alpaka::test::warp::warpCheck;
using alpaka::test::warp::WarpTestBackends;
namespace
{
/** Kernel in which one lane returns early and all other lanes compare the active mask against the expectation.
 *
 * Every check is forwarded to warpCheck together with the `success` view.
 */
struct ActivemaskMultiThreadKernel
{
    /** Runs the sanity checks on the warp size and then validates the active mask.
     *
     * @param acc accelerator handle used for the warp and thread-layer queries
     * @param success view handed to warpCheck alongside each checked condition
     * @param inactiveLane index of the lane that returns before the mask is queried
     */
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(
        TAcc const& acc,
        concepts::IMdSpan<bool> auto success,
        std::uint32_t inactiveLane) const
    {
        // Test that the warp size can be used in a constant expression.
        constexpr std::uint32_t warpExtent = onAcc::warp::getSize<ALPAKA_TYPEOF(acc)>();
        /* A static_assert cannot be used for this check: the compiler also evaluates the warp size while parsing
         * the host code, which would result in false negatives. */
        warpCheck(success, warpExtent >= 1u);

        /* The host launches a frame specification whose frame extent is a multiple of the warp size.
         * alpaka should not reduce the number of threads to a value smaller than the warp size unless the user
         * asks for it.
         */
        auto const threadsPerBlock = static_cast<std::uint32_t>(acc[alpaka::layer::thread].count().product());
        // The number of threads should be a multiple of the warp size.
        warpCheck(success, threadsPerBlock % warpExtent == 0);

        auto const lane = onAcc::warp::getLaneIdx(acc);
        if(lane == inactiveLane)
        {
            // Early exit: this lane leaves the kernel without ever querying the mask.
            return;
        }

        // All remaining lanes expect every bit of the warp to be set except the one of the lane that returned.
        auto const mask = onAcc::warp::activemask(acc);
        auto const expected = fullMask(warpExtent) & ~singleBit(inactiveLane);
        warpCheck(success, mask == expected);
    }
};
} // namespace
/** Launches ActivemaskMultiThreadKernel once for every lane index of the device's warp size and checks the result
 * flag copied back from the device after each launch.
 */
TEMPLATE_LIST_TEST_CASE("warp activemask reflects participating lanes", "[warp][activemask]", WarpTestBackends)
{
    auto cfg = TestType::makeDict();
    auto deviceSpec = cfg[object::deviceSpec];
    auto exec = cfg[object::exec];

    auto selector = onHost::makeDeviceSelector(deviceSpec);
    if(!selector.isAvailable())
    {
        // INFO is only reported together with a later assertion in the same scope, so it would be lost on this
        // early return. WARN is always reported and does not fail the test.
        WARN("No device available for " << deviceSpec.getName());
        return;
    }

    auto deviceProperties = selector.getDeviceProperties(0);
    auto const warpExtent = deviceProperties.warpSize;

    auto device = selector.makeDevice(0);
    auto queue = device.makeQueue(queueKind::blocking);

    // Single-element result flag on the host and a matching buffer on the device.
    auto successHost = onHost::allocHost<bool>(1u);
    auto successDev = onHost::allocLike(device, successHost);

    // Several blocks, each with a thread extent that is a multiple of the warp size.
    auto const blocks = Vec<std::uint32_t, 1u>{5u};
    auto const threads = Vec<std::uint32_t, 1u>{4u * warpExtent};
    auto const frame = onHost::FrameSpec{blocks, threads};

    for(std::uint32_t inactiveLane = 0u; inactiveLane < warpExtent; ++inactiveLane)
    {
        // Sweep every lane once to confirm the mask drops exactly that participant.
        // The flag is reset to true before each launch so a failure from a previous lane cannot leak over.
        onHost::memset(queue, successDev, static_cast<std::uint8_t>(true));
        queue.enqueue(exec, frame, KernelBundle{ActivemaskMultiThreadKernel{}, successDev, inactiveLane});
        onHost::memcpy(queue, successHost, successDev);
        onHost::wait(queue);

        INFO("backend=" << deviceSpec.getName() << " inactiveLane=" << inactiveLane);
        CHECK(successHost[0]);
    }
}