alpaka3/test/unit/warp/activemask.cpp at ffcfc3c27f7c821655a684f024f58f47291b94a4 · psychocoderHPC/alpaka3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/* Copyright 2025 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego, Mehmet
 * Yusufoglu, René Widera SPDX-License-Identifier: MPL-2.0
 */

/** @file Tests that the warp activemask helper reports exactly the lanes participating in execution.
 * Warp operations are SIMT collectives that act on the threads executing in lockstep on a GPU warp.
 */

#include "utils.hpp"

#include <alpaka/onAcc/warp.hpp>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>

#include <cstdint>

using namespace alpaka;
using alpaka::test::warp::fullMask;
using alpaka::test::warp::singleBit;
using alpaka::test::warp::warpCheck;
using alpaka::test::warp::WarpTestBackends;

namespace
{
    struct ActivemaskMultiThreadKernel
    {
        template<typename TAcc>
        ALPAKA_FN_ACC void operator()(
            TAcc const& acc,
            concepts::IMdSpan<bool> auto success,
            std::uint32_t inactiveLane) const
        {
            // test if the warp size can be constexpr
            constexpr uint32_t warpExtent = onAcc::warp::getSize<ALPAKA_TYPEOF(acc)>();
            /* We can not use a static_assert for testing because the compiler will evaluate the warp size during the
             * host parsing to what will result in false negatives */
            warpCheck(success, warpExtent >= 1u);

            /* We start on the host side a frame specification with a frame extent of the warp size.
             * alpaka should not reduce the number of threads to a value smaller than the warp size if the user is not
             * applying for it.
             */
            auto const threadsPerBlock = static_cast<std::uint32_t>(acc[alpaka::layer::thread].count().product());
            // number of threads should be a multiple of the warp size
            warpCheck(success, threadsPerBlock % warpExtent == 0);

            auto const lane = onAcc::warp::getLaneIdx(acc);

            if(lane == inactiveLane)
            {
                // Early exit: mark this lane inactive without touching the mask.
                return;
            }

            auto const mask = onAcc::warp::activemask(acc);

            auto const expected = fullMask(warpExtent) & ~singleBit(inactiveLane);
            warpCheck(success, mask == expected);
        }
    };
} // namespace

TEMPLATE_LIST_TEST_CASE("warp activemask reflects participating lanes", "[warp][activemask]", WarpTestBackends)
{
    auto cfg = TestType::makeDict();
    auto deviceSpec = cfg[object::deviceSpec];
    auto exec = cfg[object::exec];

    auto selector = onHost::makeDeviceSelector(deviceSpec);
    if(!selector.isAvailable())
    {
        INFO("No device available for " << deviceSpec.getName());
        return;
    }

    auto deviceProperties = selector.getDeviceProperties(0);
    auto const warpExtent = deviceProperties.warpSize;

    auto device = selector.makeDevice(0);
    auto queue = device.makeQueue(queueKind::blocking);

    auto successHost = onHost::allocHost<bool>(1u);
    auto successDev = onHost::allocLike(device, successHost);

    auto const blocks = Vec<std::uint32_t, 1u>{5u};
    auto const threads = Vec<std::uint32_t, 1u>{4u * warpExtent};
    auto const frame = onHost::FrameSpec{blocks, threads};

    for(std::uint32_t inactiveLane = 0u; inactiveLane < warpExtent; ++inactiveLane)
    {
        // Sweep every lane once to confirm the mask drops exactly that participant.
        onHost::memset(queue, successDev, static_cast<std::uint8_t>(true));
        queue.enqueue(exec, frame, KernelBundle{ActivemaskMultiThreadKernel{}, successDev, inactiveLane});
        onHost::memcpy(queue, successHost, successDev);
        onHost::wait(queue);
        INFO("backend=" << deviceSpec.getName() << " inactiveLane=" << inactiveLane);
        CHECK(successHost[0]);
    }
}