forked from alpaka-group/alpaka3
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathshfl.cpp
More file actions
131 lines (110 loc) · 5.76 KB
/
shfl.cpp
File metadata and controls
131 lines (110 loc) · 5.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* Copyright 2025 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego, Mehmet
 * Yusufoglu, René Widera
 * SPDX-License-Identifier: MPL-2.0
 */
/** @file Tests the warp "shfl" (shuffle) operation which broadcasts a value from one lane to all lanes.
* The "shfl" warp operation allows each thread to read a value from a specified source lane's register.
* It's a data exchange operation that enables direct thread-to-thread communication within a warp without shared
* memory.
*/
#include "utils.hpp"
#include <alpaka/onAcc/warp.hpp>
#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <cstdint>
#include <limits>
using namespace alpaka;
using alpaka::test::warp::warpCheck;
using alpaka::test::warp::WarpTestBackends;
namespace
{
/** Device kernel exercising the warp "shfl" (shuffle) operation.
 *
 * Each check reads a value from a chosen source lane and compares it against the value that lane
 * is known to hold. Results are accumulated via warpCheck() into the success flags.
 * NOTE(review): correctness of several checks depends on which lanes are active at each shfl call
 * (branch divergence) — statement order here is intentional and must not be rearranged.
 */
struct ShflMultiThreadKernel
{
    /**
     * @param acc accelerator handle of the executing thread
     * @param success md-span of bool flags; warpCheck() records a failure into it when a condition is false
     */
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, concepts::IMdSpan<bool> auto success) const
    {
        // Warp size is a compile-time property of the accelerator type.
        constexpr uint32_t warpExtent = onAcc::warp::getSize<ALPAKA_TYPEOF(acc)>();
        // Sanity check: a warp must contain at least one lane.
        warpCheck(success, warpExtent >= 1u);
        // Lane ID drives the expected source values for each shuffle check.
        uint32_t const lane = onAcc::warp::getLaneIdx(acc);
        // Exercise trivial zero-offset and max-offset cases.
        // Broadcasting from literal lane 0 must work regardless of the caller lane.
        warpCheck(success, onAcc::warp::shfl(acc, 42, 0u) == 42);
        // Using the current lane as the payload and requesting src=0 should always give back 0.
        warpCheck(success, onAcc::warp::shfl(acc, lane, 0u) == 0);
        if constexpr(warpExtent >= 2)
        {
            // Requesting src=1 broadcasts lane 1's value to every participant.
            // test requires at least two threads in a warp
            warpCheck(success, onAcc::warp::shfl(acc, lane, 1u) == 1);
        }
        // Large src index is clamped to the logical width; value must remain unchanged.
        warpCheck(success, onAcc::warp::shfl(acc, 5, std::numeric_limits<uint32_t>::max()) == 5);
        // Tolerance for the float round-trip checks below.
        auto const epsilon = std::numeric_limits<float>::epsilon();
        for(uint32_t width = 1; width < warpExtent; width *= 2)
        {
            // Check every logical partition width supported by the backend.
            for(uint32_t idx = 0; idx < width; ++idx)
            {
                // First lane of the logical sub-group this lane belongs to.
                auto const section = width * (lane / width);
                // Integer payloads should resolve to the subgroup-relative source index.
                auto const shuffle = onAcc::warp::shfl(acc, lane, idx, width);
                warpCheck(success, shuffle == idx + section);
                // Floating payloads exercise non-integral types under the same subgroup restriction.
                auto const ans = onAcc::warp::shfl(acc, 4.0f - static_cast<float>(lane), idx, width);
                auto const expect = 4.0f - static_cast<float>(idx + section);
                warpCheck(success, alpaka::math::abs(ans - expect) < epsilon);
            }
        }
        // Deliberately diverge the warp: the upper half exits early so the final loop runs with
        // only the lower half active.
        if(static_cast<int>(lane) >= static_cast<int>(warpExtent / 2u))
        {
            warpCheck(success, onAcc::warp::shfl(acc, 42, warpExtent - 1u) == 42);
            // Upper half should be fully masked from the final checks.
            return;
        }
        else
        {
            // check that shfl can be called within branches of the same level
            warpCheck(success, onAcc::warp::shfl(acc, 11, 0u) == 11);
        }
        // int is used to silence cast warning because warpExtent can be zero during the host path evaluation
        for(int idxTmp = 0u; idxTmp < static_cast<int>(warpExtent) / 2; ++idxTmp)
        {
            uint32_t idx = static_cast<uint32_t>(idxTmp);
            // Active sub-group must always read the value produced by the chosen lane.
            // Within the lower half, shuffling with src=idx must reproduce the selected lane.
            warpCheck(success, onAcc::warp::shfl(acc, lane, idx) == idx);
            auto const ans = onAcc::warp::shfl(acc, 4.0f - static_cast<float>(lane), idx);
            // Float payload confirms the same behaviour holds across types for the masked subgroup.
            auto const expect = 4.0f - static_cast<float>(idx);
            warpCheck(success, alpaka::math::abs(ans - expect) < epsilon);
        }
    }
};
} // namespace
TEMPLATE_LIST_TEST_CASE("warp shfl moves values between lanes", "[warp][shfl]", WarpTestBackends)
{
    // Pull the backend configuration (device specification + executor) for this template instance.
    auto config = TestType::makeDict();
    auto spec = config[object::deviceSpec];
    auto executor = config[object::exec];

    // Skip gracefully when the requested backend has no usable device on this machine.
    auto devSelector = onHost::makeDeviceSelector(spec);
    if(!devSelector.isAvailable())
    {
        INFO("No device available for " << spec.getName());
        return;
    }

    // The warp size is a device property and determines the thread extent below.
    auto const warpLanes = devSelector.getDeviceProperties(0).warpSize;

    auto dev = devSelector.makeDevice(0);
    auto workQueue = dev.makeQueue(queueKind::blocking);

    // One success flag, allocated on the host and mirrored on the device.
    auto hostFlag = onHost::allocHost<bool>(1u);
    auto devFlag = onHost::allocLike(dev, hostFlag);

    // Launch several blocks, each holding a multiple of the warp size in threads.
    auto const blockCount = Vec<std::uint32_t, 1u>{5u};
    auto const threadCount = Vec<std::uint32_t, 1u>{4u * warpLanes};

    // Initialize the flag to true; the kernel clears it on any failed check.
    onHost::memset(workQueue, devFlag, static_cast<std::uint8_t>(true));
    workQueue.enqueue(executor, onHost::FrameSpec{blockCount, threadCount}, KernelBundle{ShflMultiThreadKernel{}, devFlag});
    onHost::memcpy(workQueue, hostFlag, devFlag);
    onHost::wait(workQueue);

    INFO("backend=" << spec.getName());
    CHECK(hostFlag[0]);
}