forked from alpaka-group/alpaka3
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathshfl_down.cpp
More file actions
141 lines (119 loc) · 6.42 KB
/
shfl_down.cpp
File metadata and controls
141 lines (119 loc) · 6.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/* Copyright 2025 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego, Mehmet
 * Yusufoglu, René Widera
 * SPDX-License-Identifier: MPL-2.0
 */
/** @file Tests the warp "shfl_down" (shuffle down) operation which shifts values toward higher-numbered lanes.
* The "shfl_down" warp operation allows each thread to read a value from a lane at a fixed offset below.
* It's a data exchange operation useful for prefix scans and reduction patterns within a warp.
*/
#include "utils.hpp"
#include <alpaka/onAcc/warp.hpp>
#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <cstdint>
#include <limits>
using namespace alpaka;
using alpaka::test::warp::warpCheck;
using alpaka::test::warp::WarpTestBackends;
namespace
{
    //! Device kernel validating onAcc::warp::shflDown for int and float payloads.
    //!
    //! Checks, in order: zero-offset identity, offset-1 shift with boundary clamping,
    //! all power-of-two partition widths below the warp extent, and finally shuffles
    //! issued while the upper half of the warp has already exited (partial activity).
    //! Results are accumulated into the `success` span via warpCheck.
    struct ShflDownMultiThreadKernel
    {
        template<typename TAcc>
        ALPAKA_FN_ACC void operator()(TAcc const& acc, concepts::IMdSpan<bool> auto success) const
        {
            auto const warpExtent = static_cast<std::int32_t>(onAcc::warp::getSize(acc));
            warpCheck(success, warpExtent >= 1);
            // Lane arithmetic below assumes the block size is a whole number of warps.
            auto const threadsPerBlock = static_cast<std::int32_t>(acc[alpaka::layer::thread].count().product());
            warpCheck(success, threadsPerBlock % warpExtent == 0);
            auto const lane = static_cast<std::int32_t>(onAcc::warp::getLaneIdx(acc));
            // With zero offset, every lane should see the source literal unchanged.
            warpCheck(success, onAcc::warp::shflDown(acc, 42, 0u) == 42);
            // A zero-offset shuffle leaves each lane's original value intact.
            warpCheck(success, onAcc::warp::shflDown(acc, lane, 0u) == lane);
            // Offset of one shifts values toward higher indices, clamping at the partition boundary:
            // lane 0 sees lane 1's value, lane 1 sees lane 2's, ..., the last lane sees its own value.
            warpCheck(success, onAcc::warp::shflDown(acc, lane, 1u) == (lane + 1 < warpExtent ? lane + 1 : lane));
            auto const epsilon = std::numeric_limits<float>::epsilon();
            // NOTE(review): 'width < warpExtent' never exercises the full-warp width itself;
            // confirm whether 'width <= warpExtent' was intended (offset 1 at full width is
            // covered by the check above, but larger offsets at full width are not).
            for(int width = 1; width < warpExtent; width *= 2)
            {
                // Validate every power-of-two partition width with every offset inside it.
                for(int idx = 0; idx < width; ++idx)
                {
                    auto const sectionStart = width * (lane / width);
                    auto const sectionEnd = sectionStart + width;
                    auto const shuffled = onAcc::warp::shflDown(
                        acc,
                        lane,
                        static_cast<std::uint32_t>(idx),
                        static_cast<std::uint32_t>(width));
                    // Each lane reads from the lane `idx` positions higher inside its partition,
                    // or keeps its own value when that source would fall past the partition end.
                    auto const expectedInt = (lane + idx < sectionEnd) ? lane + idx : lane;
                    warpCheck(success, shuffled == expectedInt);
                    // Repeat with a float payload: same source-lane selection, different value type.
                    auto const ans = onAcc::warp::shflDown(
                        acc,
                        4.0f - static_cast<float>(lane),
                        static_cast<std::uint32_t>(idx),
                        static_cast<std::uint32_t>(width));
                    auto const expect = (lane + idx < sectionEnd) ? 4.0f - static_cast<float>(lane + idx)
                                                                  : 4.0f - static_cast<float>(lane);
                    warpCheck(success, alpaka::math::abs(ans - expect) < epsilon);
                }
            }
            // Retire the upper half of the warp so the final spot checks run on a
            // partially active warp.
            if(lane >= warpExtent / 2)
            {
                warpCheck(success, onAcc::warp::shflDown(acc, 42, 1u) == 42);
                return;
            }
            for(int idx = 0; idx < warpExtent / 2; ++idx)
            {
                // Only the lower half of the warp participates here; each active lane reads
                // from the lane `idx` positions higher within the active sub-group.
                auto const shuffled = onAcc::warp::shflDown(acc, lane, static_cast<std::uint32_t>(idx));
                auto const ans
                    = onAcc::warp::shflDown(acc, 4.0f - static_cast<float>(lane), static_cast<std::uint32_t>(idx));
                if(lane + idx < warpExtent / 2)
                {
                    // In-range partner lane: lane 0 with offset 1 reads lane 1, lane 1 reads
                    // lane 2, etc., until the active sub-group ends. Results for out-of-range
                    // partners are implementation-defined and deliberately left unchecked.
                    warpCheck(success, shuffled == lane + idx);
                    // Floating payload mirrors the integer expectation for the same partner lane.
                    auto const expectFloat = 4.0f - static_cast<float>(lane + idx);
                    warpCheck(success, alpaka::math::abs(ans - expectFloat) < epsilon);
                }
            }
        }
    };
} // namespace
TEMPLATE_LIST_TEST_CASE("warp shflDown shifts toward higher lanes", "[warp][shfl_down]", WarpTestBackends)
{
    // Pull the backend configuration (device spec + executor) from the test type.
    auto dict = TestType::makeDict();
    auto spec = dict[object::deviceSpec];
    auto executor = dict[object::exec];
    auto devSelector = onHost::makeDeviceSelector(spec);
    if(!devSelector.isAvailable())
    {
        INFO("No device available for " << spec.getName());
        return;
    }
    auto const warpSize = devSelector.getDeviceProperties(0).warpSize;
    auto dev = devSelector.makeDevice(0);
    auto blockingQueue = dev.makeQueue(queueKind::blocking);
    // One boolean success flag, mirrored between host and device.
    auto hostFlag = onHost::allocHost<bool>(1u);
    auto devFlag = onHost::allocLike(dev, hostFlag);
    // Several blocks, each spanning multiple warps, to exercise warp-local behavior broadly.
    auto const numBlocks = Vec<std::uint32_t, 1u>{5u};
    auto const numThreads = Vec<std::uint32_t, 1u>{4u * warpSize};
    // Start from "all good"; the kernel flips the flag on any failed check.
    onHost::memset(blockingQueue, devFlag, static_cast<std::uint8_t>(true));
    blockingQueue.enqueue(
        executor,
        onHost::FrameSpec{numBlocks, numThreads},
        KernelBundle{ShflDownMultiThreadKernel{}, devFlag});
    onHost::memcpy(blockingQueue, hostFlag, devFlag);
    onHost::wait(blockingQueue);
    INFO("backend=" << spec.getName());
    CHECK(hostFlag[0]);
}