// Source: alpaka3/test/unit/warp/shfl_xor.cpp at ffcfc3c27f7c821655a684f024f58f47291b94a4 (psychocoderHPC/alpaka3, GitHub)

/* Copyright 2025 Mehmet Yusufoglu
 * SPDX-License-Identifier: MPL-2.0
 */

/** @file Tests the warp "shfl_xor" (shuffle XOR) operation which exchanges values according to XOR-based lane pairing.
 * The "shfl_xor" warp operation allows each thread to read from a lane whose ID is the XOR of its own ID with a mask.
 * It's a data exchange operation commonly used in butterfly reduction and FFT-like parallel algorithms.
 */

#include "utils.hpp"

#include <alpaka/onAcc/warp.hpp>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>

#include <cstdint>
#include <limits>

using namespace alpaka;
using alpaka::test::warp::warpCheck;
using alpaka::test::warp::WarpTestBackends;

namespace
{
    /** Device functor that compares the results of onAcc::warp::shflXor with the expected XOR lane pairing.
     *
     * Each expectation is handed to warpCheck together with `success`; the functor itself returns nothing.
     * NOTE(review): the shuffles are presumably warp-collective, so the order of the calls and the early return
     * near the end should be left exactly as they are.
     */
    struct ShflXorMultiThreadKernel
    {
        /** Runs every shflXor check for the calling thread.
         *
         * @param acc accelerator handle used to query warp size, lane index and thread count and to shuffle
         * @param success bool view that is passed to warpCheck along with each checked condition
         */
        template<typename TAcc>
        ALPAKA_FN_ACC void operator()(TAcc const& acc, concepts::IMdSpan<bool> auto success) const
        {
            auto const warpSize = static_cast<std::int32_t>(onAcc::warp::getSize(acc));
            warpCheck(success, warpSize >= 1);

            // The thread count of the thread layer has to be a whole multiple of the warp size.
            auto const threadCount = static_cast<std::int32_t>(acc[alpaka::layer::thread].count().product());
            warpCheck(success, threadCount % warpSize == 0);

            auto const laneIdx = static_cast<std::int32_t>(onAcc::warp::getLaneIdx(acc));

            // Mask 0 with a payload that is identical on all lanes: the constant has to come back unchanged.
            warpCheck(success, onAcc::warp::shflXor(acc, 42, 0u) == 42);
            // Mask 0 with the lane index as payload: laneIdx ^ 0 == laneIdx, so each lane reads its own index.
            warpCheck(success, onAcc::warp::shflXor(acc, laneIdx, 0u) == laneIdx);

            // Mask 1 pairs neighbouring lanes (0 <-> 1, 2 <-> 3, ...). For a warp of size one, receiving the
            // own index is accepted as well.
            auto const neighbourResult = onAcc::warp::shflXor(acc, laneIdx, 1u);
            bool const gotNeighbour = neighbourResult == (laneIdx ^ 1);
            bool const gotSelfInUnitWarp = warpSize == 1 && neighbourResult == laneIdx;
            warpCheck(success, gotNeighbour || gotSelfInUnitWarp);

            // All-ones mask with a payload that is identical on all lanes: whichever lane is the source, the
            // value read back has to be 5 again.
            warpCheck(success, onAcc::warp::shflXor(acc, 5, std::numeric_limits<std::uint32_t>::max()) == 5);

            auto const tolerance = std::numeric_limits<float>::epsilon();
            // Explicit sub-warp widths 1, 2, 4, ... strictly below the warp size.
            for(int width = 1; width < warpSize; width *= 2)
            {
                // Try each XOR mask in [0, width).
                for(int mask = 0; mask < width; ++mask)
                {
                    // Integer payload: the value received must be the index of the lane laneIdx ^ mask.
                    auto const partnerIdx = onAcc::warp::shflXor(
                        acc,
                        laneIdx,
                        static_cast<std::uint32_t>(mask),
                        static_cast<std::uint32_t>(width));
                    warpCheck(success, partnerIdx == (laneIdx ^ mask));

                    // Float payload: every lane contributes 4 - laneIdx and expects the partner's contribution.
                    auto const received = onAcc::warp::shflXor(
                        acc,
                        4.0f - static_cast<float>(laneIdx),
                        static_cast<std::uint32_t>(mask),
                        static_cast<std::uint32_t>(width));
                    auto const wanted = 4.0f - static_cast<float>(laneIdx ^ mask);
                    warpCheck(success, alpaka::math::abs(received - wanted) < tolerance);
                }
            }

            auto const halfWarp = warpSize / 2;
            if(laneIdx >= halfWarp)
            {
                // Lanes of the upper half stop here; only the lower half issues the shuffles below.
                return;
            }

            // NOTE(review): assuming the warp size is a power of two, laneIdx ^ mask stays inside the lower
            // half for every mask below halfWarp, so each partner is a lane that is still running.
            for(int mask = 0; mask < halfWarp; ++mask)
            {
                warpCheck(
                    success,
                    onAcc::warp::shflXor(acc, laneIdx, static_cast<std::uint32_t>(mask)) == (laneIdx ^ mask));
                auto const received
                    = onAcc::warp::shflXor(acc, 4.0f - static_cast<float>(laneIdx), static_cast<std::uint32_t>(mask));
                auto const wanted = 4.0f - static_cast<float>(laneIdx ^ mask);
                warpCheck(success, alpaka::math::abs(received - wanted) < tolerance);
            }
        }
    };
} // namespace

// Host-side driver: launches ShflXorMultiThreadKernel once per backend listed in WarpTestBackends and checks the
// single success flag that is copied back from the device.
TEMPLATE_LIST_TEST_CASE("warp shflXor exchanges partner lanes", "[warp][shfl_xor]", WarpTestBackends)
{
    auto backendCfg = TestType::makeDict();
    auto devSpec = backendCfg[object::deviceSpec];
    auto executor = backendCfg[object::exec];

    auto devSelector = onHost::makeDeviceSelector(devSpec);
    if(!devSelector.isAvailable())
    {
        // Nothing to run on for this backend. Catch2 only reports INFO text together with a failing assertion,
        // so this early exit is silent.
        INFO("No device available for " << devSpec.getName());
        return;
    }

    // Warp size as reported for device 0; it determines the number of threads requested below.
    auto const warpSize = devSelector.getDeviceProperties(0).warpSize;

    auto dev = devSelector.makeDevice(0);
    auto blockingQueue = dev.makeQueue(queueKind::blocking);

    // One bool on the host and a matching buffer on the device.
    auto hostFlag = onHost::allocHost<bool>(1u);
    auto devFlag = onHost::allocLike(dev, hostFlag);

    // Launch shape: 5 blocks with four warps' worth of threads each.
    auto const numBlocks = Vec<std::uint32_t, 1u>{5u};
    auto const numThreads = Vec<std::uint32_t, 1u>{4u * warpSize};

    // Start from "true"; presumably warpCheck clears the flag when a condition fails.
    onHost::memset(blockingQueue, devFlag, static_cast<std::uint8_t>(true));
    blockingQueue.enqueue(
        executor,
        onHost::FrameSpec{numBlocks, numThreads},
        KernelBundle{ShflXorMultiThreadKernel{}, devFlag});
    onHost::memcpy(blockingQueue, hostFlag, devFlag);
    onHost::wait(blockingQueue);

    INFO("backend=" << devSpec.getName());
    CHECK(hostFlag[0]);
}