alpaka3/test/unit/warp/iota.cpp at ffcfc3c27f7c821655a684f024f58f47291b94a4 · psychocoderHPC/alpaka3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Bernhard Manfred Gruber, Jan Stephan, René Widera
 * SPDX-License-Identifier: MPL-2.0
 */

#include <alpaka/alpaka.hpp>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>

#include <iostream>

using namespace alpaka;

using TestApis = std::decay_t<decltype(onHost::allBackends(onHost::enabledApis, exec::enabledExecutors))>;

struct IotaKernelND
{
    ALPAKA_FN_ACC void operator()(auto const& acc, auto out, auto outSize) const
    {
        constexpr uint32_t warpSize = onAcc::warp::getSize<ALPAKA_TYPEOF(acc)>();
        concepts::Vector auto numThreadsPerBlock = acc.getExtentsOf(onAcc::origin::block, onAcc::unit::threads);
        // test using some of the predefined warp worker and range types
        for(auto bOffset :
            onAcc::makeIdxMap(acc, onAcc::worker::linearBlocksInGrid, IdxRange{Vec{0u}, outSize, numThreadsPerBlock}))
            for(auto w : onAcc::makeIdxMap(acc, onAcc::worker::linearWarpsInBlock, onAcc::range::linearWarpsInBlock))
            {
                for(auto t :
                    onAcc::makeIdxMap(acc, onAcc::worker::linearThreadsInWarp, onAcc::range::linearThreadsInWarp))
                {
                    auto idx = bOffset + warpSize * w + t;
                    /* use atomic to avoid that multiple threads change the same location, if so we will detect it in
                     * our verification
                     */
                    if(idx < outSize)
                        onAcc::atomicAdd(acc, &out[idx].x(), idx.x());
                }
            }
    }
};

TEMPLATE_LIST_TEST_CASE("warp iota1D", "", TestApis)
{
    auto cfg = TestType::makeDict();
    auto deviceSpec = cfg[object::deviceSpec];
    auto exec = cfg[object::exec];

    auto devSelector = onHost::makeDeviceSelector(deviceSpec);
    if(!devSelector.isAvailable())
    {
        std::cout << "No device available for " << deviceSpec.getName() << std::endl;
        return;
    }

    std::cout << deviceSpec.getApi().getName() << std::endl;
    onHost::Device device = devSelector.makeDevice(0);

    std::cout << device.getName() << std::endl;

    auto deviceProperties = devSelector.getDeviceProperties(0);
    uint32_t warpSize = deviceProperties.warpSize;

    onHost::Queue queue = device.makeQueue();
    Vec extent = Vec{warpSize * 123u};
    std::cout << "exec=" << onHost::demangledName(exec) << std::endl;
    auto dBuff = onHost::alloc<Vec<uint32_t, 1u>>(device, extent);

    auto hBuff = onHost::allocHostLike(dBuff);

    onHost::wait(queue);
    /* Use an odd number that to avoid any assumptions about number of threads.
     * Note that for a thread serialized executor alpaka will internally use a single thread per block what is equal to
     * the warp size.
     */
    auto frameSize = Vec{warpSize + 3u};
    onHost::fill(queue, dBuff, Vec{0u});
    queue.enqueue(exec, onHost::FrameSpec{extent / frameSize, frameSize}, KernelBundle{IotaKernelND{}, dBuff, extent});
    onHost::memcpy(queue, hBuff, dBuff);
    onHost::wait(queue);
    meta::ndLoopIncIdx(extent, [&](auto idx) { CHECK(idx == hBuff[idx]); });
}