diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index db3ef601..fb46ab97 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -18,3 +18,5 @@ project("alpakaExamples" LANGUAGES CXX)
 add_subdirectory("heatEquation2D/")
 add_subdirectory("vectorAdd/")
 add_subdirectory("tutorial/")
+add_subdirectory("helloWorld/")
+add_subdirectory("reduction/")
diff --git a/example/reduction/CMakeLists.txt b/example/reduction/CMakeLists.txt
new file mode 100644
index 00000000..18db8a31
--- /dev/null
+++ b/example/reduction/CMakeLists.txt
@@ -0,0 +1,50 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.25)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME reduction)
+
+project(reduction LANGUAGES CXX)
+
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if (NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if (alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively.
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else ()
+        find_package(alpaka REQUIRED)
+    endif ()
+endif ()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+file(GLOB_RECURSE exampleSources "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")
+
+foreach (exampleFile ${exampleSources})
+    get_filename_component(exampleFileName ${exampleFile} NAME)
+    string(REPLACE ".cpp" "" exampleName ${exampleFileName})
+    alpaka_add_executable(${exampleName} ${exampleFile})
+    target_link_libraries(${exampleName} PUBLIC alpaka::alpaka)
+    set_target_properties(${exampleName} PROPERTIES FOLDER reduction)
+    target_compile_features(${exampleName} PRIVATE cxx_std_20)
+
+    add_test(NAME ${exampleName} COMMAND ${exampleName})
+endforeach ()
diff --git a/example/reduction/src/config.h b/example/reduction/src/config.h
new file mode 100644
index 00000000..8a69e3c8
--- /dev/null
+++ b/example/reduction/src/config.h
@@ -0,0 +1,26 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <cstdint>
+
+// index type
+using Idx = uint32_t;
+// vectors
+template<uint32_t T_Dim>
+using Vec = alpaka::Vec<Idx, T_Dim>;
+// zero dimensions (a scalar) is currently not supported
+// using Scalar = Vec<0u>;
+using Vec1D = Vec<1u>;
+using Vec2D = Vec<2u>;
+using Vec3D = Vec<3u>;
+
+// undefine NDEBUG so that the asserts in this example are always active
+#ifdef NDEBUG
+# undef NDEBUG
+#endif
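Aside, not part of the patch: a minimal host-only sketch of how the aliases from config.h are used below, assuming the usual alpaka vector interface (a per-component constructor and product(), which the kernel relies on); all names and sizes are illustrative:

    #include "config.h"

    #include <iostream>

    int main()
    {
        Vec1D const extent{1024u};       // 1-dimensional extent: 1024 elements
        Vec3D const volume{4u, 8u, 16u}; // 3-dimensional extent
        // product() collapses an extent into the total number of elements it spans,
        // which is why the kernel below assigns it to the scalar-like Vec1D
        std::cout << extent.product() << ' ' << volume.product() << '\n'; // prints "1024 512"
        return 0;
    }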
diff --git a/example/reduction/src/reduction.cpp b/example/reduction/src/reduction.cpp
new file mode 100644
index 00000000..8ecea705
--- /dev/null
+++ b/example/reduction/src/reduction.cpp
@@ -0,0 +1,250 @@
+/* Copyright 2024 Andrea Bocci, René Widera
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "config.h"
+
+#include <alpaka/alpaka.hpp>
+
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <random>
+
+/** @file
+ *
+ * The previous example showed how to handle thread indices by hand to iterate over 1- and 3-dimensional
+ * data. You will seldom need this explicit control over threads and blocks, and handling thread indices
+ * by hand often leads to performance issues, at least on CPU devices.
+ *
+ * This example shows how to iterate with frames, which can be seen as chunks of data indices, without
+ * calculating thread indices by hand. The code is easy to write and read, and will usually be faster on
+ * both CPU and GPU devices.
+ */
+
+// Each block sums a portion of in1[] and writes its partial sum to out[];
+// the host accumulates the per-block partial sums into the final result.
+
+struct ReductionKernel
+{
+    template<typename TAcc>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, auto const in1, auto out, auto arraySize) const
+    {
+        // product() returns a scalar, therefore we need the explicit Vec1D type
+        Vec1D linearNumFrames = acc[alpaka::frame::count].product();
+        auto frameExtent = acc[alpaka::frame::extent];
+        auto frameDataExtent = linearNumFrames * frameExtent;
+        Vec1D linearFrameExtent = frameExtent.product();
+        // map the threads of this block onto the elements of one frame
+        auto traverseInFrame
+            = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{frameExtent});
+        // map the blocks of the grid onto the frames: [0, frameDataExtent) in steps of one frame
+        auto traverseOverFrames = alpaka::onAcc::makeIdxMap(
+            acc,
+            alpaka::onAcc::worker::blocksInGrid,
+            alpaka::IdxRange{alpaka::CVec<uint32_t, 0u>{}, frameDataExtent, linearFrameExtent});
+        // shared memory for the per-element partial sums of this block
+        auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray<double>(acc, frameExtent);
+
+        // Zero the shared-memory partial sums. No synchronization is required yet:
+        // each thread initializes exactly the slots it will accumulate into below.
+        for(auto [elemIdxInFrame] : traverseInFrame)
+        {
+            sumOnSharedMem[elemIdxInFrame] = 0;
+        }
+
+        // init completed
+        ///////////////////////////////////////////////////////////
+        // For each frame, and for each element the current thread handles within the
+        // frame, accumulate a small piece of in1[] into this thread's shared memory slot.
+
+        for(auto frameIdx : traverseOverFrames)
+        {
+            for(auto elemIdx : traverseInFrame)
+            {
+                for(auto [i] : alpaka::onAcc::makeIdxMap(
+                        acc,
+                        alpaka::onAcc::WorkerGroup{frameIdx + elemIdx, frameDataExtent},
+                        alpaka::IdxRange{arraySize}))
+                {
+                    sumOnSharedMem[elemIdx] += in1[i];
+                }
+            }
+        }
+
+        alpaka::onAcc::syncBlockThreads(acc);
+
+        // Accumulation into shared memory completed.
+        ///////////////////////////////////////////////////////////
+        // Fold the partial sums of all frame elements into the first
+        // blockSize slots, one slot per thread of the block.
+
+        for(auto [elemIdx] : alpaka::onAcc::makeIdxMap(
+                acc,
+                alpaka::onAcc::worker::threadsInBlock,
+                alpaka::IdxRange{acc[alpaka::layer::thread].count(), frameExtent}))
+        {
+            sumOnSharedMem[acc[alpaka::layer::thread].idx()] += sumOnSharedMem[elemIdx];
+        }
+
+        // Per-thread summation completed.
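+        // What follows is a classic shared-memory tree reduction: in each step the
+        // lower half of the threads adds the upper half's slots, halving the stride
+        // until the block's total is left in sumOnSharedMem[0]. For an illustrative
+        // blockSize of 8 (s = sumOnSharedMem):
+        //   stride 4: s[0]+=s[4]  s[1]+=s[5]  s[2]+=s[6]  s[3]+=s[7]
+        //   stride 2: s[0]+=s[2]  s[1]+=s[3]
+        //   stride 1: s[0]+=s[1]
+        // This pattern assumes that blockSize is a power of two.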
+        ///////////////////////////////////////////////////////////
+        // Tree reduction over the blockSize per-thread sums in shared memory.
+
+        auto const [local_i] = acc[alpaka::layer::thread].idx();
+        auto const [blockSize] = acc[alpaka::layer::thread].count();
+        for(auto stride = blockSize / 2; stride > 0; stride /= 2)
+        {
+            alpaka::onAcc::syncBlockThreads(acc);
+            if(local_i < stride)
+            {
+                sumOnSharedMem[local_i] += sumOnSharedMem[local_i + stride];
+                // printf("[kernel] %f\n", sumOnSharedMem[local_i]);
+                // sumOnSharedMem[local_i + stride] = 0;
+            }
+        }
+
+        // The first thread of each block publishes the block's partial sum.
+        if(local_i == 0)
+        {
+            out[acc[alpaka::layer::block].idx().x()] = sumOnSharedMem[local_i];
+        }
+    }
+};
+
+void testReductionKernel(
+    alpaka::onHost::concepts::Device auto host,
+    alpaka::onHost::concepts::Device auto device,
+    auto computeExec,
+    auto size)
+{
+    // std::cout << "[Host] " << alpaka::onHost::getName(host) << ", ";
+    // std::cout << "[Device] " << alpaka::onHost::getName(device) << ", ";
+    std::cout << alpaka::onHost::getName(device) << ", " << std::flush;
+
+    // pseudo-random number generator with a gaussian distribution;
+    // default-seeded, so every run uses the same input data
+    // std::random_device rd{};
+    // std::default_random_engine rand{rd()};
+    std::default_random_engine rand{};
+    std::normal_distribution<double> dist{0.01, 1.};
+
+    // problem size
+    // std::cout << "[Problem Size] " << size << ", ";
+    std::cout << size << ", " << std::flush;
+
+    // tolerance for comparing the device result with the host reference
+    constexpr double epsilon = 0.0001;
+    // constexpr float epsilon = 0.000001f * size;
+
+    // allocate input and output host buffers in pinned memory accessible by the Platform devices
+    auto in1_h = alpaka::onHost::alloc<double>(host, Vec1D{size});
+    auto out_h = alpaka::onHost::allocMirror(host, in1_h);
+
+    // fill the input buffer with random data, and the output buffer with zeros
+    for(uint32_t i = 0; i < size; ++i)
+    {
+        in1_h[i] = dist(rand);
+        out_h[i] = 0.;
+    }
+
+    // run the test on the given device
+    alpaka::onHost::Queue queue = device.makeQueue();
+
+    // allocate input and output buffers on the device
+    auto in1_d = alpaka::onHost::allocMirror(device, in1_h);
+    auto out_d = alpaka::onHost::allocMirror(device, out_h);
+
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::onHost::memcpy(queue, in1_d, in1_h);
+
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::onHost::memset(queue, out_d, 0x00);
+
+    // launch the 1-dimensional kernel
+    constexpr auto frameExtent = 256;
+    // constexpr auto frameExtent = 1024;
+    auto numFrames = Vec1D{size} / frameExtent / 8;
+    // The kernel assumes that the problem size is a multiple of the frame size.
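+    // With numFrames = size / frameExtent / 8, the grid covers only 1/8 of the
+    // input directly; the grid-strided loop inside the kernel makes each thread
+    // accumulate roughly 8 input elements before the in-block reduction starts.
+    // The factor 8 is a tuning choice: for example, size = 2^21 with
+    // frameExtent = 256 gives numFrames = 1024 blocks, each writing one
+    // partial sum to out[].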
+    // assert((numFrames * frameExtent).x() * 8 == size);
+
+    auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec<uint32_t, frameExtent>{}};
+
+    // std::cout << "Grid of " << frameSpec << ", ";
+
+    alpaka::onHost::wait(queue);
+    auto beginT = std::chrono::high_resolution_clock::now();
+    queue.enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D{size});
+    alpaka::onHost::wait(queue);
+    auto endT = std::chrono::high_resolution_clock::now();
+    // std::cout << "[T Kernel Exec] " << std::chrono::duration<double>(endT - beginT).count() << "s, ";
+    std::cout << std::chrono::duration<double>(endT - beginT).count() << ", " << std::flush;
+
+    beginT = std::chrono::high_resolution_clock::now();
+    // copy the results from the device to the host
+    alpaka::onHost::memcpy(queue, out_h, out_d);
+    // wait for all the operations to complete
+    alpaka::onHost::wait(queue);
+    endT = std::chrono::high_resolution_clock::now();
+    // std::cout << "[T DtoH Copy] " << std::chrono::duration<double>(endT - beginT).count() << "s, ";
+
+    beginT = std::chrono::high_resolution_clock::now();
+    // accumulate the per-block partial sums; out_h was zero-filled, so summing the
+    // whole buffer yields the sum of the numFrames partial sums
+    auto finalSum = std::accumulate(&out_h[0], &out_h[0] + size, double(0));
+    endT = std::chrono::high_resolution_clock::now();
+    // std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration<double>(endT - beginT).count() << "s, ";
+
+    // check the result against a sequential reduction of the input on the host
+    double sum = 0;
+    for(uint32_t i = 0; i < size; ++i)
+    {
+        // if(i < 5) std::cout << "[num] " << in1_h[i] << std::endl;
+        sum += in1_h[i];
+    }
+    // std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl;
+    assert(std::abs(finalSum - sum) < epsilon);
+    // std::cout << "[Results] " << "success\n";
+    std::cout << "success\n" << std::flush;
+}
+
+int example(auto const cfg)
+{
+    auto deviceApi = cfg[alpaka::object::api];
+    auto computeExec = cfg[alpaka::object::exec];
+
+    // initialise the accelerator platform
+    alpaka::onHost::Platform platform = alpaka::onHost::makePlatform(deviceApi);
+
+    // require at least one device
+    std::size_t n = alpaka::onHost::getDeviceCount(platform);
+
+    if(n == 0)
+    {
+        return EXIT_FAILURE;
+    }
+
+    // use the single host device
+    alpaka::onHost::Platform host_platform = alpaka::onHost::makePlatform(alpaka::api::cpu);
+    alpaka::onHost::Device host = host_platform.makeDevice(0);
+
+    // use the first device
+    alpaka::onHost::Device device = platform.makeDevice(0);
+
+    // problem sizes of 2^fac Mi elements; currently a single size of 1 Gi elements
+    for(int fac = 10; fac < 11; fac++)
+    {
+        uint32_t size = (1024u * 1024u) << fac;
+        testReductionKernel(host, device, computeExec, size);
+    }
+
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    using namespace alpaka;
+    // Execute the example once for each enabled API and executor.
+    // std::srand(std::time(0)); // seed std::rand() with the current time
+    std::srand(12345); // fixed seed, so std::rand() produces a reproducible sequence
+    std::cout << "Device, Problem Size, T Kernel Exec (s), Results" << std::endl;
+    return executeForEach(
+        [=](auto const& tag) { return example(tag); },
+        onHost::allExecutorsAndApis(onHost::enabledApis));
+}
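Aside, not part of the patch: the in-block tree reduction above can be cross-checked on the host with a small, alpaka-free C++ model; names and values are illustrative:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Host model of ReductionKernel's final step: tree-reduce one block of
    // blockSize partial sums (blockSize must be a power of two).
    double blockTreeReduce(std::vector<double> s)
    {
        auto const blockSize = static_cast<std::uint32_t>(s.size());
        for(std::uint32_t stride = blockSize / 2; stride > 0; stride /= 2)
            for(std::uint32_t i = 0; i < stride; ++i)
                s[i] += s[i + stride]; // same update as sumOnSharedMem[local_i]
        return s[0];
    }

    int main()
    {
        std::vector<double> partials{1., 2., 3., 4., 5., 6., 7., 8.};
        assert(std::abs(blockTreeReduce(partials) - 36.) < 1e-12);
        return 0;
    }

On the device, the i < stride loop runs as one thread per i with a block-wide barrier between strides; this sequential model produces the same result because each step only reads slots that the previous step has finished writing.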