Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ project("alpakaExamples" LANGUAGES CXX)
add_subdirectory("heatEquation2D/")
add_subdirectory("vectorAdd/")
add_subdirectory("tutorial/")
add_subdirectory("helloWorld/")
add_subdirectory("reduction/")
50 changes: 50 additions & 0 deletions example/reduction/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#
# Copyright 2023 Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.25)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME vectorAdd)

project(tutorail LANGUAGES CXX)


#-------------------------------------------------------------------------------
# Find alpaka.

if (NOT TARGET alpaka::alpaka)
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

if (alpaka_USE_SOURCE_TREE)
# Don't build the examples recursively
set(alpaka_BUILD_EXAMPLES OFF)
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
else ()
find_package(alpaka REQUIRED)
endif ()
endif ()

#-------------------------------------------------------------------------------
# Add executable.

file(GLOB_RECURSE tutorialSource "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")

foreach (tutExampleFile ${tutorialSource})
get_filename_component(tutFileName ${tutExampleFile} NAME)
string(REPLACE ".cpp" "" tutName ${tutFileName})
alpaka_add_executable(${tutName} ${tutExampleFile})
target_link_libraries(${tutName} PUBLIC alpaka::alpaka)
set_target_properties(${tutName} PROPERTIES FOLDER reduction)
target_compile_features(${tutName} PRIVATE cxx_std_20)

add_test(NAME ${tutName} COMMAND ${tutName})
endforeach ()
26 changes: 26 additions & 0 deletions example/reduction/src/config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/* Copyright 2024 Andrea Bocci
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <alpaka/alpaka.hpp>
#include <alpaka/example/executeForEach.hpp>
#include <alpaka/example/executors.hpp>

#include <cstdint>

// index type
using Idx = uint32_t;
// vectors
template<uint32_t T_dim>
using Vec = alpaka::Vec<uint32_t, T_dim>;
// zero dimension aka scalar is currently not supported
// using Scalar = Vec<Dim0D>;
using Vec1D = Vec<1u>;
using Vec2D = Vec<2u>;
using Vec3D = Vec<3u>;

// remove NDEBUG to activate asserts
#ifdef NDEBUG
# undef NDEBUG
#endif
250 changes: 250 additions & 0 deletions example/reduction/src/reduction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: Apache-2.0
*/

#include "config.h"

#include <alpaka/alpaka.hpp>

#include <cassert>
#include <cstdio>
#include <random>

#include <chrono>
#include <iostream>

#include <cstdlib>
#include <ctime>

/** @file
*
* In the previous example we showed how to handle thread indices by hand to iterate over 1 and 3-dimensional data.
* There are very seldom cases where you need this explict control over threads and blocks. Very often handling thread
* indices by hand will result in performance issues at least on CPU devices.
*
* This example will show how you can iterate with frames, which can be seen as data index chunks without explicit
* calculate thread indices by hand. The code is is easy to write and read and will mostly be faster on CPU and GPU
* devices.
*/

// sum += A[]*B[];

struct ReductionKernel
{
template<typename TAcc>
ALPAKA_FN_ACC void operator()(TAcc const& acc, auto const in1, auto out, auto arraySize) const
{
// product() returns a scalar therefore we need the explicit Vec1D type
Vec1D linearNumFrames = acc[alpaka::frame::count].product();
auto frameExtent = acc[alpaka::frame::extent];
auto frameDataExtent = linearNumFrames*frameExtent;
Vec1D linearFrameExtent = frameExtent.product();
auto traverseInFrame = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{frameExtent});
auto traverseOverFrames = alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::blocksInGrid, alpaka::IdxRange{alpaka::CVec<uint32_t, 0u>{}, frameDataExtent, linearFrameExtent});
// Shared Mempry
auto sumOnSharedMem = alpaka::onAcc::declareSharedMdArray<double>(acc, frameExtent);

// Values of these addresses will be used later.
// Sync() is not required.
for(auto [elemIdxInFrame] : traverseInFrame){
sumOnSharedMem[elemIdxInFrame] = 0;
}

// init completed
///////////////////////////////////////////////////////////
// For each frame in frames and for each thread in frame do
// get the sum of a small piece in in1[] on the thread to the sharedMem[].

for(auto frameIdx : traverseOverFrames){
for(auto elemIdx : traverseInFrame){
for(auto [i]: alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::WorkerGroup{frameIdx + elemIdx, frameDataExtent}, alpaka::IdxRange{arraySize})){
sumOnSharedMem[elemIdx] += in1[i];
}
}
}

alpaka::onAcc::syncBlockThreads(acc);

// Copying data to sharedMem completed.
///////////////////////////////////////////////////////////
// For each thread on acc do
// sum up.

for(auto [elemIdx] : alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInBlock, alpaka::IdxRange{acc[alpaka::layer::thread].count(), frameExtent})){
sumOnSharedMem[acc[alpaka::layer::thread].idx()] += sumOnSharedMem[elemIdx];
}

// Suming up on each thread completed.
///////////////////////////////////////////////////////////
//

auto const [local_i] = acc[alpaka::layer::thread].idx();
auto const [blockSize] = acc[alpaka::layer::thread].count();
for(auto stride = blockSize /2; stride > 0; stride /=2){
alpaka::onAcc::syncBlockThreads(acc);
if(local_i < stride){
sumOnSharedMem[local_i] += sumOnSharedMem[local_i+stride];
//printf("[kernel] %f\n", sumOnSharedMem[local_i]);
//sumOnSharedMem[local_i+stride] = 0;
}
}

if(local_i == 0){
out[acc[alpaka::layer::block].idx().x()] = sumOnSharedMem[local_i];
}

//
///////////////////////////////////////////////////////////
//


}
};

void testReductionKernel(
alpaka::onHost::concepts::Device auto host,
alpaka::onHost::concepts::Device auto device,
auto computeExec, auto size)
{

//std::cout << "[Host] " << alpaka::onHost::getName(host) << ", ";
//std::cout << "[Device] " << alpaka::onHost::getName(device) << ", ";
std::cout << alpaka::onHost::getName(device) << ", " << std::flush;

// random number generator with a gaussian distribution
//std::random_device rd{};
//std::default_random_engine rand{rd()};
std::default_random_engine rand{};
std::normal_distribution<double> dist{(double)0.01, (double)1.};

// buffer size
//std::cout << "[Problem Size] " << size << ", ";
std::cout << size << ", " << std::flush;

// tolerance
constexpr double epsilon = (double)0.0001;
//constexpr float epsilon = 0.000001f*size;

// allocate input and output host buffers in pinned memory accessible by the Platform devices
auto in1_h = alpaka::onHost::alloc<double>(host, Vec1D{size});
auto out_h = alpaka::onHost::allocMirror(host, in1_h);

// fill the input buffers with random data, and the output buffer with zeros
for(auto i = 0; i < size; ++i)
{
in1_h[i] = dist(rand);
out_h[i] = 0.;
}

// run the test the given device
alpaka::onHost::Queue queue = device.makeQueue();

// allocate input and output buffers on the device
auto in1_d = alpaka::onHost::allocMirror(device, in1_h);
auto out_d = alpaka::onHost::allocMirror(device, out_h);

// copy the input data to the device; the size is known from the buffer objects
alpaka::onHost::memcpy(queue, in1_d, in1_h);

// fill the output buffer with zeros; the size is known from the buffer objects
alpaka::onHost::memset(queue, out_d, 0x00);

// launch the 1-dimensional kernel
constexpr auto frameExtent = 256;
//constexpr auto frameExtent = 1024;
auto numFrames = Vec1D{size} / frameExtent /8;
// The kernel assumes that the problem size is a multiple of the frame size.
//assert((numFrames * frameExtent).x() *8 == size);

auto frameSpec = alpaka::onHost::FrameSpec{numFrames, alpaka::CVec<uint32_t, frameExtent>{}};

// fill the output buffer with zeros; the size is known from the buffer objects
alpaka::onHost::memset(queue, out_d, 0x00);

//std::cout << "Grid of " << frameSpec << ", ";

alpaka::onHost::wait(queue);
auto beginT = std::chrono::high_resolution_clock::now();
queue
.enqueue(computeExec, frameSpec, ReductionKernel{}, in1_d.getMdSpan(), out_d.getMdSpan(), Vec1D(size));
alpaka::onHost::wait(queue);
auto endT = std::chrono::high_resolution_clock::now();
//std::cout << "[T Kernel Exec] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";
std::cout << std::chrono::duration<double>(endT - beginT).count() << ", " << std::flush;


alpaka::onHost::wait(queue);
beginT = std::chrono::high_resolution_clock::now();
// copy the results from the device to the host
alpaka::onHost::memcpy(queue, out_h, out_d);
// wait for all the operations to complete
alpaka::onHost::wait(queue);
endT = std::chrono::high_resolution_clock::now();
//std::cout << "[T HtoD Copy] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";

alpaka::onHost::wait(queue);
beginT = std::chrono::high_resolution_clock::now();
auto finalSum = std::accumulate(
&out_h[0],
&out_h[size-1],
double(0));
endT = std::chrono::high_resolution_clock::now();
//std::cout << "[T Partial Sum Accumulation] " << std::chrono::duration<double>(endT - beginT).count() << 's' << ", ";

double sum = 0;
// check the results
for(auto i = 0; i < size; ++i)
{
//if (i < 5) std::cout << "[num] " << in1_h[i] << std::endl;
sum += in1_h[i];
}
//std::cout << "acc output: " << finalSum << " host answer: " << sum << std::endl;
//printf("[Device Output] %f [Host Output] %f, ",finalSum, sum);
assert(pow(finalSum - sum,2) < pow(epsilon,2));
//std::cout << "[Results] " << "success\n";
std::cout << "success\n" << std::flush;
}

int example(auto const cfg)
{
auto deviceApi = cfg[alpaka::object::api];
auto computeExec = cfg[alpaka::object::exec];

// initialise the accelerator platform
alpaka::onHost::Platform platform = alpaka::onHost::makePlatform(deviceApi);

// require at least one device
std::size_t n = alpaka::onHost::getDeviceCount(platform);

if(n == 0)
{
return EXIT_FAILURE;
}

// use the single host device
alpaka::onHost::Platform host_platform = alpaka::onHost::makePlatform(alpaka::api::cpu);
alpaka::onHost::Device host = host_platform.makeDevice(0);

// use the first device
alpaka::onHost::Device device = platform.makeDevice(0);

uint32_t size = 1024 * 1024;
for(int fac = 10; fac < 11; fac++){
size = 1024*1024*pow(2,fac);
testReductionKernel(host, device, computeExec, size);
}

return EXIT_SUCCESS;
}

auto main() -> int
{
using namespace alpaka;
// Execute the example once for each enabled API and executor.
//std::srand(std::time(0)); // set time as random seed; rand() after this line will automatically use the same seed
std::srand(12345);
std::cout << "Device, Problem Size, T Kernel Exec (s), Results" << std::endl;
return executeForEach(
[=](auto const& tag) { return example(tag); },
onHost::allExecutorsAndApis(onHost::enabledApis));
}