diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index 24d05794..9d63ff3f 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -5,20 +5,77 @@
 // source code
 
 #include "CUDAStream.h"
+#include <nvml.h>
 
-[[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) {
-  std::fprintf(stderr, "Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr);
+#if !defined(UNROLL_FACTOR)
+#define UNROLL_FACTOR 4
+#endif
+
+[[noreturn]] inline void cuda_error(char const* file, int line, char const* expr, cudaError_t e) {
+  std::fprintf(stderr, "CUDA Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr);
+  exit(e);
+}
+
+[[noreturn]] inline void nvml_error(char const* file, int line, char const* expr, nvmlReturn_t e) {
+  std::fprintf(stderr, "NVML Error at %s:%d: %s (%d)\n %s\n", file, line, nvmlErrorString(e), e, expr);
   exit(e);
 }
 
 // The do while is there to make sure you remember to put a semi-colon after calling CU
-#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false)
+#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) cuda_error(__FILE__, __LINE__, #EXPR, __e); } while(false)
+#define NVML(EXPR) do { auto __e = (EXPR); if (__e != NVML_SUCCESS) nvml_error(__FILE__, __LINE__, #EXPR, __e); } while(false)
 
 // It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__
 const __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
 
 cudaStream_t stream;
 
+template <typename T>
+T* alloc_device(const intptr_t array_size) {
+  size_t array_bytes = sizeof(T) * array_size;
+  T* p = nullptr;
+#if defined(MANAGED)
+  CU(cudaMallocManaged(&p, array_bytes));
+#elif defined(PAGEFAULT)
+  p = (T*)malloc(array_bytes);
+#else
+  CU(cudaMalloc(&p, array_bytes));
+#endif
+  if (p == nullptr) throw std::runtime_error("Failed to allocate device array");
+  return p;
+}
+
+template <typename T>
+T* alloc_host(const intptr_t array_size) {
+  size_t array_bytes = sizeof(T) * array_size;
+  T* p = nullptr;
+#if defined(PAGEFAULT)
+  p = (T*)malloc(array_bytes);
+#else
+  CU(cudaHostAlloc(&p, array_bytes, cudaHostAllocDefault));
+#endif
+  if (p == nullptr) throw std::runtime_error("Failed to allocate host array");
+  return p;
+}
+
+template <typename T>
+void free_device(T* p) {
+#if defined(PAGEFAULT)
+  free(p);
+#else
+  CU(cudaFree(p));
+#endif
+}
+
+template <typename T>
+void free_host(T* p) {
+#if defined(PAGEFAULT)
+  free(p);
+#else
+  CU(cudaFreeHost(p));
+#endif
+}
+
 template <class T>
 CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   : array_size(array_size)
@@ -33,85 +90,117 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   CU(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
   // Print out device information
-  std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
-  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
-#if defined(MANAGED)
-  std::cout << "Memory: MANAGED" << std::endl;
-#elif defined(PAGEFAULT)
-  std::cout << "Memory: PAGEFAULT" << std::endl;
-#else
-  std::cout << "Memory: DEFAULT" << std::endl;
-#endif
-
-  // Query device for sensible dot kernel block count
-  cudaDeviceProp props;
-  CU(cudaGetDeviceProperties(&props, device_index));
-  dot_num_blocks = props.multiProcessorCount * 4;
+  std::cout << "CUDA Driver: " << getDeviceDriver(device_index) << std::endl;
+  NVML(nvmlInit());
+  cudaDeviceProp dprop;
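+  // Query the memory clock via NVML, addressing the device by PCI bus id so the
+  // numbers refer to the same device CUDA selected. The peak figure printed
+  // below assumes a DDR-style memory (2 transfers per clock): e.g. 1215 MHz on
+  // a 5120-bit bus gives 2 * 1215 * (5120/8) / 1000 = 1555.2 GB/s.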
+  CU(cudaGetDeviceProperties(&dprop, device_index));
+  unsigned int memclock;
+  char mybus[16];
+  sprintf(&mybus[0], "%04x:%02x:%02x.0", dprop.pciDomainID, dprop.pciBusID, dprop.pciDeviceID);
+  nvmlDevice_t nvmldev;
+  NVML(nvmlDeviceGetHandleByPciBusId(mybus, &nvmldev));
+  NVML(nvmlDeviceGetClockInfo(nvmldev, NVML_CLOCK_MEM, &memclock));
+  std::cout << "CUDA Device " << device_index << ": \""
+            << getDeviceName(device_index)
+            << "\" " << dprop.multiProcessorCount << " SMs(" << dprop.major << "," << dprop.minor << ") "
+            << "Memory: " << memclock << " MHz x " << dprop.memoryBusWidth << "-bit = "
+            << 2.0*memclock*(dprop.memoryBusWidth/8)/1000.0 << " GB/s PEAK, ECC is "
+            << (dprop.ECCEnabled ? "ON" : "OFF")
+            << std::endl;
+
+  // Print the memory allocation API used for the buffers
+  std::cout << "Memory Allocation: ";
+  #if defined(MANAGED)
+  std::cout << "MANAGED";
+  #elif defined(PAGEFAULT)
+  std::cout << "PAGEFAULT";
+  #else
+  std::cout << "DEFAULT";
+  #endif
+  std::cout << std::endl;
+
+  std::cout << "Parallel for kernel config: thread blocks of size " << TBSIZE << std::endl;
+
+  // Set sensible dot kernel block count
+  dot_num_blocks = dprop.multiProcessorCount * 4;
 
   // Size of partial sums for dot kernels
   size_t sums_bytes = sizeof(T) * dot_num_blocks;
   size_t array_bytes = sizeof(T) * array_size;
   size_t total_bytes = array_bytes * size_t(3) + sums_bytes;
-  std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl;
+  std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
 
   // Check buffers fit on the device
-  if (props.totalGlobalMem < total_bytes)
-    throw std::runtime_error("Device does not have enough memory for all 3 buffers");
-
-  // Create device buffers
-#if defined(MANAGED)
-  CU(cudaMallocManaged(&d_a, array_bytes));
-  CU(cudaMallocManaged(&d_b, array_bytes));
-  CU(cudaMallocManaged(&d_c, array_bytes));
-  CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault));
-#elif defined(PAGEFAULT)
-  d_a = (T*)malloc(array_bytes);
-  d_b = (T*)malloc(array_bytes);
-  d_c = (T*)malloc(array_bytes);
-  sums = (T*)malloc(sums_bytes);
-#else
-  CU(cudaMalloc(&d_a, array_bytes));
-  CU(cudaMalloc(&d_b, array_bytes));
-  CU(cudaMalloc(&d_c, array_bytes));
-  CU(cudaHostAlloc(&sums, sums_bytes, cudaHostAllocDefault));
-#endif
+  if (dprop.totalGlobalMem < total_bytes)
+    throw std::runtime_error("Device does not have enough memory for all buffers");
+
+  // Allocate buffers:
+  d_a = alloc_device<T>(array_size);
+  d_b = alloc_device<T>(array_size);
+  d_c = alloc_device<T>(array_size);
+  sums = alloc_host<T>(dot_num_blocks);
 }
 
 template <class T>
 CUDAStream<T>::~CUDAStream()
 {
   CU(cudaStreamDestroy(stream));
+  free_device(d_a);
+  free_device(d_b);
+  free_device(d_c);
+  free_host(sums);
+}
 
-#if defined(PAGEFAULT)
-  free(d_a);
-  free(d_b);
-  free(d_c);
-  free(sums);
+template <typename F>
+__global__ void for_each_kernel(size_t array_size, size_t start, F f) {
+  constexpr int unroll_factor = UNROLL_FACTOR;
+#if defined(GRID_STRIDE)
+  // Grid-stride loop
+  size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x;
+  #pragma unroll(unroll_factor)
+  for (; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
+    f(i);
+  }
+#elif defined(BLOCK_STRIDE)
+  // Block-stride loop
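+  // Each block covers the contiguous range [start*blockIdx.x, start*(blockIdx.x+1));
+  // the threadIdx.x term in the upper bound gives every thread of a block the
+  // same trip count. Exactly-once coverage assumes start is a multiple of
+  // blockDim.x, which holds for the default power-of-two problem sizes.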
+  size_t i = start * blockIdx.x + threadIdx.x;
+  const size_t e = min(array_size, start * (blockIdx.x + size_t(1)) + threadIdx.x);
+  #pragma unroll(unroll_factor)
+  for (; i < e; i += blockDim.x) {
+    f(i);
+  }
 #else
-  CU(cudaFree(d_a));
-  CU(cudaFree(d_b));
-  CU(cudaFree(d_c));
-  CU(cudaFreeHost(sums));
+  #error Must pick grid-stride or block-stride loop
 #endif
 }
 
-template <typename T>
-__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size)
-{
-  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
-    a[i] = initA;
-    b[i] = initB;
-    c[i] = initC;
+template <typename F>
+void for_each(size_t array_size, F f) {
+  static int threads_per_block = 0;
+  if (threads_per_block == 0) {
+    // Pick a suitable thread block size for F:
+    int min_blocks_per_grid;
+    auto dyn_smem = [] __host__ __device__ (int){ return 0; };
+    CU(cudaOccupancyMaxPotentialBlockSizeVariableSMem
+       (&min_blocks_per_grid, &threads_per_block, for_each_kernel<F>, dyn_smem, 0));
+    // Clamp to TBSIZE
+    threads_per_block = std::min(TBSIZE, threads_per_block);
   }
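+  // Launch-size sketch (assuming the occupancy query settles at the 256-thread
+  // TBSIZE clamp): with the default 2^25 elements and UNROLL_FACTOR=4 this
+  // yields 32768 blocks, and in BLOCK_STRIDE mode each block covers
+  // start = ceil(2^25 / 32768) = 1024 contiguous elements.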
+  size_t blocks = ceil_div(array_size / UNROLL_FACTOR, threads_per_block);
+  size_t start = ceil_div(array_size, (size_t)blocks);
+  for_each_kernel<<<blocks, threads_per_block, 0, stream>>>(array_size, start, f);
+  CU(cudaPeekAtLastError());
+  CU(cudaStreamSynchronize(stream));
 }
 
 template <class T>
 void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  size_t blocks = ceil_div(array_size, TBSIZE);
-  init_kernel<<<blocks, TBSIZE, 0, stream>>>(d_a, d_b, d_c, initA, initB, initC, array_size);
-  CU(cudaPeekAtLastError());
-  CU(cudaStreamSynchronize(stream));
+  for_each(array_size, [=,a=d_a,b=d_b,c=d_c] __device__ (size_t i) {
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  });
 }
 
 template <class T>
@@ -133,101 +222,54 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
 #endif
 }
 
-template <typename T>
-__global__ void copy_kernel(const T * a, T * c, size_t array_size)
-{
-  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
-    c[i] = a[i];
-  }
-}
-
 template <class T>
 void CUDAStream<T>::copy()
 {
-  size_t blocks = ceil_div(array_size, TBSIZE);
-  copy_kernel<<<blocks, TBSIZE, 0, stream>>>(d_a, d_c, array_size);
-  CU(cudaPeekAtLastError());
-  CU(cudaStreamSynchronize(stream));
-}
-
-template <typename T>
-__global__ void mul_kernel(T * b, const T * c, size_t array_size)
-{
-  const T scalar = startScalar;
-  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
-    b[i] = scalar * c[i];
-  }
+  for_each(array_size, [a=d_a,c=d_c] __device__ (size_t i) {
+    c[i] = a[i];
+  });
 }
 
 template <class T>
 void CUDAStream<T>::mul()
 {
-  size_t blocks = ceil_div(array_size, TBSIZE);
-  mul_kernel<<<blocks, TBSIZE, 0, stream>>>(d_b, d_c, array_size);
-  CU(cudaPeekAtLastError());
-  CU(cudaStreamSynchronize(stream));
-}
-
-template <typename T>
-__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size)
-{
-  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
-    c[i] = a[i] + b[i];
-  }
+  for_each(array_size, [b=d_b,c=d_c] __device__ (size_t i) {
+    b[i] = startScalar * c[i];
+  });
 }
 
 template <class T>
 void CUDAStream<T>::add()
 {
-  size_t blocks = ceil_div(array_size, TBSIZE);
-  add_kernel<<<blocks, TBSIZE, 0, stream>>>(d_a, d_b, d_c, array_size);
-  CU(cudaPeekAtLastError());
-  CU(cudaStreamSynchronize(stream));
-}
-
-template <typename T>
-__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size)
-{
-  const T scalar = startScalar;
-  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
-    a[i] = b[i] + scalar * c[i];
-  }
+  for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) {
+    c[i] = a[i] + b[i];
+  });
 }
 
 template <class T>
 void CUDAStream<T>::triad()
 {
-  size_t blocks = ceil_div(array_size, TBSIZE);
-  triad_kernel<<<blocks, TBSIZE, 0, stream>>>(d_a, d_b, d_c, array_size);
-  CU(cudaPeekAtLastError());
-  CU(cudaStreamSynchronize(stream));
-}
-
-template <typename T>
-__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size)
-{
-  const T scalar = startScalar;
-  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
-    a[i] += b[i] + scalar * c[i];
-  }
+  for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) {
+    a[i] = b[i] + startScalar * c[i];
+  });
 }
 
 template <class T>
 void CUDAStream<T>::nstream()
 {
-  size_t blocks = ceil_div(array_size, TBSIZE);
-  nstream_kernel<<<blocks, TBSIZE, 0, stream>>>(d_a, d_b, d_c, array_size);
-  CU(cudaPeekAtLastError());
-  CU(cudaStreamSynchronize(stream));
+  for_each(array_size, [a=d_a,b=d_b,c=d_c] __device__ (size_t i) {
+    a[i] += b[i] + startScalar * c[i];
+  });
 }
 
 template <typename T>
 __global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size)
 {
-  __shared__ T smem[TBSIZE];
+  __shared__ T smem[TBSIZE_DOT];
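+  // Static shared memory requires a compile-time size, so the dot kernel keeps
+  // the fixed TBSIZE_DOT block size rather than the occupancy-derived size used
+  // by for_each; the launch in dot() below must pass the same TBSIZE_DOT.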
"DEFAULT") +register_flag_optional(STRIDE "Kernel stride: GRID_STRIDE or BLOCK_STRIDE" "GRID_STRIDE") + +register_flag_optional(UNROLL_FACTOR "Kernel unroll factor:" "4") + + register_flag_required(CMAKE_CUDA_COMPILER "Path to the CUDA nvcc compiler") @@ -30,11 +35,17 @@ macro(setup) enable_language(CUDA) register_definitions(${MEM}) + register_definitions(${STRIDE}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + # add --extended-lambda for device-lambdas + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" + "--extended-lambda" "-DUNROLL_FACTOR=${UNROLL_FACTOR}" ${CUDA_EXTRA_FLAGS}) string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + # Link against the NVIDIA Management Library for device information + register_link_library("nvidia-ml") + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) diff --git a/src/main.cpp b/src/main.cpp index ee091259..c677f048 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -244,7 +244,13 @@ void run() } std::cout << " "; } - std::cout << num_times << " times" << std::endl; + std::cout << num_times << " times in "; + switch (order) { + case BenchOrder::Classic: std::cout << " Classic"; break; + case BenchOrder::Isolated: std::cout << " Isolated"; break; + default: std::cerr << "Error: Unknown order" << std::endl; abort(); + }; + std::cout << " order " << std::endl; std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl; @@ -366,7 +372,8 @@ void check_solution(const size_t num_times, // Error relative tolerance check size_t failed = 0; - T epsi = std::numeric_limits::epsilon() * T(100000.0); + T eps = std::numeric_limits::epsilon(); + T epsi = eps * T(100000.0); auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) { if (e > epsi || std::isnan(e) || std::isnan(is)) { ++failed; @@ -379,7 +386,7 @@ void check_solution(const size_t num_times, }; // Sum - T eS = std::fabs(sum - goldS) / std::fabs(goldS); + T eS = std::fabs(sum - goldS) / std::fabs(goldS + eps); for (size_t i = 0; i < num_benchmarks; ++i) { if (bench[i].id != BenchId::Dot) continue; if (run_benchmark(bench[i])) @@ -390,9 +397,9 @@ void check_solution(const size_t num_times, // Calculate the L^infty-norm relative error for (size_t i = 0; i < a.size(); ++i) { T vA = a[i], vB = b[i], vC = c[i]; - T eA = std::fabs(vA - goldA) / std::fabs(goldA); - T eB = std::fabs(vB - goldB) / std::fabs(goldB); - T eC = std::fabs(vC - goldC) / std::fabs(goldC); + T eA = std::fabs(vA - goldA) / std::fabs(goldA + eps); + T eB = std::fabs(vB - goldB) / std::fabs(goldB + eps); + T eC = std::fabs(vC - goldC) / std::fabs(goldC + eps); check("a", a[i], goldA, eA, i); check("b", b[i], goldB, eB, i); @@ -510,12 +517,12 @@ void parseArguments(int argc, char *argv[]) { if (++i >= argc) { - std::cerr << "Expected benchmark order after --order. Options: \"classic\" (default), \"isolated\"." + std::cerr << "Expected benchmark order after --order. Options: \"Classic\" (default), \"Isolated\"." 
+
 
 register_flag_required(CMAKE_CUDA_COMPILER
     "Path to the CUDA nvcc compiler")
@@ -30,11 +35,17 @@ macro(setup)
     enable_language(CUDA)
     register_definitions(${MEM})
+    register_definitions(${STRIDE})
 
     # add -forward-unknown-to-host-compiler for compatibility reasons
-    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
+    # add --extended-lambda for device lambdas
+    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}"
+        "--extended-lambda" "-DUNROLL_FACTOR=${UNROLL_FACTOR}" ${CUDA_EXTRA_FLAGS})
     string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
 
+    # Link against the NVIDIA Management Library for device information
+    register_link_library("nvidia-ml")
+
     # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
     # appended later
     wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
diff --git a/src/main.cpp b/src/main.cpp
index ee091259..c677f048 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -244,7 +244,13 @@ void run()
     }
     std::cout << " ";
   }
-  std::cout << num_times << " times" << std::endl;
+  std::cout << num_times << " times in ";
+  switch (order) {
+    case BenchOrder::Classic:  std::cout << "Classic";  break;
+    case BenchOrder::Isolated: std::cout << "Isolated"; break;
+    default: std::cerr << "Error: Unknown order" << std::endl; abort();
+  }
+  std::cout << " order" << std::endl;
   std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
   std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl;
@@ -366,7 +372,8 @@ void check_solution(const size_t num_times,
 
   // Error relative tolerance check
   size_t failed = 0;
-  T epsi = std::numeric_limits<T>::epsilon() * T(100000.0);
+  T eps = std::numeric_limits<T>::epsilon();
+  T epsi = eps * T(100000.0);
   auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) {
     if (e > epsi || std::isnan(e) || std::isnan(is)) {
       ++failed;
@@ -379,7 +386,7 @@ void check_solution(const size_t num_times,
   };
 
   // Sum
-  T eS = std::fabs(sum - goldS) / std::fabs(goldS);
+  T eS = std::fabs(sum - goldS) / std::fabs(goldS + eps);
   for (size_t i = 0; i < num_benchmarks; ++i) {
     if (bench[i].id != BenchId::Dot) continue;
     if (run_benchmark(bench[i]))
@@ -390,9 +397,9 @@ void check_solution(const size_t num_times,
   // Calculate the L^infty-norm relative error
   for (size_t i = 0; i < a.size(); ++i) {
     T vA = a[i], vB = b[i], vC = c[i];
-    T eA = std::fabs(vA - goldA) / std::fabs(goldA);
-    T eB = std::fabs(vB - goldB) / std::fabs(goldB);
-    T eC = std::fabs(vC - goldC) / std::fabs(goldC);
+    T eA = std::fabs(vA - goldA) / std::fabs(goldA + eps);
+    T eB = std::fabs(vB - goldB) / std::fabs(goldB + eps);
+    T eC = std::fabs(vC - goldC) / std::fabs(goldC + eps);
 
     check("a", a[i], goldA, eA, i);
     check("b", b[i], goldB, eB, i);
@@ -510,12 +517,12 @@ void parseArguments(int argc, char *argv[])
     {
       if (++i >= argc)
       {
-        std::cerr << "Expected benchmark order after --order. Options: \"classic\" (default), \"isolated\"."
+        std::cerr << "Expected benchmark order after --order. Options: \"Classic\" (default), \"Isolated\"."
                   << std::endl;
        exit(EXIT_FAILURE);
      }
      auto key = std::string(argv[i]);
-      if (key == "isolated")
+      if (key == "Isolated")
      {
        order = BenchOrder::Isolated;
      }
@@ -566,7 +573,7 @@ void parseArguments(int argc, char *argv[])
      std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
      std::cout << "  -o  --only     NAME      Only run one benchmark (see --print-names)" << std::endl;
      std::cout << "      --print-names        Prints all available benchmark names" << std::endl;
-      std::cout << "      --order              Benchmark run order: \"classic\" (default) or \"isolated\"." << std::endl;
+      std::cout << "      --order              Benchmark run order: \"Classic\" (default) or \"Isolated\"." << std::endl;
      std::cout << "      --csv                Output as csv table" << std::endl;
      std::cout << "      --megabytes          Use MB=10^6 for bandwidth calculation (default)" << std::endl;
      std::cout << "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl;