|
| 1 | +//============================================================== |
| 2 | +// Copyright © 2022 Intel Corporation |
| 3 | +// |
| 4 | +// SPDX-License-Identifier: MIT |
| 5 | +// ============================================================= |
| 6 | +#include <iostream> |
| 7 | +#include <random> |
| 8 | +#include <sycl/sycl.hpp> |
| 9 | +#include <vector> |
| 10 | + |
| 11 | +int main() { |
| 12 | + constexpr int N = 4096 * 4096; |
| 13 | + |
| 14 | + std::vector<unsigned long> input(N); |
| 15 | + srand(2009); |
| 16 | + for (int i = 0; i < N; ++i) { |
| 17 | + input[i] = (long)rand() % 256; |
| 18 | + input[i] |= ((long)rand() % 256) << 8; |
| 19 | + input[i] |= ((long)rand() % 256) << 16; |
| 20 | + input[i] |= ((long)rand() % 256) << 24; |
| 21 | + input[i] |= ((long)rand() % 256) << 32; |
| 22 | + input[i] |= ((long)rand() % 256) << 40; |
| 23 | + input[i] |= ((long)rand() % 256) << 48; |
| 24 | + input[i] |= ((long)rand() % 256) << 56; |
| 25 | + } |
| 26 | + |
| 27 | + sycl::queue q{sycl::gpu_selector_v, |
| 28 | + sycl::property::queue::enable_profiling{}}; |
| 29 | + std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>() |
| 30 | + << "\n"; |
| 31 | + |
| 32 | + // Snippet begin |
| 33 | + constexpr int BLOCK_SIZE = 256; |
| 34 | + constexpr int NUM_BINS = 32; |
| 35 | + |
| 36 | + std::vector<unsigned long> hist(NUM_BINS, 0); |
| 37 | + |
| 38 | + sycl::buffer<unsigned long, 1> mbuf(input.data(), N); |
| 39 | + sycl::buffer<unsigned long, 1> hbuf(hist.data(), NUM_BINS); |
| 40 | + |
| 41 | + auto e = q.submit([&](auto &h) { |
| 42 | + sycl::accessor macc(mbuf, h, sycl::read_only); |
| 43 | + auto hacc = hbuf.get_access<sycl::access::mode::atomic>(h); |
| 44 | + h.parallel_for( |
| 45 | + sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}), |
| 46 | + [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] { |
| 47 | + int group = it.get_group()[0]; |
| 48 | + int gSize = it.get_local_range()[0]; |
| 49 | + auto sg = it.get_sub_group(); |
| 50 | + int sgSize = sg.get_local_range()[0]; |
| 51 | + int sgGroup = sg.get_group_id()[0]; |
| 52 | + |
| 53 | + unsigned int histogram[NUM_BINS]; // histogram bins take less storage |
| 54 | + // with smaller data type |
| 55 | + for (int k = 0; k < NUM_BINS; k++) { |
| 56 | + histogram[k] = 0; |
| 57 | + } |
| 58 | + for (int k = 0; k < BLOCK_SIZE; k++) { |
| 59 | + unsigned long x = |
| 60 | + sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE + |
| 61 | + sgGroup * sgSize * BLOCK_SIZE + sgSize * k); |
| 62 | + unsigned long *p = &x; |
| 63 | +#pragma unroll |
| 64 | + for (int i = 0; i < 8; i++) { |
| 65 | + unsigned int c = (*p & 0x1FU); |
| 66 | + histogram[c] += 1; |
| 67 | + *p = (*p >> 8); |
| 68 | + } |
| 69 | + } |
| 70 | + |
| 71 | + for (int k = 0; k < NUM_BINS; k++) { |
| 72 | + hacc[k].fetch_add(histogram[k]); |
| 73 | + } |
| 74 | + }); |
| 75 | + }); |
| 76 | + // Snippet end |
| 77 | + q.wait(); |
| 78 | + |
| 79 | + size_t kernel_ns = (e.template get_profiling_info< |
| 80 | + sycl::info::event_profiling::command_end>() - |
| 81 | + e.template get_profiling_info< |
| 82 | + sycl::info::event_profiling::command_start>()); |
| 83 | + std::cout << "Kernel Execution Time Average: total = " << kernel_ns * 1e-6 |
| 84 | + << " msec" << std::endl; |
| 85 | + |
| 86 | + return 0; |
| 87 | +} |
0 commit comments