Skip to content

Commit 4657107

Browse files
committed
Sync GPU Optimization Guide exampls
1 parent d7e7c4e commit 4657107

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
//==============================================================
2+
// Copyright © 2022 Intel Corporation
3+
//
4+
// SPDX-License-Identifier: MIT
5+
// =============================================================
6+
#include <iostream>
7+
#include <random>
8+
#include <sycl/sycl.hpp>
9+
#include <vector>
10+
11+
int main() {
12+
constexpr int N = 4096 * 4096;
13+
14+
std::vector<unsigned long> input(N);
15+
srand(2009);
16+
for (int i = 0; i < N; ++i) {
17+
input[i] = (long)rand() % 256;
18+
input[i] |= ((long)rand() % 256) << 8;
19+
input[i] |= ((long)rand() % 256) << 16;
20+
input[i] |= ((long)rand() % 256) << 24;
21+
input[i] |= ((long)rand() % 256) << 32;
22+
input[i] |= ((long)rand() % 256) << 40;
23+
input[i] |= ((long)rand() % 256) << 48;
24+
input[i] |= ((long)rand() % 256) << 56;
25+
}
26+
27+
sycl::queue q{sycl::gpu_selector_v,
28+
sycl::property::queue::enable_profiling{}};
29+
std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>()
30+
<< "\n";
31+
32+
// Snippet begin
33+
constexpr int BLOCK_SIZE = 256;
34+
constexpr int NUM_BINS = 32;
35+
36+
std::vector<unsigned long> hist(NUM_BINS, 0);
37+
38+
sycl::buffer<unsigned long, 1> mbuf(input.data(), N);
39+
sycl::buffer<unsigned long, 1> hbuf(hist.data(), NUM_BINS);
40+
41+
auto e = q.submit([&](auto &h) {
42+
sycl::accessor macc(mbuf, h, sycl::read_only);
43+
auto hacc = hbuf.get_access<sycl::access::mode::atomic>(h);
44+
h.parallel_for(
45+
sycl::nd_range(sycl::range{N / BLOCK_SIZE}, sycl::range{64}),
46+
[=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(16)]] {
47+
int group = it.get_group()[0];
48+
int gSize = it.get_local_range()[0];
49+
auto sg = it.get_sub_group();
50+
int sgSize = sg.get_local_range()[0];
51+
int sgGroup = sg.get_group_id()[0];
52+
53+
unsigned int histogram[NUM_BINS]; // histogram bins take less storage
54+
// with smaller data type
55+
for (int k = 0; k < NUM_BINS; k++) {
56+
histogram[k] = 0;
57+
}
58+
for (int k = 0; k < BLOCK_SIZE; k++) {
59+
unsigned long x =
60+
sg.load(macc.get_pointer() + group * gSize * BLOCK_SIZE +
61+
sgGroup * sgSize * BLOCK_SIZE + sgSize * k);
62+
unsigned long *p = &x;
63+
#pragma unroll
64+
for (int i = 0; i < 8; i++) {
65+
unsigned int c = (*p & 0x1FU);
66+
histogram[c] += 1;
67+
*p = (*p >> 8);
68+
}
69+
}
70+
71+
for (int k = 0; k < NUM_BINS; k++) {
72+
hacc[k].fetch_add(histogram[k]);
73+
}
74+
});
75+
});
76+
// Snippet end
77+
q.wait();
78+
79+
size_t kernel_ns = (e.template get_profiling_info<
80+
sycl::info::event_profiling::command_end>() -
81+
e.template get_profiling_info<
82+
sycl::info::event_profiling::command_start>());
83+
std::cout << "Kernel Execution Time Average: total = " << kernel_ns * 1e-6
84+
<< " msec" << std::endl;
85+
86+
return 0;
87+
}

0 commit comments

Comments
 (0)