Skip to content

Commit da5a682

Browse files
authored
Merge pull request oneapi-src#2539 from krisrak/20241004-pvc
updated gpu-opt training FLAT/COMPOSITE mode
2 parents 02070b5 + acf2533 commit da5a682

14 files changed

+760
-467
lines changed

DirectProgramming/C++SYCL/Jupyter/gpu-optimization-sycl-training/15_Implicit_Explicit_Scaling/15_Implicit_Explicit_Scaling.ipynb

Lines changed: 214 additions & 209 deletions
Large diffs are not rendered by default.

DirectProgramming/C++SYCL/Jupyter/gpu-optimization-sycl-training/15_Implicit_Explicit_Scaling/lab/vectoradd_explicit_scaling.cpp

Lines changed: 0 additions & 127 deletions
This file was deleted.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,90 @@
1+
//==============================================================
2+
// Copyright © Intel Corporation
3+
//
4+
// SPDX-License-Identifier: MIT
5+
// =============================================================
6+
#include <sycl/sycl.hpp>
7+
8+
// Submit an asynchronous element-wise vector addition (c = a + b) of n
// floats to the given queue. All three pointers must be device-accessible
// (USM) on q's device; the caller must wait on the queue for completion.
void kernel_compute_vadd(sycl::queue &q, float *a, float *b, float *c, size_t n) {
  auto vadd = [=](auto idx) { c[idx] = a[idx] + b[idx]; };
  q.parallel_for(n, vadd);
}
13+
14+
int main() {
15+
const int N = 1680;
16+
17+
// Define 3 arrays
18+
float *a = static_cast<float *>(malloc(N * sizeof(float)));
19+
float *b = static_cast<float *>(malloc(N * sizeof(float)));
20+
float *c = static_cast<float *>(malloc(N * sizeof(float)));
21+
22+
// Initialize matrices with values
23+
for (int i = 0; i < N; i++){
24+
a[i] = 1;
25+
b[i] = 2;
26+
c[i] = 0;
27+
}
28+
29+
// get all GPUs devices into a vector
30+
auto gpus = sycl::platform(sycl::gpu_selector_v).get_devices();
31+
int num_devices = gpus.size();
32+
33+
// Create sycl::queue for each gpu
34+
std::vector<sycl::queue> q(num_devices);
35+
for(int i = 0; i < num_devices; i++){
36+
std::cout << "Device: " << gpus[i].get_info<sycl::info::device::name>() << "\n";
37+
std::cout << "-EUs : " << gpus[i].get_info<sycl::info::device::max_compute_units>() << "\n";
38+
q.push_back(sycl::queue(gpus[i]));
39+
}
40+
41+
// device mem alloc for vectors a,b,c for each device
42+
float *da[num_devices];
43+
float *db[num_devices];
44+
float *dc[num_devices];
45+
for (int i = 0; i < num_devices; i++) {
46+
da[i] = sycl::malloc_device<float>(N/num_devices, q[i]);
47+
db[i] = sycl::malloc_device<float>(N/num_devices, q[i]);
48+
dc[i] = sycl::malloc_device<float>(N/num_devices, q[i]);
49+
}
50+
51+
// memcpy for matrix and b to device alloc
52+
for (int i = 0; i < num_devices; i++) {
53+
q[i].memcpy(&da[i][0], &a[i*N/num_devices], N/num_devices * sizeof(float));
54+
q[i].memcpy(&db[i][0], &b[i*N/num_devices], N/num_devices * sizeof(float));
55+
}
56+
57+
// wait for copy to complete
58+
for (int i = 0; i < num_devices; i++)
59+
q[i].wait();
60+
61+
// submit vector-add kernels to all devices
62+
for (int i = 0; i < num_devices; i++)
63+
kernel_compute_vadd(q[i], da[i], db[i], dc[i], N/num_devices);
64+
65+
// wait for compute complete
66+
for (int i = 0; i < num_devices; i++)
67+
q[i].wait();
68+
69+
// copy back result to host
70+
for (int i = 0; i < num_devices; i++)
71+
q[i].memcpy(&c[i*N/num_devices], &dc[i][0], N/num_devices * sizeof(float));
72+
73+
// wait for copy to complete
74+
for (int i = 0; i < num_devices; i++)
75+
q[i].wait();
76+
77+
// print output
78+
for (int i = 0; i < N; i++) std::cout << c[i] << " ";
79+
std::cout << "\n";
80+
81+
free(a);
82+
free(b);
83+
free(c);
84+
for (int i = 0; i < num_devices; i++) {
85+
sycl::free(da[i], q[i]);
86+
sycl::free(db[i], q[i]);
87+
sycl::free(dc[i], q[i]);
88+
}
89+
return 0;
90+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
//==============================================================
2+
// Copyright © Intel Corporation
3+
//
4+
// SPDX-License-Identifier: MIT
5+
// =============================================================
6+
#include <sycl/sycl.hpp>
7+
8+
// Enqueue a parallel vector addition c[i] = a[i] + b[i] over n elements.
// The launch is asynchronous; callers must wait on q before reading c.
// Pointers must be USM allocations usable on q's device.
void kernel_compute_vadd(sycl::queue &q, float *a, float *b, float *c, size_t n) {
  q.parallel_for(sycl::range<1>{n},
                 [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
}
13+
14+
int main() {
15+
const int N = 1680;
16+
17+
// Define 3 arrays
18+
float *a = static_cast<float *>(malloc(N * sizeof(float)));
19+
float *b = static_cast<float *>(malloc(N * sizeof(float)));
20+
float *c = static_cast<float *>(malloc(N * sizeof(float)));
21+
22+
// Initialize matrices with values
23+
for (int i = 0; i < N; i++){
24+
a[i] = 1;
25+
b[i] = 2;
26+
c[i] = 0;
27+
}
28+
29+
// get all GPUs devices into a vector
30+
sycl::queue q;
31+
std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>() << "\n";
32+
std::cout << "-EUs : " << q.get_device().get_info<sycl::info::device::max_compute_units>() << "\n";
33+
34+
// device mem alloc for vectors a,b,c for device
35+
auto da = sycl::malloc_device<float>(N, q);
36+
auto db = sycl::malloc_device<float>(N, q);
37+
auto dc = sycl::malloc_device<float>(N, q);
38+
39+
// memcpy for matrix and b to device alloc
40+
q.memcpy(da, a, N * sizeof(float));
41+
q.memcpy(db, b, N * sizeof(float));
42+
q.wait();
43+
44+
kernel_compute_vadd(q, da, db, dc, N);
45+
q.wait();
46+
47+
// copy back result to host
48+
q.memcpy(c, dc, N * sizeof(float));
49+
q.wait();
50+
51+
// print output
52+
for (int i = 0; i < N; i++) std::cout << c[i] << " ";
53+
std::cout << "\n";
54+
55+
free(a);
56+
free(b);
57+
free(c);
58+
sycl::free(da, q);
59+
sycl::free(db, q);
60+
sycl::free(dc, q);
61+
return 0;
62+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
//==============================================================
2+
// Copyright © Intel Corporation
3+
//
4+
// SPDX-License-Identifier: MIT
5+
// =============================================================
6+
#include <sycl/sycl.hpp>
7+
8+
// Queue up an asynchronous c = a + b over n float elements on q.
// The explicit capture list makes the device pointers used by the
// kernel visible at a glance; caller waits on q for completion.
void kernel_compute_vadd(sycl::queue &q, float *a, float *b, float *c, size_t n) {
  q.parallel_for(n, [a, b, c](auto i) { c[i] = a[i] + b[i]; });
}
13+
14+
int main() {
15+
const int N = 1680;
16+
17+
// Define 3 arrays
18+
float *a = static_cast<float *>(malloc(N * sizeof(float)));
19+
float *b = static_cast<float *>(malloc(N * sizeof(float)));
20+
float *c = static_cast<float *>(malloc(N * sizeof(float)));
21+
22+
// Initialize matrices with values
23+
for (int i = 0; i < N; i++){
24+
a[i] = 1;
25+
b[i] = 2;
26+
c[i] = 0;
27+
}
28+
29+
sycl::queue q_root;
30+
sycl::device RootDevice = q_root.get_device();
31+
std::cout << "Device: " << RootDevice.get_info<sycl::info::device::name>() << "\n";
32+
std::cout << "-EUs : " << RootDevice.get_info<sycl::info::device::max_compute_units>() << "\n\n";
33+
34+
//# Check if GPU can be partitioned (stacks/Stack)
35+
std::vector<sycl::queue> q;
36+
auto num_devices = RootDevice.get_info<sycl::info::device::partition_max_sub_devices>();
37+
if(num_devices > 0){
38+
std::cout << "-partition_max_sub_devices: " << num_devices << "\n\n";
39+
std::vector<sycl::device> SubDevices = RootDevice.create_sub_devices<
40+
sycl::info::partition_property::partition_by_affinity_domain>(
41+
sycl::info::partition_affinity_domain::numa);
42+
for (auto &SubDevice : SubDevices) {
43+
q.push_back(sycl::queue(SubDevice));
44+
std::cout << "Sub-Device: " << SubDevice.get_info<sycl::info::device::name>() << "\n";
45+
std::cout << "-EUs : " << SubDevice.get_info<sycl::info::device::max_compute_units>() << "\n";
46+
}
47+
} else {
48+
std::cout << "-cannot partition to sub-device, running on root-device " << "\n\n";
49+
num_devices = 1;
50+
q.push_back(q_root);
51+
}
52+
53+
54+
// device mem alloc for vectors a,b,c for each device
55+
float *da[num_devices];
56+
float *db[num_devices];
57+
float *dc[num_devices];
58+
for (int i = 0; i < num_devices; i++) {
59+
da[i] = sycl::malloc_device<float>(N/num_devices, q[i]);
60+
db[i] = sycl::malloc_device<float>(N/num_devices, q[i]);
61+
dc[i] = sycl::malloc_device<float>(N/num_devices, q[i]);
62+
}
63+
64+
// memcpy for matrix and b to device alloc
65+
for (int i = 0; i < num_devices; i++) {
66+
q[i].memcpy(&da[i][0], &a[i*N/num_devices], N/num_devices * sizeof(float));
67+
q[i].memcpy(&db[i][0], &b[i*N/num_devices], N/num_devices * sizeof(float));
68+
}
69+
70+
// wait for copy to complete
71+
for (int i = 0; i < num_devices; i++)
72+
q[i].wait();
73+
74+
// submit vector-add kernels to all devices
75+
for (int i = 0; i < num_devices; i++)
76+
kernel_compute_vadd(q[i], da[i], db[i], dc[i], N/num_devices);
77+
78+
// wait for compute complete
79+
for (int i = 0; i < num_devices; i++)
80+
q[i].wait();
81+
82+
// copy back result to host
83+
for (int i = 0; i < num_devices; i++)
84+
q[i].memcpy(&c[i*N/num_devices], &dc[i][0], N/num_devices * sizeof(float));
85+
86+
// wait for copy to complete
87+
for (int i = 0; i < num_devices; i++)
88+
q[i].wait();
89+
90+
// print output
91+
for (int i = 0; i < N; i++) std::cout << c[i] << " ";
92+
std::cout << "\n";
93+
94+
free(a);
95+
free(b);
96+
free(c);
97+
for (int i = 0; i < num_devices; i++) {
98+
sycl::free(da[i], q[i]);
99+
sycl::free(db[i], q[i]);
100+
sycl::free(dc[i], q[i]);
101+
}
102+
return 0;
103+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
# Build and run the sub-device vector-add sample in COMPOSITE device
# hierarchy mode (each GPU stack is exposed via sub-device partitioning).
source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1

# Select COMPOSITE hierarchy and show the resulting device list.
echo "export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE"
echo
export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE
sycl-ls
echo

# Compile the sample; run it only if compilation succeeded.
icpx -fsycl lab/vectoradd_sub_device.cpp -w
if [ $? -eq 0 ]; then
  ./a.out
fi

0 commit comments

Comments
 (0)