Commit d7e7c4e

Sync GPU Optimization Guide examples
1 parent fed066f commit d7e7c4e

87 files changed, +3348 -0 lines changed

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
add_example_with_mkl_mpi(dgemm 8192 8192 8192)
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
# OMP_AFFINITIZATION = 0 to ensure affinitization through MPI environment variables, and = 1 to use OpenMP to affinitize the MPI rank
OMP_AFFINITIZATION=0

CC=mpicxx
INCLUDE=-I$(MKLROOT)/include
LIB="$(MKLROOT)/lib"/libmkl_sycl.a -Wl,--start-group "$(MKLROOT)/lib"/libmkl_intel_lp64.a "$(MKLROOT)/lib"/libmkl_intel_thread.a "$(MKLROOT)/lib"/libmkl_core.a -Wl,--end-group -lsycl -lOpenCL -liomp5 -lpthread -ldl -lm -lstdc++
CFLAGS=-cxx=icpx -fiopenmp -fopenmp-targets=spir64 -fsycl -DOMP_AFFINITIZATION=$(OMP_AFFINITIZATION)
CFLAGS2=-cxx=icpx -fsycl-device-code-split=per_kernel -fiopenmp -fopenmp-targets=spir64 -fsycl

dgemm: dgemm.o Makefile
	$(CC) $(CFLAGS2) dgemm.o $(LIB) -o dgemm

dgemm.o: dgemm.cpp Makefile
	$(CC) $(CFLAGS) $(INCLUDE) -c dgemm.cpp -o dgemm.o

clean:
	rm -rf ./dgemm ./dgemm.o
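
The OMP_AFFINITIZATION switch in the Makefile above is compiled into the binary through -DOMP_AFFINITIZATION=$(OMP_AFFINITIZATION). A minimal build-and-run sketch, assuming a oneAPI environment with MKLROOT set and an Intel MPI launcher; the rank count and matrix sizes are illustrative only:

# Build with OpenMP-based device selection compiled in (assumption: oneAPI environment already sourced)
make OMP_AFFINITIZATION=1
# Run one MPI rank per GPU; sizes mirror the CMake example above
mpirun -n 2 ./dgemm 8192 8192 8192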
Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
#include "mkl.h"
#include "mkl_omp_offload.h"
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <mpi.h>
#include <omp.h>

#define FLOAT double
#define MPI_FLOAT_T MPI_DOUBLE
#define MKL_INT_T MKL_INT
#define index(i, j, ld) (((j) * (ld)) + (i))
#define RAND() ((FLOAT)rand() / (FLOAT)RAND_MAX * 2.0 - 1.0)
#define LD_ALIGN 256
#define LD_BIAS 8
#define HPL_PTR(ptr_, al_) ((((size_t)(ptr_) + (al_) - 1) / (al_)) * (al_))

// Pad the leading dimension up to LD_ALIGN, then bias it to avoid conflicts.
static inline MKL_INT_T getld(MKL_INT_T x) {
  MKL_INT_T ld;
  ld = HPL_PTR(x, LD_ALIGN);
  if (ld - LD_BIAS >= x)
    ld -= LD_BIAS;
  else
    ld += LD_BIAS;
  return ld;
}

int main(int argc, char **argv) {
  if ((argc < 4) || (argc > 4 && argc < 7)) {
    printf("Performs a DGEMM test C = alpha*A*B + beta*C\n");
    printf("A matrix is MxK and B matrix is KxN\n");
    printf("All matrices are stored in column-major format\n");
    printf("Run as ./dgemm <M> <K> <N> [<alpha> <beta> <iterations>]\n");
    printf("Required inputs are:\n");
    printf("  M: number of rows of matrix A\n");
    printf("  K: number of cols of matrix A\n");
    printf("  N: number of cols of matrix B\n");
    printf("Optional inputs are (all must be provided if providing any):\n");
    printf("  alpha: scalar multiplier (default: 1.0)\n");
    printf("  beta: scalar multiplier (default: 0.0)\n");
    printf("  iterations: number of blocking DGEMM calls to perform "
           "(default: 10)\n");
    return EXIT_FAILURE;
  }

  MKL_INT_T HA = (MKL_INT_T)(atoi(argv[1]));
  MKL_INT_T WA = (MKL_INT_T)(atoi(argv[2]));
  MKL_INT_T WB = (MKL_INT_T)(atoi(argv[3]));
  FLOAT alpha, beta;
  int niter;
  if (argc > 4) {
    sscanf(argv[4], "%lf", &alpha);
    sscanf(argv[5], "%lf", &beta);
    niter = atoi(argv[6]);
  } else {
    alpha = 1.0;
    beta = 0.0;
    niter = 10;
  }

  MKL_INT_T HB = WA;
  MKL_INT_T WC = WB;
  MKL_INT_T HC = HA;
  MKL_INT_T ldA = getld(HA);
  MKL_INT_T ldB = getld(HB);
  MKL_INT_T ldC = getld(HC);

  double tot_t = 0.0, best_t = std::numeric_limits<double>::max();

  FLOAT *A = new FLOAT[ldA * WA];
  FLOAT *B = nullptr, *C = nullptr, *local_B, *local_C;

  MPI_Init(&argc, &argv);
  int mpi_rank, mpi_size;
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

  // Rank 0 owns the full B and C matrices and initializes all inputs.
  if (mpi_rank == 0) {
    B = new FLOAT[ldB * WB];
    C = new FLOAT[ldC * WC];
    srand(2864);
    for (int j = 0; j < WA; j++)
      for (int i = 0; i < HA; i++)
        A[index(i, j, ldA)] = RAND();
    for (int j = 0; j < WB; j++)
      for (int i = 0; i < HB; i++)
        B[index(i, j, ldB)] = RAND();
    if (beta != 0.0) {
      for (int j = 0; j < WC; j++)
        for (int i = 0; i < HC; i++)
          C[index(i, j, ldC)] = RAND();
    } else {
      for (int j = 0; j < WC; j++)
        for (int i = 0; i < HC; i++)
          C[index(i, j, ldC)] = 0.0;
    }
  }

  // Split the columns of B and C across ranks; rank 0 also takes the remainder.
  size_t sizea = (size_t)ldA * WA;
  size_t local_sizeb, local_sizec;
  int *displacements_b = new int[mpi_size];
  int *send_counts_b = new int[mpi_size];
  int *displacements_c = new int[mpi_size];
  int *send_counts_c = new int[mpi_size];
  int local_WB = WB / mpi_size;
  send_counts_b[0] = ldB * (local_WB + WB % mpi_size);
  send_counts_c[0] = ldC * (local_WB + WB % mpi_size);
  displacements_b[0] = 0;
  displacements_c[0] = 0;
  for (int i = 1; i < mpi_size; i++) {
    send_counts_b[i] = ldB * local_WB;
    send_counts_c[i] = ldC * local_WB;
    displacements_b[i] = displacements_b[i - 1] + send_counts_b[i - 1];
    displacements_c[i] = displacements_c[i - 1] + send_counts_c[i - 1];
  }
  if (mpi_rank == 0) {
    local_WB += WB % mpi_size;
  }
  local_sizeb = ldB * local_WB;
  local_sizec = ldC * local_WB;
  local_B = new FLOAT[local_sizeb];
  local_C = new FLOAT[local_sizec];

  // Broadcast A to every rank and scatter the column blocks of B and C.
  MPI_Bcast(A, sizea, MPI_FLOAT_T, 0, MPI_COMM_WORLD);
  MPI_Scatterv(B, send_counts_b, displacements_b, MPI_FLOAT_T, local_B,
               local_sizeb, MPI_FLOAT_T, 0, MPI_COMM_WORLD);
  MPI_Scatterv(C, send_counts_c, displacements_c, MPI_FLOAT_T, local_C,
               local_sizec, MPI_FLOAT_T, 0, MPI_COMM_WORLD);

#if defined(OMP_AFFINITIZATION)
#if OMP_AFFINITIZATION == 1
  // Round-robin the MPI ranks over the available GPU devices.
  int ndev = omp_get_num_devices();
  int dnum = mpi_rank % ndev;
  omp_set_default_device(dnum);
#endif
#endif

#pragma omp target data map(to : A[0 : sizea], local_B[0 : local_sizeb])      \
    map(tofrom : local_C[0 : local_sizec])
  {
    // Warm-up call, excluded from the timing.
#pragma omp dispatch
    dgemm("N", "N", &HA, &local_WB, &WA, &alpha, A, &ldA, local_B, &ldB, &beta,
          local_C, &ldC);

    for (int i = 0; i < niter; i++) {
      auto start_t = std::chrono::high_resolution_clock::now();
#pragma omp dispatch
      dgemm("N", "N", &HA, &local_WB, &WA, &alpha, A, &ldA, local_B, &ldB,
            &beta, local_C, &ldC);
      MPI_Barrier(MPI_COMM_WORLD);
      auto end_t = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double> diff = end_t - start_t;
      tot_t += diff.count();
      best_t = std::min(best_t, diff.count());
    }
  }

  MPI_Gatherv(local_C, local_sizec, MPI_FLOAT_T, C, send_counts_c,
              displacements_c, MPI_FLOAT_T, 0, MPI_COMM_WORLD);

  delete[] local_B;
  delete[] local_C;
  delete[] displacements_b;
  delete[] displacements_c;
  delete[] send_counts_b;
  delete[] send_counts_c;

  // Report the slowest rank, since all ranks synchronize every iteration.
  MPI_Allreduce(MPI_IN_PLACE, &tot_t, 1, MPI_FLOAT_T, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &best_t, 1, MPI_FLOAT_T, MPI_MAX, MPI_COMM_WORLD);

  if (mpi_rank == 0) {
    double tflop_count = (double)2.0 * HA * WB * WA;
    if (beta != 0.0)
      tflop_count += (double)HA * WB;
    tflop_count *= 1.E-12;
    printf("Total runtime for %d iterations: %f seconds.\n", niter, tot_t);
    printf("Mean TFLOP/s: %f\n", (double)niter * tflop_count / tot_t);
    printf("Best TFLOP/s: %f\n", (double)tflop_count / best_t);
    delete[] B;
    delete[] C;
  }
  delete[] A;
  MPI_Finalize();
  return EXIT_SUCCESS;
}
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
#!/bin/bash

# Default to one compute command stream, one GPU, and one stack per GPU.
if [ -z "${NCCS}" ]; then
    NCCS=1
fi

if [ -z "${NGPUS}" ]; then
    NGPUS=1
fi

if [ -z "${NSTACKS}" ]; then
    NSTACKS=1
fi

# Pin each local MPI rank to one GPU stack (subdevice) via ZE_AFFINITY_MASK.
subdevices=$((NGPUS*NSTACKS))

export ZE_AFFINITY_MASK=$(((MPI_LOCALRANKID/NCCS)%subdevices))

echo MPI_LOCALRANKID = $MPI_LOCALRANKID ZE_AFFINITY_MASK = $ZE_AFFINITY_MASK
exec "$@"
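
The wrapper above derives ZE_AFFINITY_MASK from MPI_LOCALRANKID so that consecutive local ranks are pinned to different GPU stacks; it pairs with a binary built with OMP_AFFINITIZATION=0, which does not re-select a device itself. A usage sketch, assuming the script is saved as set_affinity.sh (hypothetical file name) and the NGPUS/NSTACKS/NCCS values match the actual node:

# Two GPUs, two stacks per GPU, one compute command stream per rank (illustrative values)
export NGPUS=2 NSTACKS=2 NCCS=1
mpirun -n 4 ./set_affinity.sh ./dgemm 8192 8192 8192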
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
add_openmp_example(histogram)
Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SIZE 10000000
#define NUM_BINS 2048
#define REAL float

void initialize(REAL *input, int size, int num_bins) {
  for (int i = 0; i < size; i++) {
    input[i] = rand() % num_bins;
  }
}

void validate(int *result_ref, int *result, int num_bins) {
  for (int i = 0; i < num_bins; i++)
    assert(result_ref[i] == result[i]);
}

int main(int argc, char **argv) {

  int size = SIZE;
  int num_bins = NUM_BINS;
  if (argc > 1)
    size = atoi(argv[1]);

  REAL *input = reinterpret_cast<REAL *>(malloc(size * sizeof(REAL)));
  int *result = reinterpret_cast<int *>(calloc(num_bins, sizeof(int)));

  initialize(input, size, num_bins);
  double total_time;

  // collect result on host for validation
  int *result_ref = (int *)calloc(num_bins, sizeof(int));
#pragma omp parallel for
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp atomic update
    result_ref[type]++;
  }

  total_time = omp_get_wtime();
  // critical begin
#pragma omp target teams distribute parallel for map(to : input[0 : size])    \
    map(tofrom : result[0 : num_bins]) num_teams(1)
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp critical
    result[type]++;
  }
  // critical end
  total_time = omp_get_wtime() - total_time;
  printf("Critical: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic relaxed begin
#pragma omp target teams distribute parallel for map(to : input[0 : size])    \
    map(tofrom : result[0 : num_bins])
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp atomic update
    result[type]++;
  }
  // atomic relaxed end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic relaxed: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic seq_cst begin
#pragma omp target teams distribute parallel for map(to : input[0 : size])    \
    map(tofrom : result[0 : num_bins])
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp atomic update seq_cst
    result[type]++;
  }
  // atomic seq_cst end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic seq_cst: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic relaxed using SLM begin
#pragma omp target teams map(to : input[0 : size])                            \
    map(tofrom : result[0 : num_bins])
  {
    // create a local histogram using SLM in the team
    int local_histogram[NUM_BINS] = {0};
    int num_local_histogram = omp_get_num_teams();
    int team_id = omp_get_team_num();
    int chunk_size = size / num_local_histogram;
    int leftover = size % num_local_histogram;
    int local_lb = team_id * chunk_size;
    int local_ub = (team_id + 1) * chunk_size;
    // Add the leftover to last chunk.
    // e.g. 18 iterations and 4 teams -> 4, 4, 4, 6 = 4(last chunk) +
    // 2(leftover)
    if (local_ub + chunk_size > size)
      local_ub += leftover;
    if (local_ub <= size) {
#pragma omp parallel for shared(local_histogram)
      for (int i = local_lb; i < local_ub; i++) {
        int type = input[i];
#pragma omp atomic update
        local_histogram[type]++;
      }

      // Combine local histograms
#pragma omp parallel for
      for (int i = 0; i < num_bins; i++) {
#pragma omp atomic update
        result[i] += local_histogram[i];
      }
    }
  }
  // atomic relaxed using SLM end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic relaxed with SLM: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic seq_cst using SLM begin
#pragma omp target map(to : input[0 : size]) map(tofrom : result[0 : num_bins])
#pragma omp teams
  {
    // create a local histogram using SLM in the team
    int local_histogram[NUM_BINS] = {0};
    int num_local_histogram = omp_get_num_teams();
    int team_id = omp_get_team_num();
    int chunk_size = size / num_local_histogram;
    int leftover = size % num_local_histogram;
    int local_lb = team_id * chunk_size;
    int local_ub = (team_id + 1) * chunk_size;
    // Add the leftover to last chunk.
    // e.g. 18 iterations and 4 teams -> 4, 4, 4, 6 = 4(last chunk) +
    // 2(leftover)
    if (local_ub + chunk_size > size)
      local_ub += leftover;
    if (local_ub <= size) {
#pragma omp parallel for shared(local_histogram)
      for (int i = local_lb; i < local_ub; i++) {
        int type = input[i];
#pragma omp atomic update seq_cst
        local_histogram[type]++;
      }

      // Combine local histograms
#pragma omp parallel for
      for (int i = 0; i < num_bins; i++) {
#pragma omp atomic update seq_cst
        result[i] += local_histogram[i];
      }
    }
  }
  // atomic seq_cst using SLM end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic seq_cst with SLM: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  free(input);
  free(result);
  free(result_ref);

  return 0;
}
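
A short run sketch for the histogram example, assuming the add_openmp_example() call above produces a binary named histogram; the optional argument overrides the default input size of 10,000,000 elements, and each variant (critical, atomic relaxed, atomic seq_cst, and the two SLM versions) prints its own timing:

# Run on the default GPU with 20 million input elements (argument is optional)
./histogram 20000000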
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
add_fortran_example(do_concurrent)
add_fortran_example(hybrid_do_concurrent)
add_fortran_example(omp6_do_concurrent)
