Commit d7e7c4e

Sync GPU Optimization Guide examples
1 parent fed066f commit d7e7c4e

87 files changed, +3348 -0 lines changed

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
add_example_with_mkl_mpi(dgemm 8192 8192 8192)
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
# OMP_AFFINITIZATION = 0 to ensure affinitization through MPI environment variables, and = 1 to use OpenMP to affinitize the MPI rank
OMP_AFFINITIZATION=0

CC=mpicxx
INCLUDE=-I$(MKLROOT)/include
LIB="$(MKLROOT)/lib"/libmkl_sycl.a -Wl,--start-group "$(MKLROOT)/lib"/libmkl_intel_lp64.a "$(MKLROOT)/lib"/libmkl_intel_thread.a "$(MKLROOT)/lib"/libmkl_core.a -Wl,--end-group -lsycl -lOpenCL -liomp5 -lpthread -ldl -lm -lstdc++
CFLAGS=-cxx=icpx -fiopenmp -fopenmp-targets=spir64 -fsycl -DOMP_AFFINITIZATION=$(OMP_AFFINITIZATION)
CFLAGS2=-cxx=icpx -fsycl-device-code-split=per_kernel -fiopenmp -fopenmp-targets=spir64 -fsycl

dgemm: dgemm.o Makefile
	$(CC) $(CFLAGS2) dgemm.o $(LIB) -o dgemm

dgemm.o: dgemm.cpp Makefile
	$(CC) $(CFLAGS) $(INCLUDE) -c dgemm.cpp -o dgemm.o

clean:
	rm -rf ./dgemm ./dgemm.o
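
The OMP_AFFINITIZATION switch in the Makefile above is compiled into the binary through -DOMP_AFFINITIZATION=$(OMP_AFFINITIZATION). A minimal build-and-run sketch, assuming a oneAPI environment with MKLROOT set and an Intel MPI launcher; the rank count and matrix sizes are illustrative only:

# Build with OpenMP-based device selection compiled in (assumption: oneAPI environment already sourced)
make OMP_AFFINITIZATION=1
# Run one MPI rank per GPU; sizes mirror the CMake example above
mpirun -n 2 ./dgemm 8192 8192 8192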
Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
#include "mkl.h"
#include "mkl_omp_offload.h"
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <mpi.h>
#include <omp.h>

#define FLOAT double
#define MPI_FLOAT_T MPI_DOUBLE
#define MKL_INT_T MKL_INT
#define index(i, j, ld) (((j) * (ld)) + (i))
#define RAND() ((FLOAT)rand() / (FLOAT)RAND_MAX * 2.0 - 1.0)
#define LD_ALIGN 256
#define LD_BIAS 8
#define HPL_PTR(ptr_, al_) ((((size_t)(ptr_) + (al_) - 1) / (al_)) * (al_))

// Pad the leading dimension up to LD_ALIGN, then bias it to avoid conflicts.
static inline MKL_INT_T getld(MKL_INT_T x) {
  MKL_INT_T ld;
  ld = HPL_PTR(x, LD_ALIGN);
  if (ld - LD_BIAS >= x)
    ld -= LD_BIAS;
  else
    ld += LD_BIAS;
  return ld;
}

int main(int argc, char **argv) {
  if ((argc < 4) || (argc > 4 && argc < 7)) {
    printf("Performs a DGEMM test C = alpha*A*B + beta*C\n");
    printf("A matrix is MxK and B matrix is KxN\n");
    printf("All matrices are stored in column-major format\n");
    printf("Run as ./dgemm <M> <K> <N> [<alpha> <beta> <iterations>]\n");
    printf("Required inputs are:\n");
    printf("  M: number of rows of matrix A\n");
    printf("  K: number of cols of matrix A\n");
    printf("  N: number of cols of matrix B\n");
    printf("Optional inputs are (all must be provided if providing any):\n");
    printf("  alpha: scalar multiplier (default: 1.0)\n");
    printf("  beta: scalar multiplier (default: 0.0)\n");
    printf("  iterations: number of blocking DGEMM calls to perform "
           "(default: 10)\n");
    return EXIT_FAILURE;
  }

  MKL_INT_T HA = (MKL_INT_T)(atoi(argv[1]));
  MKL_INT_T WA = (MKL_INT_T)(atoi(argv[2]));
  MKL_INT_T WB = (MKL_INT_T)(atoi(argv[3]));
  FLOAT alpha, beta;
  int niter;
  if (argc > 4) {
    sscanf(argv[4], "%lf", &alpha);
    sscanf(argv[5], "%lf", &beta);
    niter = atoi(argv[6]);
  } else {
    alpha = 1.0;
    beta = 0.0;
    niter = 10;
  }

  MKL_INT_T HB = WA;
  MKL_INT_T WC = WB;
  MKL_INT_T HC = HA;
  MKL_INT_T ldA = getld(HA);
  MKL_INT_T ldB = getld(HB);
  MKL_INT_T ldC = getld(HC);

  double tot_t = 0.0, best_t = std::numeric_limits<double>::max();

  FLOAT *A = new FLOAT[ldA * WA];
  FLOAT *B = nullptr, *C = nullptr, *local_B, *local_C;

  MPI_Init(&argc, &argv);
  int mpi_rank, mpi_size;
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

  // Rank 0 owns the full B and C matrices and initializes all inputs.
  if (mpi_rank == 0) {
    B = new FLOAT[ldB * WB];
    C = new FLOAT[ldC * WC];
    srand(2864);
    for (int j = 0; j < WA; j++)
      for (int i = 0; i < HA; i++)
        A[index(i, j, ldA)] = RAND();
    for (int j = 0; j < WB; j++)
      for (int i = 0; i < HB; i++)
        B[index(i, j, ldB)] = RAND();
    if (beta != 0.0) {
      for (int j = 0; j < WC; j++)
        for (int i = 0; i < HC; i++)
          C[index(i, j, ldC)] = RAND();
    } else {
      for (int j = 0; j < WC; j++)
        for (int i = 0; i < HC; i++)
          C[index(i, j, ldC)] = 0.0;
    }
  }

  // Split the columns of B and C across ranks; rank 0 also takes the remainder.
  size_t sizea = (size_t)ldA * WA;
  size_t local_sizeb, local_sizec;
  int *displacements_b = new int[mpi_size];
  int *send_counts_b = new int[mpi_size];
  int *displacements_c = new int[mpi_size];
  int *send_counts_c = new int[mpi_size];
  int local_WB = WB / mpi_size;
  send_counts_b[0] = ldB * (local_WB + WB % mpi_size);
  send_counts_c[0] = ldC * (local_WB + WB % mpi_size);
  displacements_b[0] = 0;
  displacements_c[0] = 0;
  for (int i = 1; i < mpi_size; i++) {
    send_counts_b[i] = ldB * local_WB;
    send_counts_c[i] = ldC * local_WB;
    displacements_b[i] = displacements_b[i - 1] + send_counts_b[i - 1];
    displacements_c[i] = displacements_c[i - 1] + send_counts_c[i - 1];
  }
  if (mpi_rank == 0) {
    local_WB += WB % mpi_size;
  }
  local_sizeb = ldB * local_WB;
  local_sizec = ldC * local_WB;
  local_B = new FLOAT[local_sizeb];
  local_C = new FLOAT[local_sizec];

  // Broadcast A to every rank and scatter the column blocks of B and C.
  MPI_Bcast(A, sizea, MPI_FLOAT_T, 0, MPI_COMM_WORLD);
  MPI_Scatterv(B, send_counts_b, displacements_b, MPI_FLOAT_T, local_B,
               local_sizeb, MPI_FLOAT_T, 0, MPI_COMM_WORLD);
  MPI_Scatterv(C, send_counts_c, displacements_c, MPI_FLOAT_T, local_C,
               local_sizec, MPI_FLOAT_T, 0, MPI_COMM_WORLD);

#if defined(OMP_AFFINITIZATION)
#if OMP_AFFINITIZATION == 1
  // Round-robin the MPI ranks over the available GPU devices.
  int ndev = omp_get_num_devices();
  int dnum = mpi_rank % ndev;
  omp_set_default_device(dnum);
#endif
#endif

#pragma omp target data map(to : A[0 : sizea], local_B[0 : local_sizeb])      \
    map(tofrom : local_C[0 : local_sizec])
  {
    // Warm-up call, excluded from the timing.
#pragma omp dispatch
    dgemm("N", "N", &HA, &local_WB, &WA, &alpha, A, &ldA, local_B, &ldB, &beta,
          local_C, &ldC);

    for (int i = 0; i < niter; i++) {
      auto start_t = std::chrono::high_resolution_clock::now();
#pragma omp dispatch
      dgemm("N", "N", &HA, &local_WB, &WA, &alpha, A, &ldA, local_B, &ldB,
            &beta, local_C, &ldC);
      MPI_Barrier(MPI_COMM_WORLD);
      auto end_t = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double> diff = end_t - start_t;
      tot_t += diff.count();
      best_t = std::min(best_t, diff.count());
    }
  }

  MPI_Gatherv(local_C, local_sizec, MPI_FLOAT_T, C, send_counts_c,
              displacements_c, MPI_FLOAT_T, 0, MPI_COMM_WORLD);

  delete[] local_B;
  delete[] local_C;
  delete[] displacements_b;
  delete[] displacements_c;
  delete[] send_counts_b;
  delete[] send_counts_c;

  // Report the slowest rank, since all ranks synchronize every iteration.
  MPI_Allreduce(MPI_IN_PLACE, &tot_t, 1, MPI_FLOAT_T, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &best_t, 1, MPI_FLOAT_T, MPI_MAX, MPI_COMM_WORLD);

  if (mpi_rank == 0) {
    double tflop_count = (double)2.0 * HA * WB * WA;
    if (beta != 0.0)
      tflop_count += (double)HA * WB;
    tflop_count *= 1.E-12;
    printf("Total runtime for %d iterations: %f seconds.\n", niter, tot_t);
    printf("Mean TFLOP/s: %f\n", (double)niter * tflop_count / tot_t);
    printf("Best TFLOP/s: %f\n", (double)tflop_count / best_t);
    delete[] B;
    delete[] C;
  }
  delete[] A;
  MPI_Finalize();
  return EXIT_SUCCESS;
}
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
#!/bin/bash

# Default to one compute command stream, one GPU, and one stack per GPU.
if [ -z "${NCCS}" ]; then
    NCCS=1
fi

if [ -z "${NGPUS}" ]; then
    NGPUS=1
fi

if [ -z "${NSTACKS}" ]; then
    NSTACKS=1
fi

# Pin each local MPI rank to one GPU stack (subdevice) via ZE_AFFINITY_MASK.
subdevices=$((NGPUS*NSTACKS))

export ZE_AFFINITY_MASK=$(((MPI_LOCALRANKID/NCCS)%subdevices))

echo MPI_LOCALRANKID = $MPI_LOCALRANKID ZE_AFFINITY_MASK = $ZE_AFFINITY_MASK
exec "$@"
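
The wrapper above derives ZE_AFFINITY_MASK from MPI_LOCALRANKID so that consecutive local ranks are pinned to different GPU stacks; it pairs with a binary built with OMP_AFFINITIZATION=0, which does not re-select a device itself. A usage sketch, assuming the script is saved as set_affinity.sh (hypothetical file name) and the NGPUS/NSTACKS/NCCS values match the actual node:

# Two GPUs, two stacks per GPU, one compute command stream per rank (illustrative values)
export NGPUS=2 NSTACKS=2 NCCS=1
mpirun -n 4 ./set_affinity.sh ./dgemm 8192 8192 8192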
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
add_openmp_example(histogram)
Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SIZE 10000000
#define NUM_BINS 2048
#define REAL float

void initialize(REAL *input, int size, int num_bins) {
  for (int i = 0; i < size; i++) {
    input[i] = rand() % num_bins;
  }
}

void validate(int *result_ref, int *result, int num_bins) {
  for (int i = 0; i < num_bins; i++)
    assert(result_ref[i] == result[i]);
}

int main(int argc, char **argv) {

  int size = SIZE;
  int num_bins = NUM_BINS;
  if (argc > 1)
    size = atoi(argv[1]);

  REAL *input = reinterpret_cast<REAL *>(malloc(size * sizeof(REAL)));
  int *result = reinterpret_cast<int *>(calloc(num_bins, sizeof(int)));

  initialize(input, size, num_bins);
  double total_time;

  // collect result on host for validation
  int *result_ref = (int *)calloc(num_bins, sizeof(int));
#pragma omp parallel for
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp atomic update
    result_ref[type]++;
  }

  total_time = omp_get_wtime();
  // critical begin
#pragma omp target teams distribute parallel for map(to : input[0 : size])    \
    map(tofrom : result[0 : num_bins]) num_teams(1)
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp critical
    result[type]++;
  }
  // critical end
  total_time = omp_get_wtime() - total_time;
  printf("Critical: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic relaxed begin
#pragma omp target teams distribute parallel for map(to : input[0 : size])    \
    map(tofrom : result[0 : num_bins])
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp atomic update
    result[type]++;
  }
  // atomic relaxed end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic relaxed: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic seq_cst begin
#pragma omp target teams distribute parallel for map(to : input[0 : size])    \
    map(tofrom : result[0 : num_bins])
  for (int i = 0; i < size; i++) {
    int type = input[i];
#pragma omp atomic update seq_cst
    result[type]++;
  }
  // atomic seq_cst end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic seq_cst: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic relaxed using SLM begin
#pragma omp target teams map(to : input[0 : size])                            \
    map(tofrom : result[0 : num_bins])
  {
    // create a local histogram using SLM in the team
    int local_histogram[NUM_BINS] = {0};
    int num_local_histogram = omp_get_num_teams();
    int team_id = omp_get_team_num();
    int chunk_size = size / num_local_histogram;
    int leftover = size % num_local_histogram;
    int local_lb = team_id * chunk_size;
    int local_ub = (team_id + 1) * chunk_size;
    // Add the leftover to last chunk.
    // e.g. 18 iterations and 4 teams -> 4, 4, 4, 6 = 4(last chunk) +
    // 2(leftover)
    if (local_ub + chunk_size > size)
      local_ub += leftover;
    if (local_ub <= size) {
#pragma omp parallel for shared(local_histogram)
      for (int i = local_lb; i < local_ub; i++) {
        int type = input[i];
#pragma omp atomic update
        local_histogram[type]++;
      }

      // Combine local histograms
#pragma omp parallel for
      for (int i = 0; i < num_bins; i++) {
#pragma omp atomic update
        result[i] += local_histogram[i];
      }
    }
  }
  // atomic relaxed using SLM end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic relaxed with SLM: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  total_time = omp_get_wtime();
  // atomic seq_cst using SLM begin
#pragma omp target map(to : input[0 : size]) map(tofrom : result[0 : num_bins])
#pragma omp teams
  {
    // create a local histogram using SLM in the team
    int local_histogram[NUM_BINS] = {0};
    int num_local_histogram = omp_get_num_teams();
    int team_id = omp_get_team_num();
    int chunk_size = size / num_local_histogram;
    int leftover = size % num_local_histogram;
    int local_lb = team_id * chunk_size;
    int local_ub = (team_id + 1) * chunk_size;
    // Add the leftover to last chunk.
    // e.g. 18 iterations and 4 teams -> 4, 4, 4, 6 = 4(last chunk) +
    // 2(leftover)
    if (local_ub + chunk_size > size)
      local_ub += leftover;
    if (local_ub <= size) {
#pragma omp parallel for shared(local_histogram)
      for (int i = local_lb; i < local_ub; i++) {
        int type = input[i];
#pragma omp atomic update seq_cst
        local_histogram[type]++;
      }

      // Combine local histograms
#pragma omp parallel for
      for (int i = 0; i < num_bins; i++) {
#pragma omp atomic update seq_cst
        result[i] += local_histogram[i];
      }
    }
  }
  // atomic seq_cst using SLM end
  total_time = omp_get_wtime() - total_time;
  printf("Atomic seq_cst with SLM: %g ms\n", total_time * 1000);
  validate(result_ref, result, num_bins);
  memset(result, 0, sizeof(int) * num_bins);

  free(input);
  free(result);
  free(result_ref);

  return 0;
}
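
A short run sketch for the histogram example, assuming the add_openmp_example() call above produces a binary named histogram; the optional argument overrides the default input size of 10,000,000 elements, and each variant (critical, atomic relaxed, atomic seq_cst, and the two SLM versions) prints its own timing:

# Run on the default GPU with 20 million input elements (argument is optional)
./histogram 20000000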
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
add_fortran_example(do_concurrent)
add_fortran_example(hybrid_do_concurrent)
add_fortran_example(omp6_do_concurrent)
