[OpenMP] [Xteam Reduction] Compute number of teams based on whether fast reduction is enabled.

dhruvachak · dhruvachak · commit 441c5fc9ae3b · 2024-02-17T00:04:09.000-05:00
Clang now emits a global variable, __omp_plugin_enable_fast_reduction,
indicating whether -fopenmp-target-fast-reduction was used during
compilation. The plugin reads this variable from the device image and
uses it to determine the maximum number of teams allowed at kernel launch.

Change-Id: Iba930f8d0cbfdb6a8ef376270a5c936c6f87d17e
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9617,14 +9617,11 @@ static void emitTargetCallKernelLaunch(
                                                   OMPRTL_ompx_get_team_procs),
             DevIdVal, "team_procs");
 
-        // Compute CUMultiplier = (Max threads per CU) / (Block size)
-        int64_t XteamRedBlockSize = CGF.CGM.getXteamRedBlockSize(D);
-        int64_t CUMultiplier =
-            XteamRedBlockSize > 0
-                ? llvm::omp::xteam_red::MaxThreadsPerCU / XteamRedBlockSize
-                : llvm::omp::xteam_red::MaxCUMultiplier;
-        if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
-          CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
+        // Given the currently determined blocksize, compute the scaling
+        // factor for number of teams in terms of the number of CUs. This
+        // computation must stay in sync with the runtime.
+        uint32_t CUMultiplier = llvm::omp::xteam_red::getXteamRedCUMultiplier(
+            CGF.CGM.getXteamRedBlockSize(D));
 
         llvm::Value *Int64CUMultiplier =
             llvm::ConstantInt::get(CGF.Int64Ty, CUMultiplier);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1000,6 +1000,19 @@ static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
   CGM.addCompilerUsedGlobal(GVMode);
 }
 
+// Create the constant i8 global __omp_plugin_enable_fast_reduction, whose
+// initializer is the OpenMPTargetFastReduction language option of this module.
+// The offload runtime reads it while determining the kernel launch bounds.
+static void setIsFastReduction(CodeGenModule &CGM) {
+  auto *GVFastReduction = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty,
+                             CGM.getLangOpts().OpenMPTargetFastReduction),
+      Twine("__omp_plugin_enable_fast_reduction"));
+  CGM.addCompilerUsedGlobal(GVFastReduction); // Listed in @llvm.compiler.used.
+}
+
 static OMPTgtExecModeFlags
 computeExecutionMode(bool Mode, const Stmt *DirectiveStmt, CodeGenModule &CGM) {
   if (!Mode)
@@ -1085,6 +1098,11 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
   if (CGM.getLangOpts().OpenMPCUDAMode)
     CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
 
+  // Write a global variable indicating whether fast reduction is enabled.
+  // This is done regardless of -nogpulib
+  if (!CGM.getLangOpts().OMPHostIRFile.empty())
+    setIsFastReduction(CGM);
+
   llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
   if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
     return;
diff --git a/clang/test/OpenMP/declare_target_constexpr_codegen.cpp b/clang/test/OpenMP/declare_target_constexpr_codegen.cpp
@@ -18,7 +18,7 @@ class A {
 //.
 // CHECK: @_ZN1A2piE = linkonce_odr constant double 0x400921FB54442D18, comdat, align 8
 // CHECK: @_ZL9anotherPi = internal constant double 3.140000e+00, align 8
-// CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata"
+// CHECK: @llvm.compiler.used = appending global [3 x ptr] [ptr @__omp_plugin_enable_fast_reduction, ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata"
 //.
   A() { ; }
   ~A() { ; }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -294,6 +294,16 @@ constexpr int16_t DefaultBlockSize = 1024;
 // so that it is accessible for all targets.
 constexpr int16_t MaxBlockSize = 1024;
 
+// Compute CUMultiplier = min(MaxThreadsPerCU / BlockSize, MaxCUMultiplier).
+static inline uint32_t getXteamRedCUMultiplier(uint32_t BlockSize) {
+  uint32_t CUMultiplier = // A zero BlockSize selects MaxCUMultiplier directly.
+      BlockSize > 0 ? llvm::omp::xteam_red::MaxThreadsPerCU / BlockSize
+                    : llvm::omp::xteam_red::MaxCUMultiplier;
+  if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier) // Apply the cap.
+    CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
+  return CUMultiplier; // NOTE(review): 0 if BlockSize > MaxThreadsPerCU; ok?
+}
+
 } // end namespace xteam_red
 
 /// A type of worksharing loop construct
@@ -306,6 +316,15 @@ enum class WorksharingLoopType {
   DistributeForStaticLoop
 };
 
+static inline uint32_t getBlockSizeAsPowerOfTwo(uint32_t BlockSize) {
+  uint32_t Tmp = BlockSize; // Rounds BlockSize down to a power of two.
+  do {
+    BlockSize = Tmp; // Remember the last value before it may reach zero.
+    Tmp = BlockSize & (BlockSize - 1); // Clear the lowest set bit.
+  } while (Tmp != 0);
+  return BlockSize; // Highest set bit of the input; 0 if the input was 0.
+}
+
 } // end namespace omp
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -120,15 +120,6 @@ static constexpr GV NVPTXGridValues = {
     128,       // GV_Default_WG_Size
 };
 
-static inline uint32_t getBlockSizeAsPowerOfTwo(uint32_t BlockSize) {
-  uint32_t Tmp = BlockSize;
-  do {
-    BlockSize = Tmp;
-    Tmp = BlockSize & (BlockSize - 1);
-  } while (Tmp != 0);
-  return BlockSize;
-}
-
 } // namespace omp
 } // namespace llvm
 
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -945,32 +945,26 @@ struct AMDGPUKernelTy : public GenericKernelTy {
     }
 
     if (isXTeamReductionsMode()) {
-      // Note: The plugin does not know whether XteamReduction is running in
-      // fast mode. If fast mode, metadata is not used and the following
-      // restrictions are not required. But since the plugin does not know, it
-      // will assume that it is running in the default mode with constrained
-      // metadata.
-
-      // The number of teams must not exceed the upper limit determined during
-      // code generation. This upper limit is not currently communicated from
-      // codegen to the plugin. So compute it here again, note that this must
-      // be kept in sync with codegen.
-
-      // This is the block size that CodeGen used.
-      uint32_t XteamRedBlockSize = ConstWGSize;
-
-      int32_t CUMultiplier =
-          XteamRedBlockSize > 0
-              ? llvm::omp::xteam_red::MaxThreadsPerCU / XteamRedBlockSize
-              : llvm::omp::xteam_red::MaxCUMultiplier;
-      if (CUMultiplier > llvm::omp::xteam_red::MaxCUMultiplier)
-        CUMultiplier = llvm::omp::xteam_red::MaxCUMultiplier;
-
-      // Here's the default we use
+      // Here's the default number of teams.
       uint64_t NumGroups = DeviceNumCUs;
-
       // The number of teams must not exceed this upper limit.
-      uint64_t MaxNumGroups = DeviceNumCUs * CUMultiplier;
+      uint64_t MaxNumGroups = NumGroups;
+      if (GenericDevice.isFastReductionEnabled()) {
+        // When fast reduction is enabled, the number of teams is capped by
+        // the MaxCUMultiplier constant.
+        MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
+      } else {
+        // When fast reduction is not enabled, the number of teams is capped
+        // by the metadata that clang CodeGen created. The number of teams
+        // used here must not exceed the upper limit determined during
+        // CodeGen. This upper limit is not currently communicated from
+        // CodeGen to the plugin. So it is re-computed here.
+
+        // ConstWGSize is the block size that CodeGen used.
+        uint32_t CUMultiplier =
+            llvm::omp::xteam_red::getXteamRedCUMultiplier(ConstWGSize);
+        MaxNumGroups = DeviceNumCUs * CUMultiplier;
+      }
 
       // Honor OMP_NUM_TEAMS environment variable for XteamReduction kernel
       // type, if possible.
@@ -1029,6 +1023,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
           NumGroups = std::min(MaxNumGroups, LowTripCountBlocks);
         }
       }
+      DP("xteam-red:NumCUs=%lu xteam-red:NumGroups=%lu\n", DeviceNumCUs,
+         NumGroups);
       return NumGroups;
     }
 
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -950,6 +950,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   bool useAutoZeroCopy();
   virtual bool useAutoZeroCopyImpl() { return false; }
 
+  bool isFastReductionEnabled() const { return IsFastReductionEnabled; }
+
 private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -1045,6 +1047,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
   DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+
+  bool IsFastReductionEnabled = false;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1018,6 +1018,17 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
        DevEnvGlobal.getName().data());
     consumeError(std::move(Err));
   }
+
+  // From the image, read whether fast reduction is enabled.
+  StaticGlobalTy<int8_t> IsFastRedGlobal("__omp_plugin_enable_fast_reduction");
+  if (auto Err = GHandler.readGlobalFromImage(*this, Image, IsFastRedGlobal)) {
+    DP("Missing symbol %s, continue execution anyway.\n",
+       IsFastRedGlobal.getName().data());
+    consumeError(std::move(Err));
+  } else {
+    IsFastReductionEnabled = IsFastRedGlobal.getValue();
+  }
+
   return Plugin::success();
 }
 
diff --git a/openmp/libomptarget/test/offloading/xteam_red_1.c b/openmp/libomptarget/test/offloading/xteam_red_1.c
@@ -0,0 +1,41 @@
+// clang-format off
+// This test verifies that the reduction kernel is of the Xteam-reduction type
+// and is launched with 1920 teams and 8 threads in each team.
+//
+// RUN: %libomptarget-compile-generic -fopenmp-target-fast -fopenmp-target-fast-reduction
+// RUN: env LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT=15360 LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS=32 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+// clang-format on
+#include <stdio.h>
+
+int main() { // Sums a[0..N-1] with a reduction in an OpenMP target region.
+  int N = 15360; // Number of array elements and loop iterations.
+
+  double a[N]; // Variable-length array sized by N.
+
+  for (int i = 0; i < N; i++)
+    a[i] = i; // Each element holds its own index.
+
+  double sum1;
+  sum1 = 0; // Reduction accumulator; mapped tofrom by the pragma below.
+
+#pragma omp target teams distribute parallel for map(tofrom:sum1) reduction(+:sum1)
+  for (int j = 0; j < N; j = j + 1)
+    sum1 += a[j]; // Body of the reduction loop governed by the pragma above.
+
+  printf("sum1=%f\n", sum1); // Report the reduced value.
+
+  return 0;
+}
+// clang-format off
+/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8
+/// CHECK: teamsXthrds:(1920X   8)
+
diff --git a/openmp/libomptarget/test/offloading/xteam_red_2.c b/openmp/libomptarget/test/offloading/xteam_red_2.c
@@ -0,0 +1,40 @@
+// clang-format off
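+// This test verifies that the reduction kernel is of the Xteam-reduction type and is launched with as many teams as the number of CUs.
+// NOTE(review): the checked lines are DP() debug output, so a debug-enabled libomptarget build is presumably needed; confirm.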
+// RUN: %libomptarget-compile-generic -fopenmp-target-fast
+// RUN: env LIBOMPTARGET_DEBUG=1 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+// clang-format on
+#include <stdio.h>
+
+int main() { // Sums a[0..N-1] with a reduction in an OpenMP target region.
+  int N = 1000000; // Number of array elements and loop iterations.
+
+  double a[N]; // NOTE(review): ~8 MB variable-length array; confirm it fits.
+
+  for (int i = 0; i < N; i++)
+    a[i] = i; // Each element holds its own index.
+
+  double sum1;
+  sum1 = 0; // Reduction accumulator; mapped tofrom by the pragma below.
+
+#pragma omp target teams distribute parallel for map(tofrom:sum1) reduction(+:sum1)
+  for (int j = 0; j < N; j = j + 1)
+    sum1 += a[j]; // Body of the reduction loop governed by the pragma above.
+
+  printf("sum1=%f\n", sum1); // Report the reduced value.
+
+  return 0;
+}
+// clang-format off
+/// CHECK: xteam-red:NumCUs=[[CU_COUNT:[0-9]+]]
+/// CHECK: xteam-red:NumGroups=[[CU_COUNT]]
+