clang/AMDGPU: Set amdgpu-max-num-workgroups to disable Y/Z by default

arsenm · arsenm · commit 7c9c6c6897c7 · 2024-12-06T14:10:51.000-05:00
Only OpenCL supports 2d and 3d dispatches, the other languages Y and Z
dimensions are always 1. Some of the generated OpenMP functions don't
seem to get the correct attributes. The kernels do, but the callable
__omp_offloading functions are missing them for some reason.
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
@@ -706,6 +706,10 @@ class LangOptions : public LangOptionsBase {
     return OpenCL || CUDA;
   }
 
+  /// Return true if the dispatch size for an offload language only uses one
+  /// dimension.
+  bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
+
   /// Return the OpenCL C or C++ version as a VersionTuple.
   VersionTuple getOpenCLVersionTuple() const;
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -377,29 +377,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
-
-  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getMaxNumWorkGroupsX()
-                     ->EvaluateKnownConstInt(M.getContext())
-                     .getExtValue();
-    // Y and Z dimensions default to 1 if not specified
-    uint32_t Y = Attr->getMaxNumWorkGroupsY()
-                     ? Attr->getMaxNumWorkGroupsY()
-                           ->EvaluateKnownConstInt(M.getContext())
-                           .getExtValue()
-                     : 1;
-    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
-                     ? Attr->getMaxNumWorkGroupsZ()
-                           ->EvaluateKnownConstInt(M.getContext())
-                           .getExtValue()
-                     : 1;
-
-    llvm::SmallString<32> AttrVal;
-    llvm::raw_svector_ostream OS(AttrVal);
-    OS << X << ',' << Y << ',' << Z;
-
-    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
-  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
@@ -450,9 +427,40 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
   if (!F)
     return;
 
+  // TODO: Use AttrBuilder
   const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
-  if (FD)
+  const AMDGPUMaxNumWorkGroupsAttr *MaxNumWorkGroupsAttr = nullptr;
+  if (FD) {
     setFunctionDeclAttributes(FD, F, M);
+    MaxNumWorkGroupsAttr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
+  }
+
+  if (MaxNumWorkGroupsAttr) {
+    const auto *Attr = MaxNumWorkGroupsAttr;
+    uint32_t X = Attr->getMaxNumWorkGroupsX()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    // Y and Z dimensions default to 1 if not specified
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()
+                     ? Attr->getMaxNumWorkGroupsY()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+                     ? Attr->getMaxNumWorkGroupsZ()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ',' << Y << ',' << Z;
+
+    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
+  } else if (M.getLangOpts().gridSizeIsOneDimension()) {
+    // If the language only has 1D dispatches, disable Y/Z by default.
+    F->addFnAttr("amdgpu-max-num-workgroups", "4294967295,1,1");
+  }
 
   if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
     F->addFnAttr("amdgpu-ieee", "false");
diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip
@@ -34,9 +34,9 @@ __global__ void kernel() {
  extern_func();
 }
 //.
-// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 // OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
 //.
 // OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp
@@ -31,10 +31,10 @@ int callable(int x) {
   return x + 1;
 }
 
-// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 
-// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
-// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
@@ -98,7 +98,7 @@ S A;
 //
 //.
 // CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // CHECK: attributes #[[ATTR3]] = { convergent }
 // CHECK: attributes #[[ATTR4]] = { convergent nounwind }

Original file line number	Diff line number	Diff line change
`@@ -706,6 +706,10 @@ class LangOptions : public LangOptionsBase {`
`706`	`706`	`return OpenCL \|\| CUDA;`
`707`	`707`	`}`
`708`	`708`
	`709`	`+ /// Return true if the dispatch size for an offload language only uses one`
	`710`	`+ /// dimension.`
	`711`	`+ bool gridSizeIsOneDimension() const { return CUDA \|\| HIP \|\| OpenMP; }`
	`712`	`+`
`709`	`713`	`/// Return the OpenCL C or C++ version as a VersionTuple.`
`710`	`714`	`VersionTuple getOpenCLVersionTuple() const;`
`711`	`715`