clang/AMDGPU: Set amdgpu-max-num-workgroups to disable Y/Z by default #119009
Conversation
Only OpenCL supports 2D and 3D dispatches; for the other languages, the Y and Z dimensions are always 1. Some of the generated OpenMP functions don't seem to get the correct attributes. The kernels do, but the callable __omp_offloading functions are missing them for some reason.
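To illustrate the intended behavior, here is a minimal HIP sketch (kernel names are hypothetical; the explicit form uses Clang's existing amdgpu_max_num_work_groups attribute):

// With this change, a HIP kernel with no explicit attribute should receive the
// default "amdgpu-max-num-workgroups"="4294967295,1,1", since HIP dispatches
// are treated as one-dimensional here.
__global__ void plain_kernel() {}

// An explicit attribute still takes precedence over the default, yielding
// "amdgpu-max-num-workgroups"="16,8,4".
__attribute__((amdgpu_max_num_work_groups(16, 8, 4)))
__global__ void bounded_kernel() {}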
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: Only OpenCL supports 2D and 3D dispatches; for the other languages, the Y and Z dimensions are always 1.

Full diff: https://github.com/llvm/llvm-project/pull/119009.diff

5 Files Affected:
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 949c8f5d448bcf..d5532eec0a683e 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -706,6 +706,10 @@ class LangOptions : public LangOptionsBase {
return OpenCL || CUDA;
}
+ /// Return true if the dispatch size for an offload language only uses one
+ /// dimension.
+ bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
+
/// Return the OpenCL C or C++ version as a VersionTuple.
VersionTuple getOpenCLVersionTuple() const;
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 56ad0503a11ab2..904e03b3cc7182 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -377,29 +377,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
if (NumVGPR != 0)
F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
}
-
- if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
- uint32_t X = Attr->getMaxNumWorkGroupsX()
- ->EvaluateKnownConstInt(M.getContext())
- .getExtValue();
- // Y and Z dimensions default to 1 if not specified
- uint32_t Y = Attr->getMaxNumWorkGroupsY()
- ? Attr->getMaxNumWorkGroupsY()
- ->EvaluateKnownConstInt(M.getContext())
- .getExtValue()
- : 1;
- uint32_t Z = Attr->getMaxNumWorkGroupsZ()
- ? Attr->getMaxNumWorkGroupsZ()
- ->EvaluateKnownConstInt(M.getContext())
- .getExtValue()
- : 1;
-
- llvm::SmallString<32> AttrVal;
- llvm::raw_svector_ostream OS(AttrVal);
- OS << X << ',' << Y << ',' << Z;
-
- F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
- }
}
/// Emits control constants used to change per-architecture behaviour in the
@@ -450,9 +427,40 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
if (!F)
return;
+ // TODO: Use AttrBuilder
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
- if (FD)
+ const AMDGPUMaxNumWorkGroupsAttr *MaxNumWorkGroupsAttr = nullptr;
+ if (FD) {
setFunctionDeclAttributes(FD, F, M);
+ MaxNumWorkGroupsAttr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
+ }
+
+ if (MaxNumWorkGroupsAttr) {
+ const auto *Attr = MaxNumWorkGroupsAttr;
+ uint32_t X = Attr->getMaxNumWorkGroupsX()
+ ->EvaluateKnownConstInt(M.getContext())
+ .getExtValue();
+ // Y and Z dimensions default to 1 if not specified
+ uint32_t Y = Attr->getMaxNumWorkGroupsY()
+ ? Attr->getMaxNumWorkGroupsY()
+ ->EvaluateKnownConstInt(M.getContext())
+ .getExtValue()
+ : 1;
+ uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+ ? Attr->getMaxNumWorkGroupsZ()
+ ->EvaluateKnownConstInt(M.getContext())
+ .getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+
+ F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
+ } else if (M.getLangOpts().gridSizeIsOneDimension()) {
+ // If the language only has 1D dispatches, disable Y/Z by default.
+ F->addFnAttr("amdgpu-max-num-workgroups", "4294967295,1,1");
+ }
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
F->addFnAttr("amdgpu-ieee", "false");
diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip
index 1b53ebec9b5821..1a2cc42828c2f6 100644
--- a/clang/test/CodeGenHIP/default-attributes.hip
+++ b/clang/test/CodeGenHIP/default-attributes.hip
@@ -34,9 +34,9 @@ __global__ void kernel() {
extern_func();
}
//.
-// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
//.
// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp
index 2c9e16a4f5098e..270cc225d05da2 100644
--- a/clang/test/OpenMP/amdgcn-attributes.cpp
+++ b/clang/test/OpenMP/amdgcn-attributes.cpp
@@ -31,10 +31,10 @@ int callable(int x) {
return x + 1;
}
-// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
-// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
index 9f1e68d4ea0fec..ffde39479761c4 100644
--- a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
+++ b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
@@ -98,7 +98,7 @@ S A;
//
//.
// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// CHECK: attributes #[[ATTR3]] = { convergent }
// CHECK: attributes #[[ATTR4]] = { convergent nounwind }
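A note on the magic number in the new default: 4294967295 is UINT32_MAX, so the attribute string leaves the X dimension effectively unbounded while clamping Y and Z to 1. A minimal standalone check of that reading:

#include <cstdint>
#include <limits>

// The "4294967295" in "amdgpu-max-num-workgroups"="4294967295,1,1" is
// UINT32_MAX, i.e. "no limit" on X, with Y and Z capped at 1.
static_assert(4294967295u == std::numeric_limits<std::uint32_t>::max(),
              "default X bound is UINT32_MAX");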
/// Return true if the dispatch size for an offload language only uses one
/// dimension.
bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
CUDA and HIP don't support 3D grids? Did I misunderstand?
Yes
The OpenMP GPU CodeGen only calls it on the target entry point.
It should be calling it on every single function.
So this is wrong. It was correct for OpenMP. Is there a language version we can check for where 3D grids were introduced?
Even OpenMP is discussing introducing multi-dimensional support, though in LLVM I already implemented it as an extension. :-)
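For context on the 3D-grid question above: CUDA and HIP launch syntax does accept a three-dimensional grid, so dispatches with Y or Z greater than 1 are expressible at the source level. A minimal HIP sketch with a hypothetical kernel:

#include <hip/hip_runtime.h>

__global__ void my_kernel() {}  // hypothetical empty kernel

int main() {
  dim3 grid(4, 2, 2);   // gridDim.y and gridDim.z are both > 1
  dim3 block(64, 1, 1);
  hipLaunchKernelGGL(my_kernel, grid, block, /*sharedMemBytes=*/0, /*stream=*/0);
  (void)hipDeviceSynchronize();
  return 0;
}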
