Skip to content

Commit 7c9c6c6

Browse files
committed
clang/AMDGPU: Set amdgpu-max-num-workgroups to disable Y/Z by default
Only OpenCL supports 2d and 3d dispatches, the other languages Y and Z dimensions are always 1. Some of the generated OpenMP functions don't seem to get the correct attributes. The kernels do, but the callable __omp_offloading functions are missing them for some reason.
1 parent 84efad0 commit 7c9c6c6

File tree

5 files changed

+45
-33
lines changed

5 files changed

+45
-33
lines changed

clang/include/clang/Basic/LangOptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,10 @@ class LangOptions : public LangOptionsBase {
706706
return OpenCL || CUDA;
707707
}
708708

709+
/// Return true if the dispatch size for an offload language only uses one
710+
/// dimension.
711+
bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
712+
709713
/// Return the OpenCL C or C++ version as a VersionTuple.
710714
VersionTuple getOpenCLVersionTuple() const;
711715

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -377,29 +377,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
377377
if (NumVGPR != 0)
378378
F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
379379
}
380-
381-
if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
382-
uint32_t X = Attr->getMaxNumWorkGroupsX()
383-
->EvaluateKnownConstInt(M.getContext())
384-
.getExtValue();
385-
// Y and Z dimensions default to 1 if not specified
386-
uint32_t Y = Attr->getMaxNumWorkGroupsY()
387-
? Attr->getMaxNumWorkGroupsY()
388-
->EvaluateKnownConstInt(M.getContext())
389-
.getExtValue()
390-
: 1;
391-
uint32_t Z = Attr->getMaxNumWorkGroupsZ()
392-
? Attr->getMaxNumWorkGroupsZ()
393-
->EvaluateKnownConstInt(M.getContext())
394-
.getExtValue()
395-
: 1;
396-
397-
llvm::SmallString<32> AttrVal;
398-
llvm::raw_svector_ostream OS(AttrVal);
399-
OS << X << ',' << Y << ',' << Z;
400-
401-
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
402-
}
403380
}
404381

405382
/// Emits control constants used to change per-architecture behaviour in the
@@ -450,9 +427,40 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
450427
if (!F)
451428
return;
452429

430+
// TODO: Use AttrBuilder
453431
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
454-
if (FD)
432+
const AMDGPUMaxNumWorkGroupsAttr *MaxNumWorkGroupsAttr = nullptr;
433+
if (FD) {
455434
setFunctionDeclAttributes(FD, F, M);
435+
MaxNumWorkGroupsAttr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
436+
}
437+
438+
if (MaxNumWorkGroupsAttr) {
439+
const auto *Attr = MaxNumWorkGroupsAttr;
440+
uint32_t X = Attr->getMaxNumWorkGroupsX()
441+
->EvaluateKnownConstInt(M.getContext())
442+
.getExtValue();
443+
// Y and Z dimensions default to 1 if not specified
444+
uint32_t Y = Attr->getMaxNumWorkGroupsY()
445+
? Attr->getMaxNumWorkGroupsY()
446+
->EvaluateKnownConstInt(M.getContext())
447+
.getExtValue()
448+
: 1;
449+
uint32_t Z = Attr->getMaxNumWorkGroupsZ()
450+
? Attr->getMaxNumWorkGroupsZ()
451+
->EvaluateKnownConstInt(M.getContext())
452+
.getExtValue()
453+
: 1;
454+
455+
llvm::SmallString<32> AttrVal;
456+
llvm::raw_svector_ostream OS(AttrVal);
457+
OS << X << ',' << Y << ',' << Z;
458+
459+
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
460+
} else if (M.getLangOpts().gridSizeIsOneDimension()) {
461+
// If the language only has 1D dispatches, disable Y/Z by default.
462+
F->addFnAttr("amdgpu-max-num-workgroups", "4294967295,1,1");
463+
}
456464

457465
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
458466
F->addFnAttr("amdgpu-ieee", "false");

clang/test/CodeGenHIP/default-attributes.hip

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ __global__ void kernel() {
3434
extern_func();
3535
}
3636
//.
37-
// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
37+
// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
3838
// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
39-
// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
39+
// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
4040
// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
4141
//.
4242
// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}

clang/test/OpenMP/amdgcn-attributes.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ int callable(int x) {
3131
return x + 1;
3232
}
3333

34-
// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
35-
// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
36-
// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
34+
// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
35+
// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
36+
// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
3737

38-
// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
39-
// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
40-
// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
38+
// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
39+
// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
40+
// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }

clang/test/OpenMP/amdgcn_target_global_constructor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ S A;
9898
//
9999
//.
100100
// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
101-
// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
101+
// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
102102
// CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
103103
// CHECK: attributes #[[ATTR3]] = { convergent }
104104
// CHECK: attributes #[[ATTR4]] = { convergent nounwind }

0 commit comments

Comments
 (0)