Skip to content

Commit 9906334

Browse files
committed
Rebase on main (integrate changes from 1b34722)
1 parent 76a55d3 commit 9906334

File tree

12 files changed

+153
-81
lines changed

12 files changed

+153
-81
lines changed

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr(
743743
llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
744744
unsigned Min =
745745
Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
746-
unsigned Max =
747-
Attr->getMax()
748-
? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
749-
: 0;
750746

751-
if (Min != 0) {
752-
assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
753-
754-
std::string AttrVal = llvm::utostr(Min);
755-
if (Max != 0)
756-
AttrVal = AttrVal + "," + llvm::utostr(Max);
757-
F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
758-
} else
759-
assert(Max == 0 && "Max must be zero");
747+
if (Attr->getMax()) {
748+
unsigned Max =
749+
Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
750+
assert(Min == 0 || (Min != 0 && Max != 0) &&
751+
"Min must be non-zero when Max is non-zero");
752+
assert(Min <= Max && "Min must be less than or equal to Max");
753+
// Do not add the attribute if min,max=0,0.
754+
if (Min != 0) {
755+
std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
756+
F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
757+
}
758+
} else if (Min != 0) {
759+
F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min));
760+
}
760761
}
761762

762763
std::unique_ptr<TargetCodeGenInfo>

clang/lib/Sema/SemaAMDGPU.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -244,11 +244,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
244244
if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1))
245245
return true;
246246

247-
if (Min == 0 && Max != 0) {
248-
S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
249-
<< &Attr << 0;
250-
return true;
251-
}
252247
if (Max != 0 && Min > Max) {
253248
S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
254249
<< &Attr << 1;

clang/test/SemaOpenCL/amdgpu-attrs.cl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {}
4646
__attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
4747

4848
__attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}}
49-
__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}}
5049

5150
__attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}}
5251
__attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}}

llvm/lib/IR/Verifier.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2473,6 +2473,29 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
24732473
CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S,
24742474
V);
24752475
}
2476+
2477+
if (TT.isAMDGPU()) {
2478+
if (auto A = Attrs.getFnAttr("amdgpu-waves-per-eu"); A.isValid()) {
2479+
std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
2480+
unsigned Min = 0;
2481+
StringRef MinStr = Strs.first.trim();
2482+
Check(!MinStr.getAsInteger(0, Min),
2483+
"minimum for 'amdgpu-waves-per-eu' must be integer: " + MinStr);
2484+
if (!Strs.second.empty()) {
2485+
unsigned Max = 0;
2486+
StringRef MaxStr = Strs.second.trim();
2487+
Check(!MaxStr.getAsInteger(0, Max),
2488+
"maximum for 'amdgpu-waves-per-eu' must be integer: " + MaxStr);
2489+
Check(Max, "maximum for 'amdgpu-waves-per-eu' must be non-zero");
2490+
Check(Min <= Max, "minimum must be less than or equal to maximum for "
2491+
"'amdgpu-waves-per-eu': " +
2492+
MinStr + " > " + MaxStr);
2493+
} else {
2494+
Check(Min, "minimum for 'amdgpu-waves-per-eu' must be non-zero when "
2495+
"maximum is not provided");
2496+
}
2497+
}
2498+
}
24762499
}
24772500

24782501
void Verifier::verifyFunctionMetadata(

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1367,6 +1367,10 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
13671367
return Changed;
13681368
}
13691369

1370+
// 14 ==> 15
1371+
// 15 ==> 16
1372+
// 16 ==> 17
1373+
13701374
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13711375
AMDGPUAttributorOptions Options,
13721376
ThinOrFullLTOPhase LTOPhase) {

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
156156
}
157157
}
158158

159-
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
160-
const Function &F) const {
159+
std::pair<unsigned, unsigned>
160+
AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const {
161161
// Default minimum/maximum flat work group sizes.
162162
std::pair<unsigned, unsigned> Default =
163-
getDefaultFlatWorkGroupSize(F.getCallingConv());
163+
getDefaultFlatWorkGroupSize(F.getCallingConv());
164164

165165
// Requested minimum/maximum flat work group sizes.
166166
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
167-
F, "amdgpu-flat-work-group-size", Default);
167+
F, "amdgpu-flat-work-group-size", Default);
168168

169169
// Make sure requested minimum is less than requested maximum.
170170
if (Requested.first > Requested.second)
@@ -186,23 +186,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
186186
// sizes limits the achievable maximum, and we aim to support enough waves per
187187
// EU so that we can concurrently execute all waves of a single workgroup of
188188
// maximum size on a CU.
189-
std::pair<unsigned, unsigned> Default = {
189+
std::pair<unsigned, unsigned> WavesPerEU = {
190190
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
191191
getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
192-
Default.first = std::min(Default.first, Default.second);
193-
194-
// Make sure requested minimum is within the default range and lower than the
195-
// requested maximum. The latter must not violate target specification.
196-
if (RequestedWavesPerEU.first < Default.first ||
197-
RequestedWavesPerEU.first > Default.second ||
198-
RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
199-
RequestedWavesPerEU.second > getMaxWavesPerEU())
200-
return Default;
201-
202-
// We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
203-
RequestedWavesPerEU.second =
204-
std::min(RequestedWavesPerEU.second, Default.second);
205-
return RequestedWavesPerEU;
192+
WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
193+
194+
// A non-zero requested minimum must not violate the subtarget's
195+
// specifications and must be no greater than the requested maximum.
196+
if (RequestedWavesPerEU.first &&
197+
(RequestedWavesPerEU.first < getMinWavesPerEU() ||
198+
RequestedWavesPerEU.first > RequestedWavesPerEU.second))
199+
return WavesPerEU;
200+
// Requested maximum must not violate subtarget's specifications.
201+
if (RequestedWavesPerEU.second > getMaxWavesPerEU())
202+
return WavesPerEU;
203+
204+
// A requested maximum may limit both the final minimum and maximum, but
205+
// not increase them. A requested minimum can either decrease or increase the
206+
// default minimum as long as it doesn't exceed the maximum.
207+
WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
208+
if (RequestedWavesPerEU.first)
209+
WavesPerEU.first = RequestedWavesPerEU.first;
210+
WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
211+
return WavesPerEU;
206212
}
207213

208214
std::pair<unsigned, unsigned>
@@ -229,7 +235,7 @@ std::pair<unsigned, unsigned>
229235
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
230236
unsigned LDSBytes, const Function &F) const {
231237
// Default minimum/maximum number of waves per execution unit.
232-
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
238+
std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());
233239

234240
// Requested minimum/maximum number of waves per execution unit.
235241
std::pair<unsigned, unsigned> Requested =

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,9 @@ class AMDGPUSubtarget {
126126
/// Returns the target minimum/maximum number of waves per EU. This is based
127127
/// on the minimum/maximum number of \p RequestedWavesPerEU and further
128128
/// limited by the maximum achievable occupancy derived from the range of \p
129-
/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
129+
/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A
130+
/// minimum requested waves/EU value of 0 indicates an intent to not restrict
131+
/// the minimum target occupancy.
130132
std::pair<unsigned, unsigned>
131133
getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
132134
std::pair<unsigned, unsigned> FlatWorkGroupSizes,

llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,3 +225,15 @@ entry:
225225
ret void
226226
}
227227
attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}
228+
229+
; At most 2 waves per execution unit.
230+
; CHECK-LABEL: {{^}}empty_at_most_2:
231+
; CHECK: SGPRBlocks: 12
232+
; CHECK: VGPRBlocks: 21
233+
; CHECK: NumSGPRsForWavesPerEU: 102
234+
; CHECK: NumVGPRsForWavesPerEU: 85
235+
define amdgpu_kernel void @empty_at_most_2() #13 {
236+
entry:
237+
ret void
238+
}
239+
attributes #13 = {"amdgpu-waves-per-eu"="0,2"}

llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,5 +57,5 @@ entry:
5757
ret void
5858
}
5959

60-
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
61-
attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
60+
attributes #0 = { "amdgpu-waves-per-eu"="1" }
61+
attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" }

llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,6 +1033,7 @@ body: |
10331033
; GFX908-NEXT: [[V_CVT_I32_F64_e32_252:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
10341034
; GFX908-NEXT: [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
10351035
; GFX908-NEXT: [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
1036+
; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
10361037
; GFX908-NEXT: {{ $}}
10371038
; GFX908-NEXT: bb.1:
10381039
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
@@ -1060,9 +1061,8 @@ body: |
10601061
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_220]], implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]]
10611062
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_230]], implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]]
10621063
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_240]], implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]]
1063-
; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
1064-
; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
1065-
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[DEF]]
1064+
; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
1065+
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[DEF]]
10661066
; GFX908-NEXT: S_ENDPGM 0
10671067
;
10681068
; GFX90A-LABEL: name: reduce_spill_archvgpr_above_addressable_limit
@@ -1660,6 +1660,7 @@ body: |
16601660
; GFX908: bb.0:
16611661
; GFX908-NEXT: successors: %bb.1(0x80000000)
16621662
; GFX908-NEXT: {{ $}}
1663+
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
16631664
; GFX908-NEXT: [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
16641665
; GFX908-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
16651666
; GFX908-NEXT: [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
@@ -1944,9 +1945,8 @@ body: |
19441945
; GFX908-NEXT: S_NOP 0, implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]], implicit [[DEF229]]
19451946
; GFX908-NEXT: S_NOP 0, implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]], implicit [[DEF239]]
19461947
; GFX908-NEXT: S_NOP 0, implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]], implicit [[DEF249]]
1947-
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
1948-
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
1949-
; GFX908-NEXT: S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
1948+
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
1949+
; GFX908-NEXT: S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_]]
19501950
; GFX908-NEXT: S_ENDPGM 0
19511951
;
19521952
; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit

0 commit comments

Comments
 (0)