Skip to content

Commit c36a128

Browse files
authored
[AMDGPU] Add support for point sample accel out of order returns (llvm#796)
Add a target feature for point sample acceleration and enable it for the relevant targets. Also insert waitcnts where required when point sample acceleration may have been applied to an instruction. Point sample acceleration has implications for out-of-order returns, which is why the extra waitcnts are required. For instructions that may be accelerated, additionally set the VMEM_NOSAMPLER bit in the per-VGPR VMEM type masks, so that the pass can determine when a waitcnt is required.
1 parent 766c635 commit c36a128

File tree

7 files changed

+172
-34
lines changed

7 files changed

+172
-34
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,6 +1082,12 @@ def FeaturePrngInst : SubtargetFeature<"prng-inst",
10821082
"Has v_prng_b32 instruction"
10831083
>;
10841084

1085+
def FeaturePointSampleAccel : SubtargetFeature<"point-sample-accel",
1086+
"HasPointSampleAccel",
1087+
"true",
1088+
"Has point sample acceleration feature"
1089+
>;
1090+
10851091
//===------------------------------------------------------------===//
10861092
// Subtarget Features (options and debugging)
10871093
//===------------------------------------------------------------===//
@@ -1769,20 +1775,23 @@ def FeatureISAVersion11_5_0 : FeatureSet<
17691775
!listconcat(FeatureISAVersion11_Common.Features,
17701776
[FeatureSALUFloatInsts,
17711777
FeatureDPPSrc1SGPR,
1772-
FeatureRequiredExportPriority])>;
1778+
FeatureRequiredExportPriority,
1779+
FeaturePointSampleAccel])>;
17731780

17741781
def FeatureISAVersion11_5_1 : FeatureSet<
17751782
!listconcat(FeatureISAVersion11_Common.Features,
17761783
[FeatureSALUFloatInsts,
17771784
FeatureDPPSrc1SGPR,
17781785
Feature1_5xVGPRs,
1779-
FeatureRequiredExportPriority])>;
1786+
FeatureRequiredExportPriority,
1787+
FeaturePointSampleAccel])>;
17801788

17811789
def FeatureISAVersion11_5_2 : FeatureSet<
17821790
!listconcat(FeatureISAVersion11_Common.Features,
17831791
[FeatureSALUFloatInsts,
17841792
FeatureDPPSrc1SGPR,
1785-
FeatureRequiredExportPriority])>;
1793+
FeatureRequiredExportPriority,
1794+
FeaturePointSampleAccel])>;
17861795

17871796
def FeatureISAVersion11_5_3 : FeatureSet<
17881797
!listconcat(FeatureISAVersion11_Common.Features,

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
252252
bool HasMinimum3Maximum3F32 = false;
253253
bool HasMinimum3Maximum3F16 = false;
254254
bool HasMinimum3Maximum3PKF16 = false;
255+
bool HasPointSampleAccel = false;
255256

256257
bool RequiresCOV6 = false;
257258

@@ -1352,6 +1353,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13521353
return HasMinimum3Maximum3PKF16;
13531354
}
13541355

1356+
bool hasPointSampleAccel() const { return HasPointSampleAccel; }
1357+
13551358
/// \returns The maximum number of instructions that can be enclosed in an
13561359
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
13571360
/// instruction.

llvm/lib/Target/AMDGPU/MIMGInstructions.td

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class MIMGBaseOpcode : PredicateControl {
5252
bit BVH = 0;
5353
bit A16 = 0;
5454
bit NoReturn = 0;
55+
bit PointSampleAccel = 0; // Opcode eligible for gfx11.5 point sample acceleration
5556
}
5657

5758
def MIMGBaseOpcode : GenericEnum {
@@ -63,7 +64,8 @@ def MIMGBaseOpcodesTable : GenericTable {
6364
let CppTypeName = "MIMGBaseOpcodeInfo";
6465
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
6566
"Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
66-
"LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16", "NoReturn"];
67+
"LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16", "NoReturn",
68+
"PointSampleAccel"];
6769
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
6870

6971
let PrimaryKey = ["BaseOpcode"];
@@ -1458,13 +1460,14 @@ multiclass MIMG_Sampler_NoReturn <mimgopc op, AMDGPUSampleVariant sample, bit wq
14581460
}
14591461
}
14601462

1461-
multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
1462-
bit isG16 = 0, bit isGetLod = 0,
1463+
multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit isPointSampleAccel = 0,
1464+
bit wqm = 0, bit isG16 = 0, bit isGetLod = 0,
14631465
string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""),
14641466
bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> {
14651467
def "" : MIMG_Sampler_BaseOpcode<sample> {
14661468
let HasD16 = !not(isGetLod);
14671469
let G16 = isG16;
1470+
let PointSampleAccel = isPointSampleAccel;
14681471
}
14691472

14701473
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -1485,8 +1488,8 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
14851488
defm "_nortn" : MIMG_Sampler_NoReturn <op, sample, wqm, isG16, asm>;
14861489
}
14871490

1488-
multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample>
1489-
: MIMG_Sampler<op, sample, 1>;
1491+
multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample, bit isPointSampleAccel = 0>
1492+
: MIMG_Sampler<op, sample, isPointSampleAccel, 1>;
14901493

14911494
multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
14921495
string asm = "image_gather4"#sample.LowerCaseMod> {
@@ -1670,15 +1673,15 @@ let AssemblerPredicate = isGFX12Plus in {
16701673
def : AMDGPUMnemonicAlias<"image_atomic_fmax", "image_atomic_max_flt">;
16711674
}
16721675

1673-
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>;
1676+
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample, 1>;
16741677
let OtherPredicates = [HasImageInsts, HasExtendedImageInsts] in {
16751678
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x40, 0x21>, AMDGPUSample_cl>;
16761679
defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x1c, 0x22>, AMDGPUSample_d>;
16771680
defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x41, 0x23>, AMDGPUSample_d_cl>;
1678-
defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l>;
1681+
defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l, 1>;
16791682
defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x1e, 0x25>, AMDGPUSample_b>;
16801683
defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x42, 0x26>, AMDGPUSample_b_cl>;
1681-
defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz>;
1684+
defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz, 1>;
16821685
defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x20, 0x28>, AMDGPUSample_c>;
16831686
defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x43, 0x29>, AMDGPUSample_c_cl>;
16841687
defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x21, 0x2a>, AMDGPUSample_c_d>;
@@ -1731,7 +1734,7 @@ defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x37, 0x5f>, AMDGPU
17311734
let OtherPredicates = [HasImageInsts, HasExtendedImageInsts, isGFX9Plus] in
17321735
defm IMAGE_GATHER4H : MIMG_Gather <mimgopc<0x90, 0x90, 0x61, 0x42>, AMDGPUSample, 1, "image_gather4h">;
17331736

1734-
defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
1737+
defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 0, 1, 0, 1, "image_get_lod">;
17351738

17361739
defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x68>, AMDGPUSample_cd>;
17371740
defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>;
@@ -1744,22 +1747,22 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6f
17441747
} // End OtherPredicates = [HasImageInsts, HasExtendedImageInsts]
17451748

17461749
let OtherPredicates = [HasImageInsts, HasExtendedImageInsts, HasG16] in {
1747-
defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 1>;
1748-
defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>;
1749-
defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>;
1750-
defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>;
1751-
defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>;
1752-
defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
1753-
defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>;
1754-
defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
1755-
defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>;
1756-
defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>;
1757-
defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>;
1758-
defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
1759-
defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>;
1760-
defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
1761-
defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>;
1762-
defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
1750+
defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 0, 1>;
1751+
defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 0, 1>;
1752+
defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 0, 1>;
1753+
defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 0, 1>;
1754+
defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 0, 1>;
1755+
defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 0, 1>;
1756+
defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 0, 1>;
1757+
defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 0, 1>;
1758+
defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 0, 1>;
1759+
defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 0, 1>;
1760+
defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 0, 1>;
1761+
defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 0, 1>;
1762+
defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 0, 1>;
1763+
defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 0, 1>;
1764+
defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 0, 1>;
1765+
defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 0, 1>;
17631766
} // End OtherPredicates = [HasImageInsts, HasExtendedImageInsts, HasG16]
17641767

17651768
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>;

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,10 @@ class WaitcntBrackets {
384384
return LDSDMAStores;
385385
}
386386

387+
bool hasPointSampleAccel(const MachineInstr &MI) const;
388+
bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
389+
RegInterval Interval) const;
390+
387391
void print(raw_ostream &);
388392
void dump() { print(dbgs()); }
389393

@@ -808,6 +812,34 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
808812
setScoreByInterval(Interval, CntTy, Score);
809813
}
810814

815+
// Return true if the subtarget is one that enables Point Sample Acceleration
816+
// and the MachineInstr passed in is one to which it might be applied (the
817+
// hardware makes this decision based on several factors, but we can't determine
818+
// this at compile time, so we have to assume it might be applied if the
819+
// instruction supports it).
820+
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
821+
if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
822+
return false;
823+
824+
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
825+
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
826+
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
827+
return BaseInfo->PointSampleAccel;
828+
}
829+
830+
// Return true if the subtarget enables Point Sample Acceleration, the supplied
831+
// MachineInstr is one to which it might be applied and the supplied interval is
832+
// one that has outstanding writes of vmem-types other than VMEM_NOSAMPLER
833+
// (this is the type that a point sample accelerated instruction effectively
834+
// becomes).
835+
bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
836+
const MachineInstr &MI, RegInterval Interval) const {
837+
if (!hasPointSampleAccel(MI))
838+
return false;
839+
840+
return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
841+
}
842+
811843
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
812844
const SIRegisterInfo *TRI,
813845
const MachineRegisterInfo *MRI,
@@ -924,8 +956,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
924956
// defs. That's required for a sane index into `VgprMemTypes` below
925957
assert(TRI->isVectorRegister(*MRI, Op.getReg()));
926958
VmemType V = getVmemType(Inst);
959+
unsigned char TypesMask = 1 << V;
960+
// If the instruction can have Point Sample Accel applied, also record
961+
// VMEM_NOSAMPLER as a potential pending vmem-type for these registers.
962+
if (hasPointSampleAccel(Inst))
963+
TypesMask |= 1 << VMEM_NOSAMPLER;
927964
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
928-
VgprVmemTypes[RegNo] |= 1 << V;
965+
VgprVmemTypes[RegNo] |= TypesMask;
929966
}
930967
}
931968
setScoreByInterval(Interval, T, CurrScore);
@@ -1787,9 +1824,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17871824
// previous write and this write are the same type of VMEM
17881825
// instruction, in which case they are (in some architectures)
17891826
// guaranteed to write their results in order anyway.
1827+
// Additionally check instructions where Point Sample Acceleration
1828+
// might be applied.
17901829
if (Op.isUse() || !updateVMCntOnly(MI) ||
17911830
ScoreBrackets.hasOtherPendingVmemTypes(Interval,
17921831
getVmemType(MI)) ||
1832+
ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
17931833
!ST->hasVmemWriteVgprInOrder()) {
17941834
ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
17951835
ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ struct MIMGBaseOpcodeInfo {
430430
bool BVH;
431431
bool A16;
432432
bool NoReturn;
433+
bool PointSampleAccel;
433434
};
434435

435436
LLVM_READONLY

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waterfall.ll

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ define amdgpu_ps <4 x float> @test_waterfall_non_uniform_img(<8 x i32> addrspace
273273
; GFX1150-NEXT: s_add_u32 s10, s0, s10
274274
; GFX1150-NEXT: s_addc_u32 s11, s1, s11
275275
; GFX1150-NEXT: s_load_b256 s[12:19], s[10:11], 0x0
276-
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
276+
; GFX1150-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
277277
; GFX1150-NEXT: image_sample v[0:3], v4, s[12:19], s[4:7] dmask:0xf dim:SQ_RSRC_IMG_1D
278278
; GFX1150-NEXT: ; implicit-def: $vgpr4
279279
; GFX1150-NEXT: s_xor_b64 exec, exec, s[20:21]
@@ -509,6 +509,7 @@ define amdgpu_ps <4 x float> @test_waterfall_non_uniform_img_single_read(<8 x i3
509509
; GFX1150-NEXT: v_readfirstlane_b32 s19, v14
510510
; GFX1150-NEXT: ; implicit-def: $vgpr4
511511
; GFX1150-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14
512+
; GFX1150-NEXT: s_waitcnt vmcnt(0)
512513
; GFX1150-NEXT: image_sample v[0:3], v6, s[12:19], s[4:7] dmask:0xf dim:SQ_RSRC_IMG_1D
513514
; GFX1150-NEXT: ; implicit-def: $vgpr6
514515
; GFX1150-NEXT: s_xor_b64 exec, exec, s[2:3]
@@ -896,7 +897,7 @@ define amdgpu_ps <4 x float> @test_waterfall_non_uniform_img_multi_rl(<8 x i32>
896897
; GFX1150-NEXT: s_addc_u32 s17, s3, s11
897898
; GFX1150-NEXT: s_load_b256 s[8:15], s[8:9], 0x0
898899
; GFX1150-NEXT: s_load_b128 s[16:19], s[16:17], 0x0
899-
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
900+
; GFX1150-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
900901
; GFX1150-NEXT: image_sample v[0:3], v4, s[8:15], s[16:19] dmask:0xf dim:SQ_RSRC_IMG_1D
901902
; GFX1150-NEXT: ; implicit-def: $vgpr4
902903
; GFX1150-NEXT: s_xor_b64 exec, exec, s[20:21]
@@ -1110,7 +1111,7 @@ define amdgpu_ps <4 x float> @test_waterfall_non_uni_img_2_idx(<8 x i32> addrspa
11101111
; GFX1150-NEXT: s_addc_u32 s17, s3, s11
11111112
; GFX1150-NEXT: s_load_b256 s[8:15], s[8:9], 0x0
11121113
; GFX1150-NEXT: s_load_b128 s[16:19], s[16:17], 0x0
1113-
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
1114+
; GFX1150-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11141115
; GFX1150-NEXT: image_sample v[0:3], v6, s[8:15], s[16:19] dmask:0xf dim:SQ_RSRC_IMG_1D
11151116
; GFX1150-NEXT: ; implicit-def: $vgpr6
11161117
; GFX1150-NEXT: s_xor_b64 exec, exec, s[20:21]
@@ -1644,7 +1645,7 @@ define amdgpu_ps <4 x float> @test_keep_waterfall_multi_rl(<8 x i32> addrspace(4
16441645
; GFX1150-NEXT: s_addc_u32 s21, s3, s13
16451646
; GFX1150-NEXT: s_load_b256 s[12:19], s[10:11], 0x0
16461647
; GFX1150-NEXT: s_load_b128 s[20:23], s[20:21], 0x0
1647-
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
1648+
; GFX1150-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16481649
; GFX1150-NEXT: image_sample v[0:3], v4, s[12:19], s[20:23] dmask:0xf dim:SQ_RSRC_IMG_1D
16491650
; GFX1150-NEXT: ; implicit-def: $vgpr4
16501651
; GFX1150-NEXT: s_xor_b64 exec, exec, s[24:25]
@@ -1888,7 +1889,7 @@ define amdgpu_ps void @test_waterfall_sample_with_kill(<8 x i32> addrspace(4)* i
18881889
; GFX1150-NEXT: s_addc_u32 s21, s3, s13
18891890
; GFX1150-NEXT: s_load_b256 s[12:19], s[10:11], 0x0
18901891
; GFX1150-NEXT: s_load_b128 s[20:23], s[20:21], 0x0
1891-
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
1892+
; GFX1150-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18921893
; GFX1150-NEXT: image_sample v[2:5], v1, s[12:19], s[20:23] dmask:0xf dim:SQ_RSRC_IMG_1D
18931894
; GFX1150-NEXT: ; implicit-def: $vgpr1
18941895
; GFX1150-NEXT: s_xor_b64 exec, exec, s[24:25]
@@ -4002,6 +4003,7 @@ define amdgpu_ps {<4 x float>,<4 x float>} @test_waterfall_multi_begin_uniform_i
40024003
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
40034004
; GFX1150-NEXT: ; implicit-def: $vgpr5
40044005
; GFX1150-NEXT: ; implicit-def: $vgpr6
4006+
; GFX1150-NEXT: s_waitcnt vmcnt(0)
40054007
; GFX1150-NEXT: image_sample v[0:3], [v4, v4], s[8:15], s[16:19] dmask:0xf dim:SQ_RSRC_IMG_2D
40064008
; GFX1150-NEXT: s_xor_b64 exec, exec, s[2:3]
40074009
; GFX1150-NEXT: s_cbranch_execnz .LBB17_1

0 commit comments

Comments
 (0)