diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 057412d41e7a2..df0320fd0f177 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -554,31 +554,38 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef BaseOps1, unsigned NumBytes) const { // If the mem ops (to be clustered) do not have the same base ptr, then they // should not be clustered + unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit; if (!BaseOps1.empty() && !BaseOps2.empty()) { const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; + + const SIMachineFunctionInfo *MFI = + FirstLdSt.getMF()->getInfo(); + MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords(); } else if (!BaseOps1.empty() || !BaseOps2.empty()) { // If only one base op is empty, they do not have the same base ptr return false; } // In order to avoid register pressure, on an average, the number of DWORDS - // loaded together by all clustered mem ops should not exceed 8. This is an - // empirical value based on certain observations and performance related - // experiments. + // loaded together by all clustered mem ops should not exceed + // MaxMemoryClusterDWords. This is an empirical value based on certain + // observations and performance related experiments. // The good thing about this heuristic is - it avoids clustering of too many // sub-word loads, and also avoids clustering of wide loads. Below is the - // brief summary of how the heuristic behaves for various `LoadSize`. + // brief summary of how the heuristic behaves for various `LoadSize` when + // MaxMemoryClusterDWords is 8. 
+ // // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops // (5) LoadSize >= 17: do not cluster const unsigned LoadSize = NumBytes / ClusterSize; - const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize; - return NumDWORDs <= 8; + const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize; + return NumDWords <= MaxMemoryClusterDWords; } // FIXME: This behaves strangely. If, for example, you have 32 load + stores, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 960fbb7ea15ce..8f9ca6141816d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -36,6 +36,8 @@ class RegScavenger; class TargetRegisterClass; class ScheduleHazardRecognizer; +constexpr unsigned DefaultMemoryClusterDWordsLimit = 8; + /// Mark the MMO of a uniform load if there are no potentially clobbering stores /// on any path from the start of an entry function to this load. static const MachineMemOperand::Flags MONoClobber = diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 1e43d2727a00d..8d7df73f3cee8 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -163,6 +163,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, if (!S.empty()) S.consumeInteger(0, HighBitsOf32BitAddress); + MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger( + "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit); + // On GFX908, in order to guarantee copying between AGPRs, we need a scratch // VGPR available at all times. For now, reserve highest available VGPR. After // RA, shift it to the lowest available unused VGPR if the one exist. 
@@ -694,8 +697,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( const llvm::MachineFunction &MF) : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()), - GDSSize(MFI.getGDSSize()), - DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()), + GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()), + IsEntryFunction(MFI.isEntryFunction()), NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), @@ -708,8 +711,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( BytesInStackArgArea(MFI.getBytesInStackArgArea()), ReturnsVoid(MFI.returnsVoid()), ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), - PSInputAddr(MFI.getPSInputAddr()), - PSInputEnable(MFI.getPSInputEnable()), + PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), + MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), Mode(MFI.getMode()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); @@ -744,6 +747,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( DynLDSAlign = YamlMFI.DynLDSAlign; PSInputAddr = YamlMFI.PSInputAddr; PSInputEnable = YamlMFI.PSInputEnable; + MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords; HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; Occupancy = YamlMFI.Occupancy; IsEntryFunction = YamlMFI.IsEntryFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 2a754680fdc8c..2e2716f1ce888 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -289,6 +289,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { unsigned PSInputAddr = 0; unsigned PSInputEnable = 0; + unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit; SIMode 
Mode; std::optional ScavengeFI; @@ -333,6 +334,8 @@ template <> struct MappingTraits { YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u); YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u); + YamlIO.mapOptional("maxMemoryClusterDWords", MFI.MaxMemoryClusterDWords, + DefaultMemoryClusterDWordsLimit); YamlIO.mapOptional("mode", MFI.Mode, SIMode()); YamlIO.mapOptional("highBitsOf32BitAddress", MFI.HighBitsOf32BitAddress, 0u); @@ -487,6 +490,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // Current recorded maximum possible occupancy. unsigned Occupancy; + // Maximum number of dwords that can be clustered during the instruction + // scheduling stage. + unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit; + mutable std::optional UsesAGPRs; MCPhysReg getNextUserSGPR() const; @@ -1109,6 +1116,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, limitOccupancy(MF); } + unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; } + bool mayNeedAGPRs() const { return MayNeedAGPRs; } diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll index 7f587ac0b8716..80d4fa69be425 100644 --- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll +++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll @@ -4,7 +4,7 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 { ; GFX11-LABEL: group_image_sample: ; GFX11: ; %bb.0: ; %.entry -; GFX11-NEXT: s_mov_b32 s24, exec_lo +; GFX11-NEXT: s_mov_b32 s33, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-NEXT: s_mov_b32 m0, s4 ; GFX11-NEXT: s_getpc_b64 s[4:5] @@ -21,73 +21,79 
@@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in ; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15 ; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15 ; GFX11-NEXT: s_mov_b32 exec_lo, s16 +; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0xf ; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10 ; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20 ; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30 ; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40 -; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1 -; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x50 +; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x60 +; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0x70 +; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0x80 +; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0x90 +; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xa0 +; GFX11-NEXT: s_buffer_load_b64 s[38:39], s[12:15], 0xb0 +; GFX11-NEXT: s_buffer_load_b64 s[40:41], s[12:15], 0xc0 +; GFX11-NEXT: s_buffer_load_b64 s[42:43], s[12:15], 0xd0 +; GFX11-NEXT: s_buffer_load_b64 s[44:45], s[12:15], 0xe0 +; GFX11-NEXT: s_buffer_load_b64 s[46:47], s[12:15], 0xf0 +; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100 +; GFX11-NEXT: v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7 ; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7 -; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1 -; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1 -; GFX11-NEXT: v_dual_add_f32 v8, 
s18, v0 :: v_dual_add_f32 v9, s19, v1 -; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v5, s17, v36 +; GFX11-NEXT: v_add_f32_e32 v4, s16, v0 +; GFX11-NEXT: v_add_f32_e32 v8, s18, v0 +; GFX11-NEXT: v_add_f32_e32 v9, s19, v36 +; GFX11-NEXT: v_add_f32_e32 v12, s20, v0 +; GFX11-NEXT: v_add_f32_e32 v13, s21, v36 +; GFX11-NEXT: v_add_f32_e32 v16, s22, v0 +; GFX11-NEXT: v_add_f32_e32 v17, s23, v36 +; GFX11-NEXT: v_add_f32_e32 v20, s24, v0 +; GFX11-NEXT: v_add_f32_e32 v21, s25, v36 +; GFX11-NEXT: v_add_f32_e32 v24, s26, v0 +; GFX11-NEXT: v_add_f32_e32 v25, s27, v36 +; GFX11-NEXT: v_add_f32_e32 v28, s28, v0 +; GFX11-NEXT: v_add_f32_e32 v29, s29, v36 +; GFX11-NEXT: v_add_f32_e32 v32, s30, v0 +; GFX11-NEXT: v_add_f32_e32 v33, s31, v36 +; GFX11-NEXT: s_clause 0x7 ; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50 -; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60 -; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70 -; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1 -; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1 -; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1 -; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1 -; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90 -; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0 -; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0 -; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1 -; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1 -; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1 -; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: v_add_f32_e32 v37, s34, v0 +; GFX11-NEXT: v_add_f32_e32 v38, s35, v36 +; GFX11-NEXT: v_add_f32_e32 v40, s36, v0 +; GFX11-NEXT: v_add_f32_e32 v41, s37, v36 +; GFX11-NEXT: v_add_f32_e32 v44, s38, v0 +; GFX11-NEXT: v_add_f32_e32 v45, s39, v36 +; GFX11-NEXT: v_add_f32_e32 v48, s40, v0 +; GFX11-NEXT: v_add_f32_e32 v49, s41, v36 +; GFX11-NEXT: v_add_f32_e32 v52, s42, v0 +; GFX11-NEXT: v_add_f32_e32 v53, s43, v36 +; GFX11-NEXT: v_add_f32_e32 v56, s44, v0 +; GFX11-NEXT: v_add_f32_e32 v57, s45, v36 +; GFX11-NEXT: v_add_f32_e32 v60, s46, v0 +; GFX11-NEXT: v_add_f32_e32 v61, s47, v36 +; GFX11-NEXT: v_add_f32_e32 v0, s12, v0 +; GFX11-NEXT: v_add_f32_e32 v1, s13, v36 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s33 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[44:47], v[44:45], 
s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0 -; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0 -; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0 -; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1 -; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1 -; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1 -; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(14) @@ -446,7 +452,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8 -attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"} +attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "amdgpu-max-memory-cluster-dwords"="32"} attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) } diff --git 
a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index 0f7a5f8e0941a..eb4ee118ec2e4 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -29,6 +29,7 @@ ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true @@ -295,6 +296,7 @@ ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index 7759501ea4226..6f5467b00ebcc 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -29,6 +29,7 @@ ; AFTER-PEI-NEXT: workItemIDX: { reg: '$vgpr0' } ; AFTER-PEI-NEXT: psInputAddr: 0 ; AFTER-PEI-NEXT: psInputEnable: 0 +; AFTER-PEI-NEXT: maxMemoryClusterDWords: 8 ; AFTER-PEI-NEXT: mode: ; AFTER-PEI-NEXT: ieee: true ; AFTER-PEI-NEXT: dx10-clamp: true diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index 4545c8bbeb3e6..d1d8240a1007a 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -30,6 +30,7 @@ ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true diff 
--git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 8215ba834170f..ad6e92a25b861 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -30,6 +30,7 @@ ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 51795a4fea515..3eff89239d541 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -39,6 +39,7 @@ # FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: psInputAddr: 0 # FULL-NEXT: psInputEnable: 0 +# FULL-NEXT: maxMemoryClusterDWords: 8 # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -143,6 +144,7 @@ body: | # FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: psInputAddr: 0 # FULL-NEXT: psInputEnable: 0 +# FULL-NEXT: maxMemoryClusterDWords: 8 # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -218,6 +220,7 @@ body: | # FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: psInputAddr: 0 # FULL-NEXT: psInputEnable: 0 +# FULL-NEXT: maxMemoryClusterDWords: 8 # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -294,6 +297,7 @@ body: | # FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: psInputAddr: 0 # FULL-NEXT: psInputEnable: 0 +# FULL-NEXT: maxMemoryClusterDWords: 8 # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -593,3 +597,15 @@ body: | %2:sgpr_64 = COPY %1 %1:sgpr_64 = 
COPY %0 ... + +--- +# ALL-LABEL: name: max_memory_cluster_dwords +# ALL: maxMemoryClusterDWords: 16 +name: max_memory_cluster_dwords +machineFunctionInfo: + maxMemoryClusterDWords: 16 +body: | + bb.0: + SI_RETURN + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index 077d22fc895ae..eca3f99b64955 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -40,6 +40,7 @@ ; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true @@ -86,6 +87,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { ; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' } ; CHECK-NEXT: psInputAddr: 1 ; CHECK-NEXT: psInputEnable: 1 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: false ; CHECK-NEXT: dx10-clamp: true @@ -156,6 +158,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 { ; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true @@ -208,6 +211,7 @@ define void @function() { ; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 +; CHECK-NEXT: maxMemoryClusterDWords: 8 ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true