diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 87fa845f3cff7..b9ce8dc0c5cdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -209,7 +209,7 @@ class AMDGPUInformationCache : public InformationCache { getWavesPerEU(const Function &F, std::pair FlatWorkGroupSize) { const GCNSubtarget &ST = TM.getSubtarget(F); - return ST.getWavesPerEU(F, FlatWorkGroupSize); + return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F); } std::optional> @@ -230,7 +230,8 @@ class AMDGPUInformationCache : public InformationCache { std::pair WavesPerEU, std::pair FlatWorkGroupSize) { const GCNSubtarget &ST = TM.getSubtarget(F); - return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize); + return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize, + getLDSSize(F)); } unsigned getMaxWavesPerEU(const Function &F) { @@ -255,6 +256,14 @@ class AMDGPUInformationCache : public InformationCache { return Status; } + /// Returns the minimum amount of LDS space used by a workgroup running + /// function \p F. + static unsigned getLDSSize(const Function &F) { + return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", + {0, UINT32_MAX}, true) + .first; + } + /// Get the constant access bitmap for \p C. uint8_t getConstantAccess(const Constant *C, SmallPtrSetImpl &Visited) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 6c01f6dd370f1..933ee6ceeaf4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -195,12 +195,14 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass { } }; -unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) { +static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM, + const Function &F) { if (!TM.getTargetTriple().isAMDGCN()) return 128; const GCNSubtarget &ST = TM.getSubtarget(F); - unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + unsigned MaxVGPRs = ST.getMaxNumVGPRs( + ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first); // A non-entry function has only 32 caller preserved registers. // Do not promote alloca which will force spilling unless we know the function @@ -336,10 +338,9 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { if (!ST.isPromoteAllocaEnabled()) return false; - MaxVGPRs = getMaxVGPRs(TM, F); - setFunctionLimits(F); - bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F); + MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F); + setFunctionLimits(F); unsigned VectorizationBudget = (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 @@ -1452,29 +1453,14 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } unsigned MaxOccupancy = - ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second; - - // Restrict local memory usage so that we don't drastically reduce occupancy, - // unless it is already significantly reduced. - - // TODO: Have some sort of hint or other heuristics to guess occupancy based - // on other factors.. - unsigned OccupancyHint = ST.getWavesPerEU(F).second; - if (OccupancyHint == 0) - OccupancyHint = 7; - - // Clamp to max value. - OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); - - // Check the hint but ignore it if it's obviously wrong from the existing LDS - // usage. - MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F) + .second; // Round up to the next tier of usage. unsigned MaxSizeWithWaveCount = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); - // Program is possibly broken by using more local mem than available. + // Program may already use more LDS than is usable at maximum occupancy. if (CurrentLocalMemUsage > MaxSizeWithWaveCount) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 4373528d6d517..563605f964cc6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -55,9 +55,9 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() / WorkGroupsPerCU; } -std::pair -AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, - const Function &F) const { +std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( + uint32_t LDSBytes, std::pair FlatWorkGroupSizes) const { + // FIXME: We should take into account the LDS allocation granularity. const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u); @@ -81,7 +81,7 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, // workgroups, maximum number of waves, and minimum occupancy. The opposite is // generally true for the minimum group size. LDS or barrier ressource // limitations can flip those minimums/maximums. - const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F); + const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes; auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize); auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize); @@ -180,45 +180,52 @@ std::pair AMDGPUSubtarget::getFlatWorkGroupSizes( } std::pair AMDGPUSubtarget::getEffectiveWavesPerEU( - std::pair Requested, - std::pair FlatWorkGroupSizes) const { - // Default minimum/maximum number of waves per execution unit. - std::pair Default(1, getMaxWavesPerEU()); - - // If minimum/maximum flat work group sizes were explicitly requested using - // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum - // number of waves per execution unit to values implied by requested - // minimum/maximum flat work group sizes. - unsigned MinImpliedByFlatWorkGroupSize = - getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second); - Default.first = MinImpliedByFlatWorkGroupSize; + std::pair RequestedWavesPerEU, + std::pair FlatWorkGroupSizes, unsigned LDSBytes) const { + // Default minimum/maximum number of waves per EU. The range of flat workgroup + // sizes limits the achievable maximum, and we aim to support enough waves per + // EU so that we can concurrently execute all waves of a single workgroup of + // maximum size on a CU. + std::pair Default = { + getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second), + getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second}; + Default.first = std::min(Default.first, Default.second); // Make sure requested minimum is less than requested maximum. - if (Requested.second && Requested.first > Requested.second) + if (RequestedWavesPerEU.second && + RequestedWavesPerEU.first > RequestedWavesPerEU.second) return Default; - // Make sure requested values do not violate subtarget's specifications. - if (Requested.first < getMinWavesPerEU() || - Requested.second > getMaxWavesPerEU()) + // Make sure requested values do not violate subtarget's specifications and + // are compatible with values implied by minimum/maximum flat workgroup sizes. + if (RequestedWavesPerEU.first < Default.first || + RequestedWavesPerEU.second > Default.second) return Default; - // Make sure requested values are compatible with values implied by requested - // minimum/maximum flat work group sizes. - if (Requested.first < MinImpliedByFlatWorkGroupSize) - return Default; + return RequestedWavesPerEU; +} - return Requested; +std::pair +AMDGPUSubtarget::getWavesPerEU(const Function &F) const { + // Default/requested minimum/maximum flat work group sizes. + std::pair FlatWorkGroupSizes = getFlatWorkGroupSizes(F); + // Minimum number of bytes allocated in the LDS. + unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", + {0, UINT32_MAX}, true) + .first; + return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F); } -std::pair AMDGPUSubtarget::getWavesPerEU( - const Function &F, std::pair FlatWorkGroupSizes) const { +std::pair +AMDGPUSubtarget::getWavesPerEU(std::pair FlatWorkGroupSizes, + unsigned LDSBytes, const Function &F) const { // Default minimum/maximum number of waves per execution unit. std::pair Default(1, getMaxWavesPerEU()); // Requested minimum/maximum number of waves per execution unit. std::pair Requested = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true); - return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes); + return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes); } static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index a71731ecf8a3f..91fe2a69bc0b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -106,21 +106,24 @@ class AMDGPUSubtarget { /// be converted to integer, violate subtarget's specifications, or are not /// compatible with minimum/maximum number of waves limited by flat work group /// size, register usage, and/or lds usage. - std::pair getWavesPerEU(const Function &F) const { - // Default/requested minimum/maximum flat work group sizes. - std::pair FlatWorkGroupSizes = getFlatWorkGroupSizes(F); - return getWavesPerEU(F, FlatWorkGroupSizes); - } + std::pair getWavesPerEU(const Function &F) const; - /// Overload which uses the specified values for the flat work group sizes, - /// rather than querying the function itself. \p FlatWorkGroupSizes Should - /// correspond to the function's value for getFlatWorkGroupSizes. + /// Overload which uses the specified values for the flat workgroup sizes and + /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes + /// should correspond to the function's value for getFlatWorkGroupSizes and \p + /// LDSBytes to the per-workgroup LDS allocation. std::pair - getWavesPerEU(const Function &F, - std::pair FlatWorkGroupSizes) const; - std::pair getEffectiveWavesPerEU( - std::pair WavesPerEU, - std::pair FlatWorkGroupSizes) const; + getWavesPerEU(std::pair FlatWorkGroupSizes, + unsigned LDSBytes, const Function &F) const; + + /// Returns the target minimum/maximum number of waves per EU. This is based + /// on the minimum/maximum number of \p RequestedWavesPerEU and further + /// limited by the maximum achievable occupancy derived from the range of \p + /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. + std::pair + getEffectiveWavesPerEU(std::pair RequestedWavesPerEU, + std::pair FlatWorkGroupSizes, + unsigned LDSBytes) const; /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. @@ -133,7 +136,16 @@ class AMDGPUSubtarget { /// This notably depends on the range of allowed flat group sizes for the /// function and hardware characteristics. std::pair - getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const; + getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const { + return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F)); + } + + /// Overload which uses the specified values for the flat work group sizes, + /// rather than querying the function itself. \p FlatWorkGroupSizes should + /// correspond to the function's value for getFlatWorkGroupSizes. + std::pair getOccupancyWithWorkGroupSizes( + uint32_t LDSBytes, + std::pair FlatWorkGroupSizes) const; /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can /// be achieved when the only function running on a CU is \p MF. This notably diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index 22cc5af30da66..616867481d177 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -24,10 +24,10 @@ entry: attributes #1 = {"amdgpu-flat-work-group-size"="64,128"} ; CHECK-LABEL: {{^}}min_128_max_128: -; CHECK: SGPRBlocks: 0 -; CHECK: VGPRBlocks: 0 -; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: SGPRBlocks: 8 +; CHECK: VGPRBlocks: 7 +; CHECK: NumSGPRsForWavesPerEU: 65 +; CHECK: NumVGPRsForWavesPerEU: 29 define amdgpu_kernel void @min_128_max_128() #2 { entry: ret void @@ -35,9 +35,9 @@ entry: attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_1024 -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 8 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 24{{$}} +; CHECK: NumSGPRsForWavesPerEU: 65 ; CHECK: NumVGPRsForWavesPerEU: 43 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_1024() #3 { diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 9054e509cde8e..b19486b0e7671 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -6581,50 +6581,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v8 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll index 1e5d6755fbc85..bd1258cb1cf98 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll @@ -42,4 +42,4 @@ bb2: declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-num-vgpr"="9" "amdgpu-flat-work-group-size"="1024,1024" } +attributes #1 = { "amdgpu-num-vgpr"="9" }