From c3bf55b6369f8399bc0da2d509a03b63f928798d Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Thu, 12 Dec 2024 13:41:40 +0100 Subject: [PATCH 1/6] [AMDGPU] Occupancy w.r.t. WG size is now a range All unit tests updated. --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 4 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 125 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 18 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 9 +- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 20 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 16 +- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 5 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 9 +- .../CodeGen/AMDGPU/GlobalISel/add.vni16.ll | 140 +- .../CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll | 336 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 434 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 406 +- .../AMDGPU/GlobalISel/insertelement.ll | 30 +- .../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 275 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 384 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 192 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 998 +-- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 395 +- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 1573 ++--- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 116 +- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 1284 ++-- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 463 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 1374 ++-- .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 44 +- llvm/test/CodeGen/AMDGPU/abs_i16.ll | 174 +- llvm/test/CodeGen/AMDGPU/add.ll | 64 +- llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 460 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 80 +- .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 6164 ++++++++--------- .../test/CodeGen/AMDGPU/branch-relax-spill.ll | 8 +- .../CodeGen/AMDGPU/calling-conventions.ll | 6 +- 
.../AMDGPU/dbg-value-ends-sched-region.mir | 32 +- .../AMDGPU/debug-value-scheduler-crash.mir | 38 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 212 +- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 1995 +++--- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 53 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 649 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 560 +- llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 168 +- llvm/test/CodeGen/AMDGPU/function-args.ll | 1732 ++--- llvm/test/CodeGen/AMDGPU/function-returns.ll | 28 +- .../AMDGPU/gfx-callable-argument-types.ll | 12 +- .../AMDGPU/gfx-callable-return-types.ll | 190 +- llvm/test/CodeGen/AMDGPU/half.ll | 509 +- llvm/test/CodeGen/AMDGPU/idot8s.ll | 6 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 966 +-- .../AMDGPU/insert_vector_elt.v2bf16.ll | 246 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 224 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 84 +- llvm/test/CodeGen/AMDGPU/licm-regpressure.mir | 16 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 324 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 198 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 150 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 198 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 97 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 1690 +++-- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 2106 +++--- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 819 ++- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 95 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 1472 ++-- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 2 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 2 +- .../machine-scheduler-sink-trivial-remats.mir | 160 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 4 +- llvm/test/CodeGen/AMDGPU/memory_clause.mir | 2 +- .../AMDGPU/min-waves-per-eu-not-respected.ll | 2 +- llvm/test/CodeGen/AMDGPU/mul.ll | 101 +- .../CodeGen/AMDGPU/mul24-pass-ordering.ll | 16 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 148 +- llvm/test/CodeGen/AMDGPU/pr51516.mir | 4 
+- .../AMDGPU/promote-constOffset-to-imm.ll | 251 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 94 +- .../CodeGen/AMDGPU/remat-fp64-constants.ll | 4 +- .../AMDGPU/resource-optimization-remarks.ll | 6 +- llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 218 +- ...dleMoveUp-subreg-def-across-subreg-def.mir | 4 +- .../AMDGPU/schedule-amdgpu-trackers.ll | 14 +- llvm/test/CodeGen/AMDGPU/schedule-barrier.mir | 18 +- .../schedule-regpressure-limit-clustering.ll | 2 +- .../AMDGPU/schedule-relaxed-occupancy.ll | 12 +- llvm/test/CodeGen/AMDGPU/sdiv.ll | 408 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 14 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 368 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 36 +- llvm/test/CodeGen/AMDGPU/shl.ll | 22 +- llvm/test/CodeGen/AMDGPU/sra.ll | 44 +- llvm/test/CodeGen/AMDGPU/srem.ll | 232 +- llvm/test/CodeGen/AMDGPU/srl.ll | 22 +- llvm/test/CodeGen/AMDGPU/ssubsat.ll | 20 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 36 +- 93 files changed, 16171 insertions(+), 16578 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 737b2f740d6f7..bdf12ccb302cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -455,7 +455,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { uint64_t NumSGPRsForWavesPerEU = std::max( {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( - STM.computeOccupancy(F, MFI.getLDSSize()), + STM.getOccupancyWithWorkGroupSizes(*MF).second, MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM, OutContext); @@ -1262,7 +1262,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( - STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU, + STM.computeOccupancy(F, ProgInfo.LDSSize).second, 
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); const auto [MinWEU, MaxWEU] = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index e27ef71c1c088..907f82ed7fc52 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } unsigned MaxOccupancy = - ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F); + ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second; // Restrict local memory usage so that we don't drastically reduce occupancy, // unless it is already significantly reduced. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ae563df2a7a12..da729d4dc7e08 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -55,55 +55,92 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() / WorkGroupsPerCU; } -// FIXME: Should return min,max range. -// -// Returns the maximum occupancy, in number of waves per SIMD / EU, that can -// be achieved when only the given function is running on the machine; and -// taking into account the overall number of wave slots, the (maximum) workgroup -// size, and the per-workgroup LDS allocation size. -unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, - const Function &F) const { - const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second; - const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize); - if (!MaxWorkGroupsPerCu) - return 0; - - const unsigned WaveSize = getWavefrontSize(); - - // FIXME: Do we need to account for alignment requirement of LDS rounding the - // size up? - // Compute restriction based on LDS usage - unsigned NumGroups = getLocalMemorySize() / (Bytes ? 
Bytes : 1u); - - // This can be queried with more LDS than is possible, so just assume the - // worst. - if (NumGroups == 0) - return 1; - - NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups); - - // Round to the number of waves per CU. - const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize); - unsigned MaxWaves = NumGroups * MaxGroupNumWaves; - - // Number of waves per EU (SIMD). - MaxWaves = divideCeil(MaxWaves, getEUsPerCU()); - - // Clamp to the maximum possible number of waves. - MaxWaves = std::min(MaxWaves, getMaxWavesPerEU()); +std::pair +AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, + const Function &F) const { + // FIXME: Is there an allocation granularity for the LDS? If so we would need + // to make sure the amount of bytes is aligned on that granularity. + + // Compute occupancy restriction based on LDS usage. + const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u); + + // Queried LDS size may be larger than available on a CU, in which case we + // consider the only achievable occupancy to be 1, in line with what we + // consider the occupancy to be when the number of requested registers in a + // particular bank is higher than the number of available ones in that bank. + if (!MaxWGsLDS) + return {1, 1}; + + const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU(); + const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU(); + + auto PropsFromWGSize = [&](unsigned WGSize) + -> std::tuple { + unsigned WavesPerWG = divideCeil(WGSize, WaveSize); + unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS); + return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU}; + }; + + // The maximum group size will generally yield the minimum number of + // workgroups, maximum number of waves, and minimum occupancy. The opposite is + // generally true for the minimum group size. LDS or barrier resource + // limitations can flip those minimums/maximums. 
+ const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F); + auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize); + auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize); + + // It is possible that we end up with flipped minimum and maximum number of + // waves per CU when the number of minimum/maximum concurrent groups on the CU + // is limited by LDS usage or barrier resources. + if (MinWavesPerCU >= MaxWavesPerCU) { + std::swap(MinWavesPerCU, MaxWavesPerCU); + } else { + // Look for a potential smaller group size than the maximum which decreases + // the concurrent number of waves on the CU for the same number of + // concurrent workgroups on the CU. + unsigned MinWavesPerCUForWGSize = + divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU; + if (MinWavesPerCU > MinWavesPerCUForWGSize) { + unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize; + if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) { + // There may exist a smaller group size than the maximum that achieves + // the minimum number of waves per CU. This group size is the largest + // possible size that requires MaxWavesPerWG - E waves where E is + // maximized under the following constraints. + // 1. 0 <= E <= ExcessSlotsPerWG + // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize + MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG, + MaxWavesPerWG - MinWavesPerWG); + } + } - // FIXME: Needs to be a multiple of the group size? - //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves); + // Look for a potential larger group size than the minimum which increases + // the concurrent number of waves on the CU for the same number of + // concurrent workgroups on the CU. + unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG; + if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) { + // There may exist a larger group size than the minimum that achieves the + // maximum number of waves per CU. 
This group size is the smallest + // possible size that requires MinWavesPerWG + L waves where L is + // maximized under the following constraints. + // 1. 0 <= L <= LeftoverSlotsPerWG + // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize + MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG, + ((MaxWGSize - 1) / WaveSize) + 1 - + MinWavesPerWG); + } + } - assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() && - "computed invalid occupancy"); - return MaxWaves; + // Return the minimum/maximum number of waves on any EU, assuming that all + // wavefronts are spread across all EUs as evenly as possible. + return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU), + std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)}; } -unsigned -AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { +std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( + const MachineFunction &MF) const { const auto *MFI = MF.getInfo(); - return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); + return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction()); } std::pair diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 7701fef536584..5944b69ce6416 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -127,11 +127,21 @@ class AMDGPUSubtarget { unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if - /// the given LDS memory size is the only constraint. - unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can + /// be achieved when the only function running on a CU is \p F and each + /// workgroup running the function requires \p LDSBytes bytes of LDS space. 
+ /// This notably depends on the range of allowed flat group sizes for the + /// function and hardware characteristics. + std::pair + getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const; - unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; + /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can + /// be achieved when the only function running on a CU is \p MF. This notably + /// depends on the range of allowed flat group sizes for the function, the + /// amount of per-workgroup LDS space required by the function, and hardware + /// characteristics. + std::pair + getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const; bool isAmdHsaOS() const { return TargetTriple.getOS() == Triple::AMDHSA; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f8b60630bb7f6..05acd418a1cd0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1717,7 +1717,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( if (MFI->Occupancy == 0) { // Fixup the subtarget dependent default value. - MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize()); + MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second; } auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index f5bbc5482d347..b00105ae9bd52 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1089,9 +1089,8 @@ bool PreRARematStage::initGCNSchedStage() { return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - // Check maximum occupancy - if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == - DAG.MinOccupancy) + // Rematerialization will not help if occupancy is not limited by reg usage. 
+ if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy) return false; // FIXME: This pass will invalidate cached MBBLiveIns for regions @@ -1272,8 +1271,8 @@ void GCNSchedStage::checkScheduling() { return; } - unsigned TargetOccupancy = - std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF)); + unsigned TargetOccupancy = std::min( + S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); unsigned WavesAfter = std::min(TargetOccupancy, PressureAfter.getOccupancy(ST)); unsigned WavesBefore = diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 117afc4a8e8c6..22a550450dc2e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -405,16 +405,16 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { return getBaseReservedNumSGPRs(KernelUsesFlatScratch); } -unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, - unsigned NumSGPRs, - unsigned NumVGPRs) const { - unsigned Occupancy = - std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F)); - if (NumSGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); - if (NumVGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); - return Occupancy; +std::pair +GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, + unsigned NumSGPRs, unsigned NumVGPRs) const { + auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F); + unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs); + unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs); + + // Maximum occupancy may be further limited by high SGPR/VGPR usage. 
+ MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc)); + return {std::min(MinOcc, MaxOcc), MaxOcc}; } unsigned GCNSubtarget::getBaseMaxNumSGPRs( diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 3388bc3c5a8de..a22e413508021 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1368,12 +1368,18 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; - /// Return occupancy for the given function. Used LDS and a number of - /// registers if provided. - /// Note, occupancy can be affected by the scratch allocation as well, but + /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can + /// be achieved when the only function running on a CU is \p F, each workgroup + /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p + /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a + /// range, so this returns a range as well. + /// + /// Note that occupancy can be affected by the scratch allocation as well, but /// we do not have enough information to compute it. - unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, - unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + std::pair computeOccupancy(const Function &F, + unsigned LDSSize = 0, + unsigned NumSGPRs = 0, + unsigned NumVGPRs = 0) const; /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 169f1369fb543..b73af92940906 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -48,7 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, MaxNumWorkGroups = ST.getMaxNumWorkGroups(F); assert(MaxNumWorkGroups.size() == 3); - Occupancy = ST.computeOccupancy(F, getLDSSize()); + Occupancy = ST.computeOccupancy(F, getLDSSize()).second; CallingConv::ID CC = F.getCallingConv(); VRegFlags.reserve(1024); @@ -185,8 +185,7 @@ MachineFunctionInfo *SIMachineFunctionInfo::clone( void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { limitOccupancy(getMaxWavesPerEU()); const GCNSubtarget& ST = MF.getSubtarget(); - limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(), - MF.getFunction())); + limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second); } Register SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 704435dad65d7..11121e6058770 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3642,18 +3642,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - - unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), - MF.getFunction()); + unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; switch (RC->getID()) { default: return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); case AMDGPU::VGPR_32RegClassID: - return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); + return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF)); case AMDGPU::SGPR_32RegClassID: case 
AMDGPU::SGPR_LO16RegClassID: - return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); + return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF)); } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll index ab95c226b08b0..27b93872b9f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -513,29 +513,29 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v16, v[0:1] +; GFX8-NEXT: flat_load_ushort v14, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v7, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v10, v9, v13 +; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v11, v16, v0 +; GFX8-NEXT: v_add_u16_e32 v13, v14, v0 ; GFX8-NEXT: 
v_or_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v6 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[14:15], v11 +; GFX8-NEXT: flat_store_short v[6:7], v13 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -661,55 +661,55 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-LABEL: add_v11i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 18, v0 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v18, v[10:11] -; GFX8-NEXT: flat_load_ushort v19, v[12:13] -; GFX8-NEXT: flat_load_ushort v20, v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 18, v2 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2 ; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: flat_load_ushort v1, v[14:15] -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v4 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_ushort v14, v[14:15] 
+; GFX8-NEXT: flat_load_ushort v15, v[16:17] +; GFX8-NEXT: flat_load_ushort v16, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v3, v6, v10 +; GFX8-NEXT: v_add_u16_e32 v17, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v21, v7, v11 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0 +; GFX8-NEXT: v_add_u16_e32 v18, v7, v11 ; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v22, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: flat_load_ushort v3, v[6:7] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v21, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u16_e32 v19, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4 +; GFX8-NEXT: v_add_u16_e32 v20, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v17, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v18, v11 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v13, v18, v0 +; 
GFX8-NEXT: v_add_u16_e32 v14, v2, v14 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v18, v19, v1 +; GFX8-NEXT: v_add_u16_e32 v15, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v2, v19, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v20, v13 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v19, v20, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v21, v11 -; GFX8-NEXT: v_or_b32_e32 v2, v22, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_add_u16_e32 v16, v21, v16 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[14:15], v13 -; GFX8-NEXT: flat_store_short v[16:17], v18 -; GFX8-NEXT: flat_store_short v[6:7], v19 +; GFX8-NEXT: flat_store_short v[6:7], v14 +; GFX8-NEXT: flat_store_short v[8:9], v15 +; GFX8-NEXT: flat_store_short v[10:11], v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -794,34 +794,34 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v0, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v2, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v6, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; 
GFX8-NEXT: v_add_u16_e32 v8, v9, v13 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: v_add_u16_e32 v16, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v7 -; GFX8-NEXT: v_or_b32_e32 v3, v8, v9 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v6, v14, v16 -; GFX8-NEXT: v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v8, v15, v17 -; GFX8-NEXT: v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v16, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v8, v6, v14 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v9, v7, v15 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v7 ; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll index 0b66185d25f3e..8db1f46b0342a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -712,33 +712,33 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] -; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 -; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] -; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] -; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] -; 
GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 +; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: s_nop 1 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -747,26 +747,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], 
v[12:13] -; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -775,26 +775,26 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 
v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -945,33 +945,33 @@ 
define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] -; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 -; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] -; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] -; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 
s[4:5], v7, v15 +; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: s_nop 1 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -980,26 +980,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX8-NEXT: 
v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1008,26 +1008,26 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: 
v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1106,7 +1106,7 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 
v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1115,23 +1115,23 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1266,7 +1266,7 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 
1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1275,23 +1275,23 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; GFX6-NEXT: 
v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,7 +1493,7 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1502,23 +1502,23 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] +; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1725,33 +1725,33 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v17 +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] +; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_rcp_f64_e32 v[18:19], v[12:13] -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[10:11] -; GFX6-NEXT: v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0 -; GFX6-NEXT: v_fma_f64 v[22:23], -v[8:9], v[14:15], v[16:17] -; GFX6-NEXT: v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19] -; GFX6-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_fma_f64 v[16:17], -v[12:13], v[18:19], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 -; GFX6-NEXT: v_fma_f64 v[8:9], v[18:19], v[16:17], v[18:19] -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[16:17], v[20:21], v[8:9] -; GFX6-NEXT: 
v_div_fmas_f64 v[10:11], v[22:23], v[10:11], v[14:15] -; GFX6-NEXT: v_fma_f64 v[14:15], -v[12:13], v[16:17], v[20:21] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v21 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v13 +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 +; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] +; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17] +; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: s_nop 1 +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1760,26 +1760,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], 
v[14:15], 1.0 ; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX8-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1788,26 +1788,26 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: 
v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] ; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_div_scale_f64 v[16:17], vcc, v[0:1], v[4:5], v[0:1] ; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_fma_f64 v[18:19], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[22:23], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[22:23], v[14:15] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] -; GFX9-NEXT: v_mul_f64 v[22:23], v[20:21], v[14:15] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[20:21] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 +; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] ; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 
v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 5d76b542fad89..e60739fd84059 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -7678,274 +7678,274 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v25, 63 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 -; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v26 -; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 -; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 -; GFX6-NEXT: v_not_b32_e32 v8, v16 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 -; GFX6-NEXT: v_add_i32_e32 v24, vcc, v23, v25 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 +; 
GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 +; GFX6-NEXT: v_not_b32_e32 v16, v16 +; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 +; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 +; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 +; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24 +; GFX6-NEXT: v_not_b32_e32 v25, 63 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25 +; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-NEXT: 
v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 -; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v26, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17 +; GFX6-NEXT: v_or_b32_e32 v3, v16, v19 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25 +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 ; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: 
v_sub_i32_e32 v10, vcc, 64, v14 -; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v25 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 +; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25 +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX6-NEXT: v_or_b32_e32 v6, v17, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v25, 63 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25 +; GFX8-NEXT: v_and_b32_e32 
v23, 0x7f, v16 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 -; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 -; GFX8-NEXT: v_not_b32_e32 v8, v16 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v23, v25 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v16, v16 +; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 +; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v25, 63 +; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25 +; GFX8-NEXT: v_or_b32_e32 v19, 
v19, v17 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v26, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v3, v16, v19 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25 +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 ; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v25 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: 
v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25 +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v18, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v17, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 -; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 +; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v17, v17, v21 -; GFX9-NEXT: v_or_b32_e32 v18, v18, v22 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc 
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX9-NEXT: v_not_b32_e32 v8, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 -; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v23 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23 +; GFX9-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; 
GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 -; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX9-NEXT: v_or_b32_e32 v0, v25, v2 +; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, 
vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc ; GFX9-NEXT: v_not_b32_e32 v8, v20 -; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14 -; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] +; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8 +; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GFX9-NEXT: v_add_u32_e32 v14, 0xffffffc0, v13 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 -; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 -; 
GFX9-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v17, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index dbc8f12c2c25c..36a6614a5620c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -7719,86 +7719,86 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_not_b32_e32 v0, v16 ; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 -; GFX6-NEXT: v_not_b32_e32 v25, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 -; GFX6-NEXT: v_lshl_b64 v[23:24], v[17:18], v19 -; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 -; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v26 +; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX6-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25 +; GFX6-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0 +; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25 +; GFX6-NEXT: v_not_b32_e32 v26, 63 +; GFX6-NEXT: v_or_b32_e32 v21, v21, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26 +; GFX6-NEXT: v_or_b32_e32 v22, v22, v1 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX6-NEXT: v_and_b32_e32 v22, 
0x7f, v16 -; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_add_i32_e32 v24, vcc, v22, v25 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX6-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX6-NEXT: v_not_b32_e32 v4, v20 -; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 -; 
GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 -; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v18 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 -; 
GFX6-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v19 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[14:15], v18 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v1, v23, v1 -; GFX6-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v11, v7 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26 +; GFX6-NEXT: v_or_b32_e32 v16, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v19, v5, v7 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v18, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v7, v9, v11 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i128: @@ -7811,86 +7811,86 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_not_b32_e32 v0, v16 ; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 -; GFX8-NEXT: v_not_b32_e32 v25, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_add_u32_e32 
v26, vcc, v19, v25 -; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] -; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 -; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[17:18] +; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX8-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25 +; GFX8-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v26, 63 +; GFX8-NEXT: v_or_b32_e32 v21, v21, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26 +; GFX8-NEXT: v_or_b32_e32 v22, v22, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v22, v25 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] 
+; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX8-NEXT: v_not_b32_e32 v4, v20 -; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v16, v26 ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 -; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v1, v23, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v11, v7 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v16, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v19, v5, v7 +; GFX8-NEXT: 
v_lshrrev_b64 v[6:7], v11, v[14:15] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v18, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i128: @@ -7905,83 +7905,83 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 -; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] -; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 -; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX9-NEXT: v_or_b32_e32 v23, v0, v21 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25 +; GFX9-NEXT: v_or_b32_e32 v24, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; 
GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v22 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 +; GFX9-NEXT: v_or_b32_e32 v21, v21, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19 +; GFX9-NEXT: v_or_b32_e32 v22, v22, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX9-NEXT: v_or_b32_e32 v1, v17, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v20 -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] -; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4 -; 
GFX9-NEXT: v_sub_u32_e32 v4, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 -; GFX9-NEXT: v_or_b32_e32 v6, 
v4, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v18, v[14:15] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v1, v23, v1 -; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v11, v7 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 +; GFX9-NEXT: v_or_b32_e32 v16, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v19, v5, v7 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v17, v6 +; GFX9-NEXT: v_or_b32_e32 v5, v18, v7 +; GFX9-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v7, v9, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index df1afdf77983c..298dfcf048fc4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -715,27 +715,27 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: 
v_mov_b32_e32 v16, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[4:5] -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 3, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 4, v2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11] ; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, 
s[10:11] ; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll index 75d4d8816fb30..e8de761540b7a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll @@ -14,167 +14,168 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) ; LOOP-NEXT: v_mov_b32_e32 v4, s0 ; LOOP-NEXT: .LBB0_1: ; %load-store-loop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 +; LOOP-NEXT: v_add_i32_e32 v6, vcc, v2, v4 +; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v3, v5, vcc +; LOOP-NEXT: buffer_load_ubyte v26, v[6:7], s[0:3], 0 addr64 +; LOOP-NEXT: s_waitcnt expcnt(5) +; LOOP-NEXT: buffer_load_ubyte v29, v[6:7], s[0:3], 0 addr64 offset:1 ; LOOP-NEXT: s_waitcnt expcnt(2) -; LOOP-NEXT: v_add_i32_e32 v29, vcc, v2, v4 -; LOOP-NEXT: v_addc_u32_e32 v30, vcc, v3, v5, vcc -; LOOP-NEXT: buffer_load_ubyte v24, v[29:30], s[0:3], 0 addr64 -; LOOP-NEXT: buffer_load_ubyte v27, v[29:30], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: buffer_load_ubyte v34, v[29:30], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_load_ubyte v35, v[29:30], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_load_ubyte v36, v[29:30], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: buffer_load_ubyte v37, v[29:30], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_load_ubyte v38, v[29:30], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_load_ubyte v39, v[29:30], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_load_ubyte v6, v[29:30], s[0:3], 0 addr64 offset:8 -; LOOP-NEXT: buffer_load_ubyte v9, v[29:30], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_load_ubyte v10, v[29:30], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_load_ubyte v31, v[6:7], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_load_ubyte v32, v[6:7], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_load_ubyte v36, v[6:7], s[0:3], 
0 addr64 offset:4 +; LOOP-NEXT: buffer_load_ubyte v37, v[6:7], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_load_ubyte v38, v[6:7], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_load_ubyte v39, v[6:7], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_load_ubyte v8, v[6:7], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:10 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: buffer_load_ubyte v11, v[29:30], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: buffer_load_ubyte v7, v[29:30], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: buffer_load_ubyte v13, v[29:30], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_load_ubyte v14, v[29:30], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_load_ubyte v15, v[29:30], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: buffer_load_ubyte v8, v[29:30], s[0:3], 0 addr64 offset:16 -; LOOP-NEXT: buffer_load_ubyte v17, v[29:30], s[0:3], 0 addr64 offset:17 -; LOOP-NEXT: buffer_load_ubyte v18, v[29:30], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_load_ubyte v19, v[29:30], s[0:3], 0 addr64 offset:19 -; LOOP-NEXT: buffer_load_ubyte v12, v[29:30], s[0:3], 0 addr64 offset:20 -; LOOP-NEXT: buffer_load_ubyte v21, v[29:30], s[0:3], 0 addr64 offset:21 -; LOOP-NEXT: buffer_load_ubyte v22, v[29:30], s[0:3], 0 addr64 offset:22 -; LOOP-NEXT: buffer_load_ubyte v23, v[29:30], s[0:3], 0 addr64 offset:23 -; LOOP-NEXT: buffer_load_ubyte v16, v[29:30], s[0:3], 0 addr64 offset:24 -; LOOP-NEXT: buffer_load_ubyte v25, v[29:30], s[0:3], 0 addr64 offset:25 -; LOOP-NEXT: buffer_load_ubyte v26, v[29:30], s[0:3], 0 addr64 offset:26 -; LOOP-NEXT: buffer_load_ubyte v28, v[29:30], s[0:3], 0 addr64 offset:27 -; LOOP-NEXT: buffer_load_ubyte v20, v[29:30], s[0:3], 0 addr64 offset:28 -; LOOP-NEXT: buffer_load_ubyte v31, v[29:30], s[0:3], 0 addr64 offset:29 -; LOOP-NEXT: buffer_load_ubyte v32, v[29:30], s[0:3], 0 addr64 offset:30 -; LOOP-NEXT: buffer_load_ubyte v33, 
v[29:30], s[0:3], 0 addr64 offset:31 +; LOOP-NEXT: buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_load_ubyte v9, v[6:7], s[0:3], 0 addr64 offset:12 +; LOOP-NEXT: buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:13 +; LOOP-NEXT: buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_load_ubyte v25, v[6:7], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: buffer_load_ubyte v27, v[6:7], s[0:3], 0 addr64 offset:25 +; LOOP-NEXT: buffer_load_ubyte v28, v[6:7], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_load_ubyte v30, v[6:7], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: buffer_load_ubyte v33, v[6:7], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_load_ubyte v34, v[6:7], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_load_ubyte v35, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_waitcnt vmcnt(14) -; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; LOOP-NEXT: v_or_b32_e32 v24, v27, v24 -; LOOP-NEXT: v_lshlrev_b32_e32 v27, 24, v35 -; LOOP-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; LOOP-NEXT: v_or_b32_e32 v27, v27, v29 -; LOOP-NEXT: v_lshlrev_b32_e32 v29, 8, v37 -; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v39 -; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; LOOP-NEXT: v_or_b32_e32 v29, v29, v36 -; LOOP-NEXT: v_or_b32_e32 
v30, v30, v34 -; LOOP-NEXT: v_add_i32_e32 v34, vcc, v0, v4 -; LOOP-NEXT: v_addc_u32_e32 v35, vcc, v1, v5, vcc +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; LOOP-NEXT: v_or_b32_e32 v26, v6, v26 +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v32 +; LOOP-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; LOOP-NEXT: v_or_b32_e32 v29, v6, v7 +; LOOP-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; LOOP-NEXT: v_lshlrev_b32_e32 v7, 24, v39 +; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v38 +; LOOP-NEXT: v_or_b32_e32 v31, v6, v36 +; LOOP-NEXT: v_or_b32_e32 v32, v7, v32 +; LOOP-NEXT: v_add_i32_e32 v6, vcc, v0, v4 +; LOOP-NEXT: v_addc_u32_e32 v7, vcc, v1, v5, vcc ; LOOP-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; LOOP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v4 -; LOOP-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; LOOP-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; LOOP-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; LOOP-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; LOOP-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; LOOP-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; LOOP-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; LOOP-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; LOOP-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; LOOP-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; LOOP-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; LOOP-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; LOOP-NEXT: s_waitcnt vmcnt(12) -; LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; LOOP-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; LOOP-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; LOOP-NEXT: s_waitcnt vmcnt(10) -; LOOP-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; LOOP-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; LOOP-NEXT: s_waitcnt vmcnt(8) -; LOOP-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; LOOP-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; LOOP-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; LOOP-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; LOOP-NEXT: s_waitcnt vmcnt(6) -; LOOP-NEXT: v_lshlrev_b32_e32 v25, 8, 
v25 +; LOOP-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; LOOP-NEXT: s_waitcnt vmcnt(4) -; LOOP-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; LOOP-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; LOOP-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; LOOP-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; LOOP-NEXT: s_waitcnt vmcnt(2) -; LOOP-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; LOOP-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; LOOP-NEXT: s_waitcnt vmcnt(0) -; LOOP-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; LOOP-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; LOOP-NEXT: v_or_b32_e32 v6, v9, v6 -; LOOP-NEXT: v_or_b32_e32 v9, v11, v10 -; LOOP-NEXT: v_or_b32_e32 v7, v13, v7 -; LOOP-NEXT: v_or_b32_e32 v10, v15, v14 -; LOOP-NEXT: v_or_b32_e32 v8, v17, v8 -; LOOP-NEXT: v_or_b32_e32 v11, v19, v18 -; LOOP-NEXT: v_or_b32_e32 v12, v21, v12 -; LOOP-NEXT: v_or_b32_e32 v13, v23, v22 -; LOOP-NEXT: v_or_b32_e32 v14, v25, v16 -; LOOP-NEXT: v_or_b32_e32 v15, v28, v26 -; LOOP-NEXT: v_or_b32_e32 v16, v31, v20 -; LOOP-NEXT: v_or_b32_e32 v17, v33, v32 -; LOOP-NEXT: v_or_b32_e32 v18, v27, v24 -; LOOP-NEXT: v_or_b32_e32 v19, v30, v29 -; LOOP-NEXT: v_or_b32_e32 v6, v9, v6 -; LOOP-NEXT: v_or_b32_e32 v7, v10, v7 +; LOOP-NEXT: v_lshlrev_b32_e32 v35, 24, v35 +; LOOP-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 -; LOOP-NEXT: v_or_b32_e32 v9, v13, v12 -; LOOP-NEXT: v_or_b32_e32 v10, v15, v14 -; LOOP-NEXT: v_or_b32_e32 v11, v17, v16 -; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v18 -; LOOP-NEXT: v_bfe_u32 v13, v18, 8, 8 -; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 -; LOOP-NEXT: v_lshrrev_b32_e32 v14, 24, v18 -; LOOP-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; LOOP-NEXT: v_bfe_u32 v16, v19, 8, 8 -; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:4 -; LOOP-NEXT: v_lshrrev_b32_e32 v17, 24, v19 +; LOOP-NEXT: v_or_b32_e32 v11, v13, v12 +; LOOP-NEXT: v_or_b32_e32 v9, v15, v9 +; LOOP-NEXT: v_or_b32_e32 v12, v17, v16 +; LOOP-NEXT: v_or_b32_e32 v10, v19, v10 +; LOOP-NEXT: v_or_b32_e32 v13, v21, v20 +; 
LOOP-NEXT: v_or_b32_e32 v14, v23, v14 +; LOOP-NEXT: v_or_b32_e32 v15, v25, v24 +; LOOP-NEXT: v_or_b32_e32 v16, v27, v18 +; LOOP-NEXT: v_or_b32_e32 v17, v30, v28 +; LOOP-NEXT: v_or_b32_e32 v18, v33, v22 +; LOOP-NEXT: v_or_b32_e32 v19, v35, v34 +; LOOP-NEXT: v_or_b32_e32 v20, v29, v26 +; LOOP-NEXT: v_or_b32_e32 v21, v32, v31 +; LOOP-NEXT: v_or_b32_e32 v8, v11, v8 +; LOOP-NEXT: v_or_b32_e32 v9, v12, v9 +; LOOP-NEXT: v_or_b32_e32 v10, v13, v10 +; LOOP-NEXT: v_or_b32_e32 v11, v15, v14 +; LOOP-NEXT: v_or_b32_e32 v12, v17, v16 +; LOOP-NEXT: v_or_b32_e32 v13, v19, v18 +; LOOP-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; LOOP-NEXT: v_bfe_u32 v15, v20, 8, 8 +; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 +; LOOP-NEXT: v_lshrrev_b32_e32 v16, 24, v20 +; LOOP-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; LOOP-NEXT: v_bfe_u32 v18, v21, 8, 8 +; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:4 +; LOOP-NEXT: v_lshrrev_b32_e32 v19, 24, v21 ; LOOP-NEXT: s_waitcnt expcnt(1) -; LOOP-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_bfe_u32 v19, v6, 8, 8 -; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:8 +; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v8 ; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; LOOP-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; LOOP-NEXT: v_bfe_u32 v21, v7, 8, 8 -; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:12 -; LOOP-NEXT: s_waitcnt expcnt(0) -; LOOP-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; LOOP-NEXT: v_bfe_u32 v23, v8, 8, 8 -; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:16 +; LOOP-NEXT: v_bfe_u32 v21, v8, 8, 8 +; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:8 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v8 -; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v9 -; LOOP-NEXT: v_bfe_u32 v25, v9, 8, 8 -; LOOP-NEXT: buffer_store_byte v9, v[34:35], 
s[0:3], 0 addr64 offset:20 +; LOOP-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; LOOP-NEXT: v_bfe_u32 v23, v9, 8, 8 +; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:12 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; LOOP-NEXT: v_bfe_u32 v27, v10, 8, 8 -; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; LOOP-NEXT: v_bfe_u32 v25, v10, 8, 8 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:16 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; LOOP-NEXT: v_bfe_u32 v29, v11, 8, 8 -; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; LOOP-NEXT: v_bfe_u32 v27, v11, 8, 8 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:20 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; LOOP-NEXT: buffer_store_byte v13, v[34:35], s[0:3], 0 addr64 offset:1 -; LOOP-NEXT: buffer_store_byte v12, v[34:35], s[0:3], 0 addr64 offset:2 -; LOOP-NEXT: buffer_store_byte v14, v[34:35], s[0:3], 0 addr64 offset:3 -; LOOP-NEXT: buffer_store_byte v16, v[34:35], s[0:3], 0 addr64 offset:5 -; LOOP-NEXT: buffer_store_byte v15, v[34:35], s[0:3], 0 addr64 offset:6 -; LOOP-NEXT: buffer_store_byte v17, v[34:35], s[0:3], 0 addr64 offset:7 -; LOOP-NEXT: buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:9 -; LOOP-NEXT: buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 offset:10 -; LOOP-NEXT: buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:11 -; LOOP-NEXT: buffer_store_byte v21, v[34:35], s[0:3], 0 addr64 offset:13 -; LOOP-NEXT: buffer_store_byte v20, v[34:35], s[0:3], 0 addr64 offset:14 -; LOOP-NEXT: buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:15 -; LOOP-NEXT: buffer_store_byte v23, v[34:35], s[0:3], 0 addr64 offset:17 -; 
LOOP-NEXT: buffer_store_byte v22, v[34:35], s[0:3], 0 addr64 offset:18 -; LOOP-NEXT: buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:19 -; LOOP-NEXT: buffer_store_byte v25, v[34:35], s[0:3], 0 addr64 offset:21 -; LOOP-NEXT: buffer_store_byte v24, v[34:35], s[0:3], 0 addr64 offset:22 -; LOOP-NEXT: buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:23 -; LOOP-NEXT: buffer_store_byte v27, v[34:35], s[0:3], 0 addr64 offset:25 -; LOOP-NEXT: buffer_store_byte v26, v[34:35], s[0:3], 0 addr64 offset:26 -; LOOP-NEXT: buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:27 -; LOOP-NEXT: buffer_store_byte v29, v[34:35], s[0:3], 0 addr64 offset:29 -; LOOP-NEXT: buffer_store_byte v28, v[34:35], s[0:3], 0 addr64 offset:30 -; LOOP-NEXT: buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:31 +; LOOP-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; LOOP-NEXT: v_bfe_u32 v29, v12, 8, 8 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:24 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; LOOP-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; LOOP-NEXT: v_bfe_u32 v31, v13, 8, 8 +; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:28 +; LOOP-NEXT: s_waitcnt expcnt(0) +; LOOP-NEXT: v_lshrrev_b32_e32 v13, 24, v13 +; LOOP-NEXT: buffer_store_byte v15, v[6:7], s[0:3], 0 addr64 offset:1 +; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[0:3], 0 addr64 offset:2 +; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[0:3], 0 addr64 offset:3 +; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[0:3], 0 addr64 offset:5 +; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[0:3], 0 addr64 offset:6 +; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[0:3], 0 addr64 offset:7 +; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[0:3], 0 addr64 offset:9 +; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[0:3], 0 addr64 offset:10 +; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 offset:11 +; LOOP-NEXT: buffer_store_byte v23, v[6:7], s[0:3], 0 addr64 offset:13 +; 
LOOP-NEXT: buffer_store_byte v22, v[6:7], s[0:3], 0 addr64 offset:14 +; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[0:3], 0 addr64 offset:15 +; LOOP-NEXT: buffer_store_byte v25, v[6:7], s[0:3], 0 addr64 offset:17 +; LOOP-NEXT: buffer_store_byte v24, v[6:7], s[0:3], 0 addr64 offset:18 +; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[0:3], 0 addr64 offset:19 +; LOOP-NEXT: buffer_store_byte v27, v[6:7], s[0:3], 0 addr64 offset:21 +; LOOP-NEXT: buffer_store_byte v26, v[6:7], s[0:3], 0 addr64 offset:22 +; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[0:3], 0 addr64 offset:23 +; LOOP-NEXT: buffer_store_byte v29, v[6:7], s[0:3], 0 addr64 offset:25 +; LOOP-NEXT: buffer_store_byte v28, v[6:7], s[0:3], 0 addr64 offset:26 +; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[0:3], 0 addr64 offset:27 +; LOOP-NEXT: buffer_store_byte v31, v[6:7], s[0:3], 0 addr64 offset:29 +; LOOP-NEXT: buffer_store_byte v30, v[6:7], s[0:3], 0 addr64 offset:30 +; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[0:3], 0 addr64 offset:31 ; LOOP-NEXT: s_cbranch_vccnz .LBB0_1 ; LOOP-NEXT: ; %bb.2: ; %memcpy-split ; LOOP-NEXT: s_mov_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 756eb2788607b..7c6daf769aec2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2074,208 +2074,208 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v16, v0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX7-NEXT: v_mov_b32_e32 v17, v1 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, 
v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v18, v23 -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-NEXT: v_mov_b32_e32 v1, v23 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX7-NEXT: v_mov_b32_e32 v2, v22 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; 
GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 
v[16:17], s[4:5], v6, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc +; GFX7-NEXT: v_mov_b32_e32 v20, v18 +; GFX7-NEXT: v_mov_b32_e32 v18, v19 +; GFX7-NEXT: v_mov_b32_e32 v19, v16 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX7-NEXT: v_mov_b32_e32 v19, v22 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mov_b32_e32 v20, v11 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; 
GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v16, v0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v18, v23 -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, 
v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, v23 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v2, v22 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; 
GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc +; GFX8-NEXT: v_mov_b32_e32 v20, v18 +; GFX8-NEXT: v_mov_b32_e32 v18, v19 +; GFX8-NEXT: v_mov_b32_e32 v19, v16 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX8-NEXT: v_mov_b32_e32 v19, v22 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], 
s[8:9], v3, v10, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mov_b32_e32 v20, v11 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v10 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v18, v23 -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v20 -; GFX9-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX9-NEXT: 
v_mul_lo_u32 v13, v2, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v22 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, 
v9, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc +; GFX9-NEXT: v_mov_b32_e32 v20, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v16 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] +; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] +; GFX9-NEXT: v_mov_b32_e32 v19, v22 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] +; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mov_b32_e32 v20, v11 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 
0, v2, s[12:13] +; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13] +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index e289ee759da15..4bfd29430ff1e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -1962,8 +1962,9 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 -; GFX6-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17 @@ -1987,70 +1988,69 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX6-NEXT: v_sub_i32_e32 v18, 
vcc, v16, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v7 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 -; GFX6-NEXT: v_max_i32_e32 v19, 0, v7 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v7 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v16, v19 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v8 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v8 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v24 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v9 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v8 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v8 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v9 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v25 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v10 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v9 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v9 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v10 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v26 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v11 +; 
GFX6-NEXT: v_min_i32_e32 v17, 0, v10 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v10 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v11 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v27 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v12 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v11 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v11 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v12 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v28 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v13 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v12 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v12 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GFX6-NEXT: v_max_i32_e32 v17, 0, v13 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v29 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX6-NEXT: v_min_i32_e32 v19, 0, v14 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v13 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v13 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; 
GFX6-NEXT: v_max_i32_e32 v17, 0, v14 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v31, v19 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, v19, v30 -; GFX6-NEXT: v_min_i32_e32 v17, v19, v17 +; GFX6-NEXT: v_min_i32_e32 v17, 0, v14 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 +; GFX6-NEXT: v_max_i32_e32 v18, 0, v14 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; GFX6-NEXT: v_max_i32_e32 v17, 0, v15 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v15 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v31, v17 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2083,8 +2083,9 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 -; GFX8-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 @@ -2108,70 +2109,69 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v7 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 -; GFX8-NEXT: v_max_i32_e32 v19, 0, v7 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v7 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: 
v_sub_u32_e32 v19, vcc, v16, v19 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v8 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v8 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v24 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v9 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v8 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v8 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v9 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v25 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v10 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v9 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v9 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v10 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v26 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v11 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v10 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v10 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v11 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, 
v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v27 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v12 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v11 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v11 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v12 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v28 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v13 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v12 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v12 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v13 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v29 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX8-NEXT: v_min_i32_e32 v19, 0, v14 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v13 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v13 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 +; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17 -; GFX8-NEXT: v_max_i32_e32 v17, 0, v14 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v31, v19 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, v19, v30 -; GFX8-NEXT: v_min_i32_e32 v17, v19, v17 +; GFX8-NEXT: v_min_i32_e32 v17, 0, v14 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 +; GFX8-NEXT: v_max_i32_e32 v18, 0, v14 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 +; 
GFX8-NEXT: v_sub_u32_e32 v18, vcc, v16, v18 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17 ; GFX8-NEXT: v_max_i32_e32 v17, 0, v15 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v15 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v31, v17 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 14b30e0d79946..c77438f98b84e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -429,190 +429,193 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v5, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 ; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: 
v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; 
GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v14, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v11, v12, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v18, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v16, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 
v10, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v15, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v18, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v19, v[10:11] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v19, v10 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; 
GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v10, 0 -; GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_xor_b32_e32 v1, v4, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_addc_u32_e32 v0, 
vcc, v13, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[0:1] -; GISEL-NEXT: 
v_xor_b32_e32 v8, v12, v13 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -622,23 +625,23 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: 
v_xor_b32_e32 v4, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1189,123 +1192,123 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_trunc_f32_e32 v8, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mov_b32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, 
v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: 
v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: 
v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v17, v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: 
v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v15, vcc +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -1313,34 +1316,34 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; 
GISEL-NEXT: v_cndmask_b32_e32 v11, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 ; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -1348,19 +1351,19 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1421,178 +1424,178 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_trunc_f32_e32 v8, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mov_b32_e32 v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 +; CGP-NEXT: v_mul_lo_u32 
v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v18, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v16 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 ; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], 
s[4:5], v6, v9, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15 -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 +; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; 
CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: 
v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1851,6 +1854,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 @@ -1861,22 +1865,22 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v7, v12 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v11 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 ; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v18, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, 
v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v18, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc @@ -1891,24 +1895,24 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v7 +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 ; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v15, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 ; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 ; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1917,164 +1921,166 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v14, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 
v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v15, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v14, v[11:12] -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v16, v11, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, v12, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v14 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v18, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v16, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 
vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v18, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v21, v19, v[11:12] -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v19, v11 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; 
GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v18, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v12, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], 
s[4:5], v21, v11, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 
v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] ; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: 
v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] @@ -2085,7 +2091,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 @@ -2099,8 +2105,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2661,16 +2667,16 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc ; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 ; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 5f568839a28dd..40f29c56c8f12 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1537,36 +1537,36 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s2, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 -; GFX8-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; 
GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 +; GFX8-NEXT: v_mov_b32_e32 v6, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 ; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX8-NEXT: v_mov_b32_e32 v8, s6 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 @@ -1635,7 +1635,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] @@ -1680,206 +1679,206 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) 
%out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1] +; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v1, s11, v2 +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: 
v_cmp_le_u32_e64 s[0:1], s8, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, v13, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v16, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v16 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v17, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v17 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; 
GFX9-NEXT: v_add_f32_e32 v1, v1, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v16, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v14, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v17 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v4, v12, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s20, v18, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v3, v14, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v16, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX9-NEXT: v_mul_hi_u32 v11, v18, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v14, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] +; GFX9-NEXT: 
v_mul_lo_u32 v2, v13, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v11, v14, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, v18, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v11, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v11, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v18, v1 -; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s5, v11, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2] -; GFX9-NEXT: v_xor_b32_e32 v8, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v9, s17, v9 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s16, v8 -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v9, 
v10, vcc -; GFX9-NEXT: v_mul_hi_u32 v9, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v11, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v9, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 +; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s9, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v4 -; GFX9-NEXT: 
v_mul_hi_u32 v10, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, s9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, s8, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 ; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v7 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 
v2, vcc, v2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc -; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v3 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s2, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s2, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 -; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v4, 
vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 +; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 +; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: 
v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10 -; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9 -; GFX9-NEXT: v_mov_b32_e32 v9, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, s6, v8 -; GFX9-NEXT: v_mov_b32_e32 v9, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[14:15] +; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index ee7a040e41fd5..bb8f3cd6990f8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -419,24 +419,24 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 ; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 ; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -445,190 +445,191 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, 
v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v9 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v12, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v12, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v11, v0 +; 
GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] ; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: 
v_trunc_f32_e32 v16, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v7, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v9, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v17, v[8:9] -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v20, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v8 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; 
GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v13, v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10 -; GISEL-NEXT: 
v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v8 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 
v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] ; 
GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; 
GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -1117,93 +1118,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_trunc_f32_e32 v8, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mov_b32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, 
vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: 
v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 
v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc @@ -1216,22 +1220,19 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, 
v9, v[0:1] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -1239,34 +1240,34 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 ; GISEL-NEXT: 
v_ashrrev_i32_e32 v11, 31, v3 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 ; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -1274,19 +1275,19 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; 
GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1345,96 +1346,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_trunc_f32_e32 v8, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mov_b32_e32 v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, 
v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: 
v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 @@ -1443,78 +1444,78 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 ; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v7, 
v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] +; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 +; CGP-NEXT: v_mad_u64_u32 
v[5:6], s[4:5], v6, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: 
v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1710,93 +1711,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_trunc_f32_e32 v8, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mov_b32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 
1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v8, vcc +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_add_i32_e32 
v9, vcc, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v18, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc @@ -1809,22 +1813,19 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v19 +; GISEL-NEXT: 
v_add_i32_e32 v1, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -1832,34 +1833,34 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 ; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: 
v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 @@ -1867,19 +1868,19 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 ; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1938,96 +1939,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; 
CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_trunc_f32_e32 v8, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; CGP-NEXT: v_mov_b32_e32 v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; 
CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 ; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 +; CGP-NEXT: 
v_add_i32_e32 v0, vcc, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 
v0, s[4:5], v18, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v18, v13, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 ; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 @@ -2036,78 +2037,78 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 ; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, 
v1 +; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] +; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 ; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 
+; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc ; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -2350,7 +2351,6 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000 ; GISEL-NEXT: v_mov_b32_e32 v9, 0 ; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4 -; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc @@ -2425,172 +2425,175 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> 
%y) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v10, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v13, v[10:11] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v0 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v15, v10, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v0, v7, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v0 -; GISEL-NEXT: 
v_addc_u32_e32 v8, vcc, v9, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v8 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v11, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v10, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v7 -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, v9, v1, s[4:5] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v8, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v9 -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v14, v5 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v17, v[9:10] -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v17, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 +; GISEL-NEXT: 
v_mul_hi_u32 v6, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v20, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 +; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 +; GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] +; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v13, v10 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], 
v16, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mov_b32_e32 v1, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v17, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, 
v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v20, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v20, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v14, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v15, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v14, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v7 -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, 
v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, 
v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v13, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 
v7, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 @@ -2605,13 +2608,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -3030,33 +3033,34 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v9, v7 -; 
GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mov_b32_e32 v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v7 ; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] ; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -3064,215 +3068,214 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v4 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mov_b32_e32 v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] ; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v13, v4, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_and_b32_e32 
v9, 0xffffff, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9] -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2 -; GISEL-NEXT: 
v_cmp_eq_u32_e64 s[4:5], v8, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 0, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v10, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v11, v6 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1 -; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v3, v7 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v6, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 -; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2 -; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 0, v4 +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v3, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; 
GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v14, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v11, v[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v15, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v0, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, v11, v5 ; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v16 +; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v1 +; GISEL-NEXT: v_subbrev_u32_e64 v17, s[6:7], 0, v10, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v16, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v17, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, v18, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v15, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v15, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v6 ; 
GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v4 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v3, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v15, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, 
v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v16, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v14, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[0:1] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v16, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 0, v2 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v17, v0, vcc -; 
GISEL-NEXT: v_mul_lo_u32 v6, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v6 ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v1 -; GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[1:2] -; GISEL-NEXT: v_subrev_i32_e32 v1, vcc, 0, v7 -; GISEL-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v1 +; GISEL-NEXT: v_mov_b32_e32 v1, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v7, v[1:2] +; GISEL-NEXT: v_subrev_i32_e32 v1, vcc, 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v3, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0, v3 ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 43ebe156eb2a2..5673a6c6e869d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -1965,8 +1965,9 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17 @@ -1990,70 +1991,69 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v7 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v7 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v7 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, 
v18 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v8 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v8 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v9 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v9 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v10 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v10 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v11 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v11 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v12 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v12 ; GFX6-NEXT: 
v_max_i32_e32 v17, v17, v28 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v13 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v13 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v14 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v14 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v15 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v15 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v19, v16 +; GFX6-NEXT: v_min_i32_e32 v18, -1, v15 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v18, v16 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2086,8 +2086,9 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 +; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX8-NEXT: v_add_u32_e32 v18, 
vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17 @@ -2111,70 +2112,69 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 -; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v7 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v7 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v8 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v8 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v9 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v9 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v10 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v10 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX8-NEXT: v_min_i32_e32 v17, v17, 
v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v11 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v11 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v12 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v12 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v13 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v13 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v14 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v14 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v15 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v15 
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, v19, v16 +; GFX8-NEXT: v_min_i32_e32 v18, -1, v15 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v18, v16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 ; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 1ee521b3dedac..f5a901b024ef5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -365,256 +365,256 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_udiv_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, 
v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: 
v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, 
v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; 
GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; 
GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: 
v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4 +; 
GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v8 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, 
v18 -; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 -; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v20, v4, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v6, v12 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v12, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[16:17] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v19, v4 -; GISEL-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v0, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v2, s[12:13] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v1, v16, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v16 -; GISEL-NEXT: v_subb_u32_e64 v16, s[6:7], v3, v4, s[8:9] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, 
vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; GISEL-NEXT: 
v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19] -; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v18, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v17, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64: @@ -1252,256 +1252,256 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v10, 0 ; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6 
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: 
v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; 
GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; 
GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v2, v9 -; GISEL-NEXT: 
v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v18, v6 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; 
GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v18, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; 
GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 -; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7 -; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v12 -; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v16, v20 -; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v12, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v19, v2 -; GISEL-NEXT: 
v_cndmask_b32_e64 v16, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 -; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v17, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[14:15], v17, v8 -; GISEL-NEXT: v_addc_u32_e64 v17, s[10:11], 0, v0, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[18:19] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8 -; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, vcc, v3, v2, s[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 -; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, 
vcc, v7, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, 
v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[14:15] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v4, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v19, s[6:7] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v13, v18, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, 
v14, v15, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: @@ -1904,16 +1904,14 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_udiv_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; GISEL-NEXT: 
v_mac_f32_e32 v8, 0x4f800000, v6 @@ -1929,76 +1927,78 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13 -; 
GISEL-NEXT: v_mul_hi_u32 v25, v7, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21 -; GISEL-NEXT: v_cndmask_b32_e64 
v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v18 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 @@ -2007,140 +2007,140 @@ 
define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v20, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v21 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 +; GISEL-NEXT: v_cndmask_b32_e64 
v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: 
v_mul_lo_u32 v15, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 
v2, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v20, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v18 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v15 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v13, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v13, s[6:7], 0, v13 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 
v15, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v14, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v15, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v13, vcc, 0, v13, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11] +; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9] +; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc ; GISEL-NEXT: 
v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v17, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v20, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, v19, v21, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index ffebde52df4a3..e3c1a52696b47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1087,95 +1087,95 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s14 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 -; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 ; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v6 ; 
GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX8-NEXT: v_trunc_f32_e32 v14, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v14 +; GFX8-NEXT: v_trunc_f32_e32 v3, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v3 ; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v12, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v3 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v12, v[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3] -; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12 -; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3] -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1 -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX8-NEXT: 
v_mul_lo_u32 v3, v15, v1 +; GFX8-NEXT: v_mul_lo_u32 v17, v12, v2 +; GFX8-NEXT: v_mul_hi_u32 v5, v12, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, v14, v2 -; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 +; GFX8-NEXT: v_mul_hi_u32 v17, v12, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 +; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v1 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v12, 0 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v17, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; 
GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v15, v[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v12, v[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v12, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3 -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3 +; GFX8-NEXT: v_mul_lo_u32 v9, v12, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v8, v15, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, v12, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, v14, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3 +; GFX8-NEXT: v_mul_lo_u32 v8, v15, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v12, v3 +; GFX8-NEXT: 
v_addc_u32_e32 v4, vcc, v15, v4, vcc ; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 ; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] @@ -1216,27 +1216,27 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 -; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v7 +; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: 
v_cndmask_b32_e64 v4, v10, v13, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc ; GFX8-NEXT: v_mov_b32_e32 v9, s4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] @@ -1330,182 +1330,181 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v9, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v8, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v1 -; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v1 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v4, v2, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 
s[0:1], s4, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s17, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s4, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v8 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX9-NEXT: v_trunc_f32_e32 v15, v4 -; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v15 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX9-NEXT: v_trunc_f32_e32 v4, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 +; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 +; 
GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13 -; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5] -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 +; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v12, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s4, v11 -; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 +; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 +; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 ; GFX9-NEXT: v_mul_hi_u32 
v3, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v3 -; GFX9-NEXT: v_add3_u32 v4, v6, v5, v4 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v4, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v18, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s3, v16, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, v15, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, v16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v20, vcc -; GFX9-NEXT: v_mul_hi_u32 v11, v16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v11 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v6, v17 +; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 +; 
GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v7, v15, v4 +; GFX9-NEXT: v_mul_lo_u32 v8, v12, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v12, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v20, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v11, v15, v6 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v15, v5 +; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, v12, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v16, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v15, v6 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_add_u32_e32 v9, v11, v9 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s19, v5 -; 
GFX9-NEXT: v_mul_lo_u32 v9, s18, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v2, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, s19, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s19, v6 -; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_hi_u32 v9, s18, v6 -; GFX9-NEXT: v_mul_hi_u32 v13, s19, v6 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s6, v12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, v11, v9 -; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v9, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s19 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v12, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s18, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, s19, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s6, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 
0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s6, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[14:15] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v5, v8, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v8, s18, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s19, v5 +; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, s18, v5 +; GFX9-NEXT: v_mul_hi_u32 v12, s19, v5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 +; GFX9-NEXT: v_cndmask_b32_e64 
v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v9, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, v10, v8 +; GFX9-NEXT: v_add3_u32 v8, v0, v1, v12 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[0:1] +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s18, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v0, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_sub_u32_e32 v0, s19, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 1, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v10 +; GFX9-NEXT: 
v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[0:1] +; GFX9-NEXT: global_store_dwordx4 v13, v[2:5], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v13, v[6:9], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index faad7e93da5d3..2be4b52198b45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -359,254 +359,254 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_urem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 
v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, 
s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: 
v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: 
v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; 
GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] 
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5] +; 
GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, 
v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 -; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 -; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 -; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, 
s[4:5], v3, v7, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 
v9, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] -; 
GISEL-NEXT: v_subbrev_u32_e64 v19, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v19, v7 -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v19, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64: @@ -1103,20 +1103,20 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 
x i64> %num) { ; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s4, 0, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 ; GISEL-NEXT: s_cmp_lg_u32 s5, 0 -; GISEL-NEXT: s_subb_u32 s5, 0, 0 +; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: v_trunc_f32_e32 v7, v7 ; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v8, v7, v5 ; GISEL-NEXT: v_mul_lo_u32 v9, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s4, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6 ; GISEL-NEXT: v_mul_hi_u32 v11, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, s5, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v8 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 ; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9 @@ -1134,41 +1134,41 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v17, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, 
v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v15 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v18 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v11 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v7, v10, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5 -; GISEL-NEXT: v_mul_lo_u32 v13, s4, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, s6, v11 ; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc ; GISEL-NEXT: v_mul_lo_u32 v8, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s5, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, s7, v6 ; GISEL-NEXT: v_mul_hi_u32 v15, v6, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, v10, v5 ; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 @@ -1176,9 +1176,9 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_mul_lo_u32 v5, v7, v5 ; 
GISEL-NEXT: v_mul_lo_u32 v19, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 @@ -1186,38 +1186,38 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v15, v11, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v21, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v5 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v5 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, 
vcc, v8, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v21, v18 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v11 @@ -1675,254 +1675,254 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mov_b32_e32 v10, 0 ; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 -; GISEL-NEXT: 
v_mac_f32_e32 v13, 0x4f800000, v14 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 -; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 -; GISEL-NEXT: v_trunc_f32_e32 v13, v13 -; GISEL-NEXT: v_trunc_f32_e32 v14, v14 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 -; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 -; GISEL-NEXT: v_add_i32_e64 
v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v21, s[6:7], v21, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v22, vcc, v22, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[16:17] -; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 -; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 -; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, 
v13, v20 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 
-; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 -; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; 
GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 
v0, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; 
GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v4, v11 -; GISEL-NEXT: 
v_mul_lo_u32 v19, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v13 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v6 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v4 -; GISEL-NEXT: v_sub_i32_e64 v7, s[14:15], v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v4, s[16:17], v13, v4 -; GISEL-NEXT: v_add_i32_e64 v6, s[18:19], v17, v6 -; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 -; GISEL-NEXT: v_subb_u32_e64 v6, s[6:7], v3, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, 
s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v5, s[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v8 -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v8 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: 
v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[8:9] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v12, v7, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] 
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: @@ -2319,16 +2319,14 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_urem_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 @@ -2344,76 +2342,78 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 
v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v22, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_lo_u32 v23, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v24, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v25, v7, v13 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; 
GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v19, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v24, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v22 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v21 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v25 -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v20 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v23, v18 -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v24, v21 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: 
v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v16, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v13 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 @@ -2422,136 +2422,136 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v12, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, 
v5 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v21, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v20, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v21 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: 
v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v4, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v0, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, 
s[4:5], 0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v3, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_i32_e32 v12, 
vcc, v8, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v11, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: 
v_urem_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 0042d34e235d1..4faa7edadf07a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1346,29 +1346,29 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_min_u32_e32 v17, v11, v27 -; GFX6-NEXT: v_min_u32_e32 v18, v12, v28 -; GFX6-NEXT: v_min_u32_e32 v19, v13, v29 -; GFX6-NEXT: v_min_u32_e32 v20, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v19 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v20 +; GFX6-NEXT: v_min_u32_e32 v17, v4, v20 +; GFX6-NEXT: v_min_u32_e32 v18, v5, v21 +; GFX6-NEXT: v_min_u32_e32 v19, v6, v22 +; GFX6-NEXT: v_min_u32_e32 v20, v7, v23 +; GFX6-NEXT: v_min_u32_e32 v21, v8, v24 +; GFX6-NEXT: v_min_u32_e32 v22, v9, v25 +; GFX6-NEXT: v_min_u32_e32 v23, v10, v26 +; GFX6-NEXT: v_min_u32_e32 v24, v11, v27 +; GFX6-NEXT: v_min_u32_e32 v25, v12, v28 +; GFX6-NEXT: v_min_u32_e32 v26, v13, v29 +; GFX6-NEXT: v_min_u32_e32 v27, v14, v30 +; GFX6-NEXT: v_sub_i32_e32 
v4, vcc, v4, v17 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_min_u32_e32 v16, v15, v16 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll index daed0986fa9c8..0ae2b4f549919 100644 --- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll @@ -823,32 +823,32 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) { ; GFX8-NEXT: v_sub_u16_sdwa v14, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0 +; GFX8-NEXT: v_max_i16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v0, v0, v19 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_sub_u16_e32 v8, 0, v1 +; GFX8-NEXT: v_max_i16_sdwa v15, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_sub_u16_e32 v16, 0, v7 ; GFX8-NEXT: v_sub_u16_e32 v17, 0, v6 ; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5 ; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v21, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v22, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v23, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v8, 0, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 
+; GFX8-NEXT: v_sub_u16_e32 v15, 0, v2 ; GFX8-NEXT: v_max_i16_sdwa v9, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v12, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v13, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v14, v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v15, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v0, v0, v23 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v22 -; GFX8-NEXT: v_max_i16_e32 v2, v2, v21 -; GFX8-NEXT: v_max_i16_e32 v3, v3, v20 +; GFX8-NEXT: v_max_i16_e32 v2, v2, v15 +; GFX8-NEXT: v_max_i16_e32 v3, v3, v8 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v19 ; GFX8-NEXT: v_max_i16_e32 v5, v5, v18 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v17 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v14 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v12 @@ -1255,85 +1255,85 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v16, 0 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v0 -; GFX8-NEXT: v_max_i16_sdwa v19, v0, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v0, v0, v20 -; GFX8-NEXT: 
v_sub_u16_sdwa v20, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v1 -; GFX8-NEXT: v_max_i16_sdwa v20, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v1, v1, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v2 -; GFX8-NEXT: v_max_i16_sdwa v19, v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v2, v2, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v3 -; GFX8-NEXT: v_max_i16_sdwa v20, v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v3, v3, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v4 -; GFX8-NEXT: v_max_i16_sdwa v19, v4, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v4, v4, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v5 -; GFX8-NEXT: v_max_i16_sdwa v20, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v5, v5, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v6 -; GFX8-NEXT: v_max_i16_sdwa v19, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v6, 
v6, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v7 -; GFX8-NEXT: v_max_i16_sdwa v20, v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v7, v7, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v8 -; GFX8-NEXT: v_max_i16_sdwa v19, v8, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v8, v8, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v8, v8, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v9 -; GFX8-NEXT: v_max_i16_sdwa v20, v9, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v9, v9, v19 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v10 -; GFX8-NEXT: v_max_i16_sdwa v19, v10, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v10, v10, v20 -; GFX8-NEXT: v_sub_u16_sdwa v20, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v10, v10, v19 -; GFX8-NEXT: v_sub_u16_e32 v19, 0, v11 -; GFX8-NEXT: v_max_i16_sdwa v20, v11, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v11, v11, v19 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v0 +; GFX8-NEXT: v_max_i16_sdwa v18, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v0, v0, 
v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v1 +; GFX8-NEXT: v_max_i16_sdwa v19, v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v1, v1, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v2 +; GFX8-NEXT: v_max_i16_sdwa v18, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v2, v2, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v3 +; GFX8-NEXT: v_max_i16_sdwa v19, v3, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v3, v3, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4 +; GFX8-NEXT: v_max_i16_sdwa v18, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v4, v4, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5 +; GFX8-NEXT: v_max_i16_sdwa v19, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v5, v5, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v6 +; GFX8-NEXT: v_max_i16_sdwa v18, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: 
v_max_i16_e32 v6, v6, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v7 +; GFX8-NEXT: v_max_i16_sdwa v19, v7, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v7, v7, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v8 +; GFX8-NEXT: v_max_i16_sdwa v18, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v8, v8, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v8, v8, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v9 +; GFX8-NEXT: v_max_i16_sdwa v19, v9, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v9, v9, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v10 +; GFX8-NEXT: v_max_i16_sdwa v18, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v10, v10, v19 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v11 +; GFX8-NEXT: v_max_i16_sdwa v19, v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v11, v11, v18 +; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX8-NEXT: v_sub_u16_e32 v19, 0, v12 +; GFX8-NEXT: v_max_i16_sdwa v18, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v12, v12, v19 ; GFX8-NEXT: v_sub_u16_sdwa v17, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v18, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v11, v11, v20 -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v12 -; GFX8-NEXT: v_max_i16_sdwa v16, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_e32 v12, v12, v20 -; GFX8-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX8-NEXT: v_sub_u16_e32 v16, 0, v13 -; GFX8-NEXT: v_max_i16_sdwa v19, v13, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e32 v20, 0, v15 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v16 +; GFX8-NEXT: v_sub_u16_sdwa v19, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_sdwa v16, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v12, v12, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v13 +; GFX8-NEXT: v_max_i16_sdwa v16, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_e32 v13, v13, v18 +; GFX8-NEXT: v_sub_u16_e32 v18, 0, v15 +; GFX8-NEXT: v_or_b32_e32 v13, v13, v16 ; GFX8-NEXT: v_sub_u16_e32 v16, 0, v14 ; GFX8-NEXT: v_max_i16_sdwa v17, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v18, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v19, v14, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_e32 v14, v14, v16 -; GFX8-NEXT: v_max_i16_e32 v15, 
v15, v20 -; GFX8-NEXT: v_or_b32_e32 v13, v13, v19 -; GFX8-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v18 +; GFX8-NEXT: v_or_b32_e32 v14, v14, v19 ; GFX8-NEXT: v_or_b32_e32 v15, v15, v17 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index 033af69243801..cd5b585a8c4e2 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -474,44 +474,44 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s4, s11, s39 -; GFX6-NEXT: s_add_i32 s5, s10, s38 -; GFX6-NEXT: s_add_i32 s6, s9, s37 -; GFX6-NEXT: s_add_i32 s7, s8, s36 -; GFX6-NEXT: s_add_i32 s8, s15, s43 -; GFX6-NEXT: s_add_i32 s9, s14, s42 -; GFX6-NEXT: s_add_i32 s10, s13, s41 -; GFX6-NEXT: s_add_i32 s11, s12, s40 -; GFX6-NEXT: s_add_i32 s12, s19, s47 -; GFX6-NEXT: s_add_i32 s13, s18, s46 -; GFX6-NEXT: s_add_i32 s14, s17, s45 -; GFX6-NEXT: s_add_i32 s15, s16, s44 -; GFX6-NEXT: s_add_i32 s16, s23, s51 -; GFX6-NEXT: s_add_i32 s17, s22, s50 -; GFX6-NEXT: s_add_i32 s18, s21, s49 -; GFX6-NEXT: s_add_i32 s19, s20, s48 -; GFX6-NEXT: v_mov_b32_e32 v0, s19 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: s_add_i32 s6, s11, s39 +; GFX6-NEXT: s_add_i32 s7, s10, s38 +; GFX6-NEXT: s_add_i32 s10, s15, s43 +; GFX6-NEXT: s_add_i32 s11, s14, s42 +; GFX6-NEXT: s_add_i32 s14, s19, s47 +; GFX6-NEXT: s_add_i32 s15, s18, s46 +; GFX6-NEXT: s_add_i32 s18, s23, s51 +; GFX6-NEXT: s_add_i32 s19, s22, s50 +; GFX6-NEXT: s_add_i32 s21, s21, s49 +; GFX6-NEXT: s_add_i32 s20, s20, s48 +; GFX6-NEXT: s_add_i32 s17, s17, s45 +; GFX6-NEXT: s_add_i32 s16, s16, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NEXT: v_mov_b32_e32 v3, s18 +; 
GFX6-NEXT: s_add_i32 s13, s13, s41 +; GFX6-NEXT: s_add_i32 s12, s12, s40 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v3, s14 +; GFX6-NEXT: s_add_i32 s9, s9, s37 +; GFX6-NEXT: s_add_i32 s8, s8, s36 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 236956c1829e7..f176f34f84736 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -485,13 +485,10 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) { ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[24:25] ; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, 
v[2:3] -; HSA-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[26:27] ; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; HSA-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[28:29] ; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc @@ -500,13 +497,10 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) { ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; HSA-NEXT: v_cndmask_b32_e32 v5, -1, v10, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; HSA-NEXT: v_cndmask_b32_e64 v13, -1, v26, s[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v6, -1, v12, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; HSA-NEXT: v_cndmask_b32_e64 v12, -1, v24, s[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v7, -1, v14, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; HSA-NEXT: v_cndmask_b32_e64 v14, -1, v28, s[8:9] ; HSA-NEXT: v_cndmask_b32_e32 v8, -1, v16, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; HSA-NEXT: v_cndmask_b32_e32 v9, -1, v18, vcc @@ -514,6 +508,12 @@ define <16 x ptr addrspace(5)> @addrspacecast_v16p0_to_v16p5(<16 x ptr> %ptr) { ; HSA-NEXT: v_cndmask_b32_e32 v10, -1, v20, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23] ; HSA-NEXT: v_cndmask_b32_e32 v11, -1, v22, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] +; HSA-NEXT: v_cndmask_b32_e32 v12, -1, v24, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27] +; HSA-NEXT: v_cndmask_b32_e32 v13, -1, v26, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[28:29] +; HSA-NEXT: v_cndmask_b32_e32 v14, -1, v28, vcc ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31] ; HSA-NEXT: v_cndmask_b32_e32 v15, -1, v30, vcc @@ -733,65 +733,64 @@ define <16 x ptr> @addrspacecast_v16p5_to_v16p0(<16 x ptr addrspace(5)> %ptr) { ; CI-NEXT: s_load_dword s4, s[6:7], 0x11 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; CI-NEXT: 
v_cmp_ne_u32_e64 s[8:9], -1, v7 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v31, s4 -; CI-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; CI-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc -; CI-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; CI-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; CI-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc -; CI-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; CI-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; CI-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9 -; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; CI-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; CI-NEXT: 
v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; CI-NEXT: v_mov_b32_e32 v1, v48 -; CI-NEXT: v_mov_b32_e32 v2, v35 -; CI-NEXT: v_mov_b32_e32 v3, v33 -; CI-NEXT: v_mov_b32_e32 v4, v36 -; CI-NEXT: v_mov_b32_e32 v5, v49 -; CI-NEXT: v_mov_b32_e32 v6, v37 -; CI-NEXT: v_mov_b32_e32 v7, v34 -; CI-NEXT: v_mov_b32_e32 v8, v38 -; CI-NEXT: v_mov_b32_e32 v10, v50 -; CI-NEXT: v_mov_b32_e32 v12, v39 -; CI-NEXT: v_mov_b32_e32 v14, v32 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, 
s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; CI-NEXT: v_mov_b32_e32 v1, v49 +; CI-NEXT: v_mov_b32_e32 v2, v34 +; CI-NEXT: v_mov_b32_e32 v3, v39 +; CI-NEXT: v_mov_b32_e32 v4, v35 +; CI-NEXT: v_mov_b32_e32 v5, v32 +; CI-NEXT: v_mov_b32_e32 v6, v36 +; CI-NEXT: v_mov_b32_e32 v8, v48 +; CI-NEXT: v_mov_b32_e32 v10, v37 +; CI-NEXT: v_mov_b32_e32 v12, v33 +; CI-NEXT: v_mov_b32_e32 v14, v38 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: addrspacecast_v16p5_to_v16p0: @@ -801,63 +800,62 @@ define <16 x ptr> @addrspacecast_v16p5_to_v16p0(<16 x ptr addrspace(5)> %ptr) { ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v31, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; GFX9-NEXT: 
v_cndmask_b32_e32 v37, 0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; GFX9-NEXT: v_mov_b32_e32 v1, v48 -; GFX9-NEXT: v_mov_b32_e32 v2, v35 -; 
GFX9-NEXT: v_mov_b32_e32 v3, v33 -; GFX9-NEXT: v_mov_b32_e32 v4, v36 -; GFX9-NEXT: v_mov_b32_e32 v5, v49 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v34 -; GFX9-NEXT: v_mov_b32_e32 v8, v38 -; GFX9-NEXT: v_mov_b32_e32 v10, v50 -; GFX9-NEXT: v_mov_b32_e32 v12, v39 -; GFX9-NEXT: v_mov_b32_e32 v14, v32 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, 
s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; GFX9-NEXT: v_mov_b32_e32 v1, v49 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v39 +; GFX9-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-NEXT: v_mov_b32_e32 v5, v32 +; GFX9-NEXT: v_mov_b32_e32 v6, v36 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v10, v37 +; GFX9-NEXT: v_mov_b32_e32 v12, v33 +; GFX9-NEXT: v_mov_b32_e32 v14, v38 ; GFX9-NEXT: s_setpc_b64 s[30:31] %cast = addrspacecast <16 x ptr addrspace(5)> %ptr to <16 x ptr> ret <16 x ptr> %cast @@ -939,13 +937,10 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) { ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; HSA-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[24:25] ; HSA-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; HSA-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[26:27] ; HSA-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; HSA-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[28:29] ; HSA-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v3, -1, v6, vcc @@ -954,13 +949,10 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) { ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; HSA-NEXT: v_cndmask_b32_e32 v5, -1, v10, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; HSA-NEXT: v_cndmask_b32_e64 v13, -1, v26, s[6:7] ; HSA-NEXT: v_cndmask_b32_e32 v6, -1, v12, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; HSA-NEXT: v_cndmask_b32_e64 v12, -1, v24, s[4:5] ; HSA-NEXT: v_cndmask_b32_e32 v7, -1, v14, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, 
v[16:17] -; HSA-NEXT: v_cndmask_b32_e64 v14, -1, v28, s[8:9] ; HSA-NEXT: v_cndmask_b32_e32 v8, -1, v16, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; HSA-NEXT: v_cndmask_b32_e32 v9, -1, v18, vcc @@ -968,6 +960,12 @@ define <16 x ptr addrspace(3)> @addrspacecast_v16p0_to_v16p3(<16 x ptr> %ptr) { ; HSA-NEXT: v_cndmask_b32_e32 v10, -1, v20, vcc ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23] ; HSA-NEXT: v_cndmask_b32_e32 v11, -1, v22, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] +; HSA-NEXT: v_cndmask_b32_e32 v12, -1, v24, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27] +; HSA-NEXT: v_cndmask_b32_e32 v13, -1, v26, vcc +; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[28:29] +; HSA-NEXT: v_cndmask_b32_e32 v14, -1, v28, vcc ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31] ; HSA-NEXT: v_cndmask_b32_e32 v15, -1, v30, vcc @@ -1187,65 +1185,64 @@ define <16 x ptr> @addrspacecast_v16p3_to_v16p0(<16 x ptr addrspace(3)> %ptr) { ; CI-NEXT: s_load_dword s4, s[6:7], 0x10 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7 +; CI-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; CI-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v31, s4 -; CI-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; CI-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc -; CI-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; CI-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; CI-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; CI-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; CI-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc -; CI-NEXT: v_cndmask_b32_e32 v34, 0, 
v31, vcc -; CI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; CI-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; CI-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9 -; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; CI-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19] -; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; CI-NEXT: v_mov_b32_e32 v1, v48 -; CI-NEXT: v_mov_b32_e32 v2, v35 -; CI-NEXT: v_mov_b32_e32 v3, v33 -; CI-NEXT: v_mov_b32_e32 v4, v36 -; CI-NEXT: v_mov_b32_e32 v5, v49 -; CI-NEXT: v_mov_b32_e32 v6, v37 -; CI-NEXT: v_mov_b32_e32 v7, v34 -; CI-NEXT: v_mov_b32_e32 v8, v38 -; CI-NEXT: v_mov_b32_e32 v10, v50 -; 
CI-NEXT: v_mov_b32_e32 v12, v39 -; CI-NEXT: v_mov_b32_e32 v14, v32 +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 +; CI-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; CI-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; CI-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; CI-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; CI-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; CI-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; CI-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; CI-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; CI-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; CI-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; CI-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; CI-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; CI-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; CI-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; CI-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; CI-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; CI-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; CI-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; CI-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] +; CI-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; CI-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; CI-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; CI-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; CI-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; CI-NEXT: v_mov_b32_e32 v1, v49 +; CI-NEXT: v_mov_b32_e32 v2, v34 +; CI-NEXT: v_mov_b32_e32 v3, v39 +; CI-NEXT: v_mov_b32_e32 v4, v35 +; CI-NEXT: 
v_mov_b32_e32 v5, v32 +; CI-NEXT: v_mov_b32_e32 v6, v36 +; CI-NEXT: v_mov_b32_e32 v8, v48 +; CI-NEXT: v_mov_b32_e32 v10, v37 +; CI-NEXT: v_mov_b32_e32 v12, v33 +; CI-NEXT: v_mov_b32_e32 v14, v38 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: addrspacecast_v16p3_to_v16p0: @@ -1255,63 +1252,62 @@ define <16 x ptr> @addrspacecast_v16p3_to_v16p0(<16 x ptr addrspace(3)> %ptr) { ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v31, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v48, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v33, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v39, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v49, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v35, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v32, 0, v31, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v37, 0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v34, 0, v31, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v6 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v38, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v39, 0, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[8:9] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v9 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v11 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v12 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v13 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v14 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v15 -; GFX9-NEXT: 
v_cndmask_b32_e64 v16, 0, v8, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[24:25] -; GFX9-NEXT: v_mov_b32_e32 v1, v48 -; GFX9-NEXT: v_mov_b32_e32 v2, v35 -; GFX9-NEXT: v_mov_b32_e32 v3, v33 -; GFX9-NEXT: v_mov_b32_e32 v4, v36 -; GFX9-NEXT: v_mov_b32_e32 v5, v49 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v34 -; GFX9-NEXT: v_mov_b32_e32 v8, v38 -; GFX9-NEXT: v_mov_b32_e32 v10, v50 -; GFX9-NEXT: v_mov_b32_e32 v12, v39 -; GFX9-NEXT: v_mov_b32_e32 v14, v32 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], -1, v4 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], -1, v5 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], -1, v6 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[10:11], -1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v36, 0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v37, 0, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v38, 0, v7, s[10:11] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], -1, v8 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], -1, v9 +; 
GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], -1, v10 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[18:19], -1, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[20:21], -1, v12 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[22:23], -1, v13 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[24:25], -1, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[26:27], -1, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v8, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v9, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, v10, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, v11, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, v12, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, v13, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v28, 0, v14, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v30, 0, v15, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v31, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v31, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v31, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v31, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v31, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v31, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, v31, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, v31, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, v31, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v27, 0, v31, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v29, 0, v31, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[26:27] +; GFX9-NEXT: v_mov_b32_e32 v1, v49 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v39 +; GFX9-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-NEXT: v_mov_b32_e32 v5, v32 +; GFX9-NEXT: v_mov_b32_e32 v6, v36 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v10, v37 +; GFX9-NEXT: v_mov_b32_e32 v12, v33 +; GFX9-NEXT: v_mov_b32_e32 v14, v38 ; GFX9-NEXT: s_setpc_b64 s[30:31] %cast = addrspacecast <16 x ptr addrspace(3)> %ptr to <16 x ptr> ret <16 x ptr> %cast @@ -1550,13 +1546,9 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) { ; 
HSA-LABEL: addrspacecast_v16p6_to_v16p0: ; HSA: ; %bb.0: ; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HSA-NEXT: v_mov_b32_e32 v30, v15 ; HSA-NEXT: v_mov_b32_e32 v28, v14 -; HSA-NEXT: v_mov_b32_e32 v26, v13 ; HSA-NEXT: v_mov_b32_e32 v24, v12 -; HSA-NEXT: v_mov_b32_e32 v22, v11 ; HSA-NEXT: v_mov_b32_e32 v20, v10 -; HSA-NEXT: v_mov_b32_e32 v18, v9 ; HSA-NEXT: v_mov_b32_e32 v16, v8 ; HSA-NEXT: v_mov_b32_e32 v14, v7 ; HSA-NEXT: v_mov_b32_e32 v12, v6 @@ -1569,6 +1561,10 @@ define <16 x ptr> @addrspacecast_v16p6_to_v16p0(<16 x ptr addrspace(6)> %ptr) { ; HSA-NEXT: v_mov_b32_e32 v3, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0 ; HSA-NEXT: v_mov_b32_e32 v7, 0 +; HSA-NEXT: v_mov_b32_e32 v18, v9 +; HSA-NEXT: v_mov_b32_e32 v22, v11 +; HSA-NEXT: v_mov_b32_e32 v26, v13 +; HSA-NEXT: v_mov_b32_e32 v30, v15 ; HSA-NEXT: v_mov_b32_e32 v9, 0 ; HSA-NEXT: v_mov_b32_e32 v11, 0 ; HSA-NEXT: v_mov_b32_e32 v13, 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 823db84a053b8..58bb4ef5789ec 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -104,13 +104,12 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v39, a1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v39 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v39 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_write_b32 a16, v32 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse @@ 
-366,9 +365,6 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ret void } -; FIXME: This case is broken. The asm value passed in v32 is live -; through the range where the reserved def for the copy is introduced, -; clobbering the user value. define void @v32_asm_def_use(float %v0, float %v1) #0 { ; GFX908-LABEL: v32_asm_def_use: ; GFX908: ; %bb.0: @@ -378,57 +374,48 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v[0:31] a[0:15] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def v32 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a31, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a13 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a13 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a29, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a12 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a11 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a12 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a10 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a26, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a9 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a25, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a8 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a10 ; GFX908-NEXT: v_accvgpr_write_b32 a24, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a7 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a23, v35 -; GFX908-NEXT: 
v_accvgpr_read_b32 v35, a6 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a22, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a5 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a9 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a7 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v35 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a3 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a4 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a19, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a3 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a18, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a1 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a17, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v32 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a0 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v35 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v32 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def v32 +; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND @@ -1002,13 +989,12 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v39, a1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a32, v39 ; GFX908-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_read_b32 v33, a1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v39 ; Reload Reuse ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: v_accvgpr_write_b32 a32, v33 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_write_b32 a13, v36 ; Reload Reuse diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index a6d8c6f41eee5..3e19ee5567929 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 77 -; TRAP-HANDLER-DISABLE: NumSgprs: 92 +; TRAP-HANDLER-ENABLE: NumSgprs: 61 +; TRAP-HANDLER-DISABLE: NumSgprs: 77 define amdgpu_kernel void @amdhsa_trap_num_sgprs( ptr addrspace(1) %out0, i32 %in0, ptr addrspace(1) %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index bc359d6ff3aaa..4ccf92e68c835 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -662,14 +662,12 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 ; GCN-NEXT: 
v_add_i32_e32 v15, vcc, 0x5c, v0 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen @@ -677,9 +675,9 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen @@ -687,60 +685,63 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; 
GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 ; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; GCN-NEXT: 
buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword 
v13, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -758,14 +759,6 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0 -; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0 -; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0 -; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0 -; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen @@ -809,26 +802,34 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: buffer_store_dword v10, v21, 
s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1335,83 +1336,83 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GCN-NEXT: v_alignbit_b32 v21, 
v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GCN-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v10, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GCN-NEXT: 
v_mul_f32_e32 v7, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v9, v0, v1, 16 -; GCN-NEXT: v_alignbit_b32 v8, v6, v7, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v7, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v14, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v14, v0, v14, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_mov_b32 
s6, 0 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16 +; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17 -; GCN-NEXT: v_alignbit_b32 v17, v6, v18, 16 -; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26 +; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1421,78 +1422,78 @@ define void @v_store_global_v32bf16(<32 
x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; 
GFX7-NEXT: v_alignbit_b32 v11, v7, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v28, v7, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 ; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 -; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20 ; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; 
GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 -; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 @@ -1564,207 +1565,203 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[16:17], s[4:7], 0 addr64 offset:32 
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 ; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16 +; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 ; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 -; 
GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16 +; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16 +; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16 +; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 +; GCN-NEXT: s_waitcnt vmcnt(14) +; 
GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(13) +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16 +; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16 +; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16 +; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16 +; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_alignbit_b32 v3, v1, v6, 16 -; 
GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_alignbit_b32 v1, v1, v13, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v6, v5, v19, 16 -; GCN-NEXT: v_alignbit_b32 v5, v13, v21, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v22 -; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16 +; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16 +; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16 +; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v29, off, 
s[0:3], s32 offset:48 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_alignbit_b32 v7, v8, v15, 16 -; GCN-NEXT: v_alignbit_b32 v11, v9, v20, 16 -; GCN-NEXT: v_alignbit_b32 v10, v21, v10, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v14, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; 
GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v15, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v14, v19, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: 
buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -1780,24 +1777,27 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 @@ -1832,16 +1832,97 @@ define void 
@v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 ; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39 +; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49 +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 +; GFX7-NEXT: 
v_mul_f32_e32 v35, 1.0, v48 +; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 +; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 +; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 +; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 +; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 +; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1852,124 +1933,39 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 ; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 ; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: 
v_mul_f32_e32 v0, 1.0, v37 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v28 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v39 -; GFX7-NEXT: v_alignbit_b32 v36, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v48 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; GFX7-NEXT: v_alignbit_b32 v35, v18, v19, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v0, v1, 16 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v33, v6, v14, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29 -; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16 -; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 -; GFX7-NEXT: 
v_mul_f32_e32 v14, 1.0, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44 -; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25 -; GFX7-NEXT: v_alignbit_b32 v16, v16, v20, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v20, 16 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(13) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GFX7-NEXT: v_alignbit_b32 v20, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(11) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(10) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v35 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt 
vmcnt(4) ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v34 -; GFX7-NEXT: v_alignbit_b32 v25, v0, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v24, v22, v23, 16 -; GFX7-NEXT: v_alignbit_b32 v23, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v22, v0, v1, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[31:32], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[31:32], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 @@ -4880,12 +4876,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_addk_i32 
s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v21, s30, 0 -; GCN-NEXT: v_writelane_b32 v21, s31, 1 +; GCN-NEXT: v_writelane_b32 v20, s30, 0 +; GCN-NEXT: v_writelane_b32 v20, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -4911,36 +4907,36 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 22, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 18, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 14, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v16 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16 
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_store_short v11, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 6, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v16 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_store_short v10, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 2, v16 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4951,30 +4947,30 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_store_short v9, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v11, s[0:3], 0 offen +; GCN-NEXT: 
buffer_store_short v3, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v21, 1 -; GCN-NEXT: v_readlane_b32 s30, v21, 0 +; GCN-NEXT: v_readlane_b32 s31, v20, 1 +; GCN-NEXT: v_readlane_b32 s30, v20, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 @@ -5365,10 +5361,10 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 ; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) @@ -5587,20 +5583,20 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v25, v0, 
s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 @@ -5617,11 +5613,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ 
-7618,197 +7614,197 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26 ; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28 ; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30 -; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50 -; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52 -; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54 -; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56 -; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58 -; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60 -; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62 +; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50 +; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52 +; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54 +; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56 +; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58 +; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60 +; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62 ; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 ; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 ; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 -; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40 -; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42 +; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 +; GCN-NEXT: buffer_load_ushort 
v32, v[1:2], s[4:7], 0 addr64 offset:42 ; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 ; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: 
buffer_store_dword v2, v28, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0 +; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0 ; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0 -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0 +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0 +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], 
v1 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0 +; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0 +; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 
v1, 16, v21 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 +; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: 
v_add_i32_e32 v26, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0 ; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 ; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 +; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 ; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 
v24, vcc, 16, v0 +; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0 ; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -7824,34 +7820,34 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen ; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9 ; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36 -; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13 +; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14 ; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15 ; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v30, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -7864,258 +7860,258 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62 -; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60 -; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58 -; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56 -; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54 -; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52 -; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50 -; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48 -; 
GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34 -; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36 -; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38 -; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 -; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 -; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 -; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 -; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2 -; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6 -; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8 -; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10 -; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12 +; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62 +; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60 +; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58 +; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56 +; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54 +; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52 +; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50 +; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34 +; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36 +; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38 +; GFX7-NEXT: buffer_load_ushort v28, v[1:2], 
s[4:7], 0 addr64 offset:40 +; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42 +; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44 +; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46 +; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2 +; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6 +; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 +; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12 ; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14 -; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18 -; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20 -; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22 -; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24 -; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26 -; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30 +; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18 +; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20 +; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22 +; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24 +; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26 +; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28 +; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30 ; GFX7-NEXT: s_waitcnt 
vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0 -; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xd8, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0 -; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0 -; 
GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0 +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 -; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; 
GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0 -; 
GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 
0x94, v0 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0 -; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0 -; GFX7-NEXT: buffer_store_dword v20, v15, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX7-NEXT: 
v_add_i32_e32 v16, vcc, 0x74, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0 -; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0 -; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 +; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0 +; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8 -; GFX7-NEXT: v_add_i32_e32 v11, 
vcc, 0x60, v0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0 -; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0 +; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0 +; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7 +; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; GFX7-NEXT: 
buffer_store_dword v5, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0 -; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 -; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0 ; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; 
GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 ; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: global_extload_v32bf16_to_v32f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, 
vcc, 2, v1 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1 ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1 ; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1 ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1 ; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1 @@ -8126,469 +8122,473 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1 ; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1 ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1 +; GFX8-NEXT: v_add_u32_e32 
v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1 -; GFX8-NEXT: flat_load_ushort v43, v[1:2] +; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, 
v1 +; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1 +; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 ; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v44, v[50:51] +; GFX8-NEXT: flat_load_ushort v45, v[50:51] ; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v45, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1 +; GFX8-NEXT: flat_load_ushort v46, v[50:51] +; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1 ; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v46, v[52:53] -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1 +; GFX8-NEXT: flat_load_ushort v47, v[52:53] +; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1 ; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v47, v[54:55] -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1 +; GFX8-NEXT: flat_load_ushort v56, v[54:55] +; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1 ; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v56, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_ushort v57, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1 +; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 ; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1 -; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v41, v[41:42] -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1 -; GFX8-NEXT: 
v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v42, v[9:10] -; GFX8-NEXT: flat_load_ushort v9, v[35:36] -; GFX8-NEXT: flat_load_ushort v10, v[37:38] -; GFX8-NEXT: flat_load_ushort v35, v[48:49] -; GFX8-NEXT: flat_load_ushort v36, v[50:51] -; GFX8-NEXT: flat_load_ushort v37, v[52:53] -; GFX8-NEXT: flat_load_ushort v48, v[54:55] -; GFX8-NEXT: flat_load_ushort v39, v[39:40] -; GFX8-NEXT: flat_load_ushort v49, v[1:2] -; GFX8-NEXT: flat_load_ushort v50, v[3:4] -; GFX8-NEXT: flat_load_ushort v51, v[5:6] -; GFX8-NEXT: flat_load_ushort v52, v[7:8] -; GFX8-NEXT: flat_load_ushort v53, v[11:12] -; GFX8-NEXT: flat_load_ushort v38, v[13:14] -; GFX8-NEXT: flat_load_ushort v14, v[17:18] -; GFX8-NEXT: flat_load_ushort v11, v[21:22] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0 -; GFX8-NEXT: flat_load_ushort v15, v[15:16] -; GFX8-NEXT: flat_load_ushort v13, v[19:20] -; GFX8-NEXT: flat_load_ushort v8, v[23:24] -; GFX8-NEXT: flat_load_ushort v6, v[25:26] -; GFX8-NEXT: flat_load_ushort v5, v[27:28] -; GFX8-NEXT: flat_load_ushort v7, v[29:30] -; GFX8-NEXT: flat_load_ushort v12, v[31:32] -; GFX8-NEXT: flat_load_ushort v16, v[33:34] -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xc4, v0 -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0 +; GFX8-NEXT: flat_load_ushort v58, v[39:40] +; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1 +; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1 +; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v42, v[42:43] +; GFX8-NEXT: flat_load_ushort v34, v[33:34] +; GFX8-NEXT: flat_load_ushort v36, v[35:36] +; GFX8-NEXT: flat_load_ushort v38, v[37:38] +; GFX8-NEXT: flat_load_ushort v39, v[48:49] +; GFX8-NEXT: flat_load_ushort v48, v[50:51] +; GFX8-NEXT: flat_load_ushort v51, v[52:53] +; GFX8-NEXT: 
flat_load_ushort v52, v[54:55] +; GFX8-NEXT: flat_load_ushort v53, v[40:41] +; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1 +; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v37, v[3:4] +; GFX8-NEXT: flat_load_ushort v35, v[5:6] +; GFX8-NEXT: flat_load_ushort v33, v[7:8] +; GFX8-NEXT: flat_load_ushort v8, v[9:10] +; GFX8-NEXT: flat_load_ushort v6, v[11:12] +; GFX8-NEXT: flat_load_ushort v4, v[13:14] +; GFX8-NEXT: flat_load_ushort v2, v[15:16] +; GFX8-NEXT: flat_load_ushort v1, v[19:20] +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0 ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0 -; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, 
v56 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0 -; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0 -; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 +; GFX8-NEXT: flat_load_ushort v3, v[17:18] +; GFX8-NEXT: flat_load_ushort v5, v[21:22] +; GFX8-NEXT: flat_load_ushort v7, v[23:24] +; GFX8-NEXT: flat_load_ushort v9, v[25:26] +; GFX8-NEXT: flat_load_ushort v10, v[27:28] +; GFX8-NEXT: flat_load_ushort v11, v[29:30] +; GFX8-NEXT: flat_load_ushort v12, v[31:32] +; GFX8-NEXT: flat_load_ushort v13, v[49:50] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46 +; GFX8-NEXT: v_cvt_f64_f32_e32 
v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0 +; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0 -; GFX8-NEXT: 
buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 +; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0 +; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; GFX8-NEXT: 
v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0 +; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0 +; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0 +; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0 +; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0 -; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 
offen -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0 -; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 -; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0 -; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25 -; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0 -; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0 -; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0 -; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0 -; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0 -; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX8-NEXT: 
buffer_store_dword v13, v9, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0 -; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0 -; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x6c, v0 -; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0 -; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0 -; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0 +; GFX8-NEXT: 
buffer_store_dword v18, v11, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0 +; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0 -; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0 -; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 -; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0 -; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, 0x44, v0 -; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0 -; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0 -; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0 -; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0 -; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0 -; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0 -; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0 -; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0 -; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0 -; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0 -; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0 -; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0 +; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; GFX8-NEXT: buffer_store_dword 
v6, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_extload_v32bf16_to_v32f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62 -; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60 -; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58 -; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56 -; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54 -; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52 -; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50 -; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48 -; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:46 -; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44 -; GFX9-NEXT: global_load_ushort v32, v[1:2], off 
offset:42 -; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40 -; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38 -; GFX9-NEXT: global_load_ushort v19, v[1:2], off -; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36 -; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4 -; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34 -; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32 -; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6 -; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30 +; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62 +; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60 +; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58 +; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56 +; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54 +; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52 +; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:50 +; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48 +; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:46 +; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44 +; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42 +; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40 +; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38 +; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36 +; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34 +; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32 +; GFX9-NEXT: global_load_ushort v25, v[1:2], off +; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2 +; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30 ; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 ; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 ; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 ; GFX9-NEXT: global_load_ushort v6, 
v[1:2], off offset:22 -; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 -; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26 -; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28 -; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10 +; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24 +; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26 +; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28 +; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4 +; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6 +; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8 +; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10 ; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 ; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11 ; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v11, 16, v13 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 ; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 ; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v25 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 
offen offset:216 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX9-NEXT: s_waitcnt vmcnt(32) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29 -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 
offen offset:152 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20 +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:188 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144 
+; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 +; GFX9-NEXT: s_waitcnt vmcnt(43) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX9-NEXT: s_waitcnt vmcnt(38) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2 +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 ; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 +; GFX9-NEXT: s_waitcnt vmcnt(41) +; 
GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 ; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 
offen offset:20 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; @@ -8612,179 +8612,177 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26 ; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28 ; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30 -; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32 -; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34 -; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36 -; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38 -; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40 -; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42 -; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44 -; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46 -; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48 -; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62 -; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50 -; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52 -; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54 -; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60 -; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:56 -; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58 +; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62 +; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32 +; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34 +; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36 +; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60 +; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38 +; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40 +; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58 +; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42 +; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44 +; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56 +; GFX10-NEXT: global_load_ushort v30, 
v[1:2], off offset:46 +; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48 +; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54 +; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50 +; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52 ; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3 ; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4 ; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6 ; GFX10-NEXT: s_waitcnt vmcnt(27) -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7 ; GFX10-NEXT: s_waitcnt vmcnt(26) -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8 ; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 ; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11 ; GFX10-NEXT: s_waitcnt vmcnt(22) -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12 ; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13 ; GFX10-NEXT: s_waitcnt vmcnt(20) -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14 -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v54, 16, v14 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36 +; GFX10-NEXT: s_waitcnt vmcnt(17) +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17 +; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18 ; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19 ; GFX10-NEXT: s_waitcnt vmcnt(14) ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20 ; GFX10-NEXT: s_waitcnt vmcnt(13) -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21 ; GFX10-NEXT: s_waitcnt vmcnt(12) -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22 ; GFX10-NEXT: s_waitcnt vmcnt(11) -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23 -; GFX10-NEXT: s_waitcnt vmcnt(10) -; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25 ; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26 ; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27 -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v11, 16, v32 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70 -; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 
offen offset:244 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword 
v11, v0, s[0:3], 0 offen offset:216 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:180 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136 -; GFX10-NEXT: 
buffer_store_dword v67, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124 ; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 ; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108 ; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92 
-; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80 -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24 -; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:20 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: 
buffer_store_dword v12, v0, s[0:3], 0 offen offset:76 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_extload_v32bf16_to_v32f64: @@ -10059,55 +10057,47 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_add_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 
v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_add_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, 
off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -10116,6 +10106,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_add_f32_e32 v10, v10, v26 +; GCN-NEXT: v_add_f32_e32 v9, v9, v25 +; GCN-NEXT: v_add_f32_e32 v8, v8, v24 +; GCN-NEXT: v_add_f32_e32 v7, v7, v23 +; GCN-NEXT: v_add_f32_e32 v6, v6, v22 +; GCN-NEXT: v_add_f32_e32 v5, v5, v21 +; GCN-NEXT: v_add_f32_e32 v4, v4, v20 ; GCN-NEXT: v_add_f32_e32 v3, v3, v19 ; GCN-NEXT: v_add_f32_e32 v2, v2, v18 ; GCN-NEXT: v_add_f32_e32 v1, v1, v17 @@ -10135,7 +10133,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: 
v_add_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -10145,20 +10143,22 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -10169,25 +10169,24 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -10212,7 +10211,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 @@ -10231,7 +10229,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -11689,10 +11687,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: 
v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -11995,278 +11993,278 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_add_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; 
GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_add_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_add_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_add_f32_e32 v34, v34, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_add_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_add_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_add_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_add_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: 
v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_add_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_add_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_add_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_add_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 -; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: 
v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_add_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_add_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_add_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_add_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_add_f32_e32 v20, v29, v36 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 
0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_add_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 
v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, 
v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; 
GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 
0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; 
GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 
0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_add_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_add_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, 
v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -14496,55 +14494,47 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 
1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -14553,6 +14543,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 +; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 +; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 +; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 +; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 +; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 ; GCN-NEXT: v_mul_f32_e32 v3, v3, v19 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v18 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v17 @@ -14572,7 +14570,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -14582,20 +14580,22 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 
v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -14606,25 +14606,24 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -14649,7 +14648,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 
v9, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 @@ -14668,7 +14666,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -16126,10 +16124,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -16432,278 +16430,278 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 
0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_mul_f32_e32 v49, 
v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_mul_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_mul_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_mul_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_mul_f32_e32 v34, v34, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_mul_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_mul_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 
v39, v53, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 -; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_mul_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_mul_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_mul_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_mul_f32_e32 v20, 
v29, v36 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_mul_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: 
v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, 
v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: 
v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: 
v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: 
v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18574,55 +18572,47 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_min_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_min_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_min_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 
0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_min_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_min_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_min_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_min_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_min_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 
0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_min_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -18631,6 +18621,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_min_f32_e32 v10, v10, v26 +; GCN-NEXT: v_min_f32_e32 v9, v9, v25 +; GCN-NEXT: v_min_f32_e32 v8, v8, v24 +; GCN-NEXT: v_min_f32_e32 v7, v7, v23 +; GCN-NEXT: v_min_f32_e32 v6, v6, v22 +; GCN-NEXT: v_min_f32_e32 v5, v5, v21 +; GCN-NEXT: v_min_f32_e32 v4, v4, v20 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 @@ -18650,7 +18648,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -18660,20 +18658,22 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: 
v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -18684,25 +18684,24 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: 
v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -18727,7 +18726,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 @@ -18746,7 +18744,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -20204,10 +20202,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -20510,278 +20508,278 @@ define <32 x bfloat> 
@v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 
v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_min_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_min_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; 
GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_min_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_min_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_min_f32_e32 v34, v34, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_min_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_min_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_min_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_min_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_min_f32_e32 v35, 
v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_min_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_min_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_min_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_min_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 -; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_min_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_min_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_min_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_min_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; 
GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_min_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_min_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_min_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_min_f32_e32 v20, v29, v36 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: 
v_or_b32_e32 v30, 0x400000, v35 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_min_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, 
vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: 
v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 
0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_min_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_min_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22193,55 
+22191,47 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_max_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_max_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_max_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_max_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_max_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_max_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_max_f32_e32 v5, v5, v21 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: 
v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_max_f32_e32 v11, v11, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_max_f32_e32 v4, v4, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 @@ -22250,6 +22240,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_max_f32_e32 v10, v10, v26 +; GCN-NEXT: v_max_f32_e32 v9, v9, v25 +; GCN-NEXT: v_max_f32_e32 v8, v8, v24 +; GCN-NEXT: v_max_f32_e32 v7, v7, v23 +; GCN-NEXT: v_max_f32_e32 v6, v6, v22 
+; GCN-NEXT: v_max_f32_e32 v5, v5, v21 +; GCN-NEXT: v_max_f32_e32 v4, v4, v20 ; GCN-NEXT: v_max_f32_e32 v3, v3, v19 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 @@ -22269,7 +22267,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -22279,20 +22277,22 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -22303,25 +22303,24 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: 
v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -22346,7 +22345,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 @@ -22365,7 +22363,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -23823,10 +23821,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -24129,278 +24127,278 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_and_b32_e32 
v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 ; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 +; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; GFX10-NEXT: v_max_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 +; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 +; GFX10-NEXT: v_max_f32_e32 v27, v50, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; GFX10-NEXT: v_max_f32_e32 v29, v38, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX10-NEXT: v_max_f32_e32 v28, v48, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; GFX10-NEXT: v_max_f32_e32 
v34, v34, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_max_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_max_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_max_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_max_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 -; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 -; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; GFX10-NEXT: v_max_f32_e32 v30, v36, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_max_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_max_f32_e32 v18, v48, v23 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_max_f32_e32 v17, v26, v50 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 -; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 -; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 -; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 -; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 -; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff -; GFX10-NEXT: 
v_cmp_u_f32_e64 s12, v10, v10 -; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_max_f32_e32 v17, v50, v22 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_max_f32_e32 v20, v36, v25 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_max_f32_e32 v19, v28, v38 -; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 -; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 -; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 -; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 -; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 -; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_max_f32_e32 v19, v38, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_max_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_max_f32_e32 v20, v29, v36 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 -; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 -; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; 
GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 -; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 -; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 -; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 -; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 -; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 -; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 -; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 -; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 -; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 -; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 -; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 -; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 -; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 -; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_max_f32_e32 v21, v51, v26 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 +; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 +; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; 
GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 +; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 +; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 +; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 +; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 +; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 +; 
GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 +; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 +; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 -; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 -; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 -; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 -; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff -; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 -; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v29, v29 +; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 -; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 -; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 -; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 -; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 -; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 -; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 -; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 -; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 -; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 -; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff -; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX10-NEXT: v_cmp_u_f32_e64 
s5, v4, v4 -; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 -; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 -; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 -; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 -; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 -; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 -; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff -; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 +; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 -; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 -; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 -; 
GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 -; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 -; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 -; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 +; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 +; 
GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; GFX10-NEXT: v_max_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_max_f32_e32 v15, v15, v18 -; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX10-NEXT: 
v_cmp_u_f32_e64 s4, v15, v15 -; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff -; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff +; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo +; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -35657,81 +35655,81 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-LABEL: v_select_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_mul_f32_e32 
v2, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v20, 
1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16 +; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16 ; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16 +; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16 ; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, 
vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -35764,67 +35762,67 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23 ; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: 
v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27 +; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29 -; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -35833,21 +35831,21 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: 
s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -37134,30 +37132,30 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GCN-LABEL: v_vselect_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_and_b32_e32 v3, 1, v3 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_and_b32_e32 v5, 1, v5 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: 
v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_and_b32_e32 v6, 1, v6 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 @@ -37188,45 +37186,45 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc +; GFX7-NEXT: v_mul_f32_e32 v12, 
1.0, v19 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -37495,16 +37493,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v1, 1, v10 ; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: v_and_b32_e32 v3, 1, v11 -; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 -; GCN-NEXT: v_and_b32_e32 v5, 1, v12 -; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5 +; GCN-NEXT: v_and_b32_e32 v2, 1, v11 +; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; 
GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v3, 1, v12 +; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3 ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v7, 1, v13 ; GCN-NEXT: v_and_b32_e32 v8, 1, v14 ; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 @@ -37571,22 +37569,22 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13] ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 @@ -37612,151 +37610,136 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-LABEL: v_vselect_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v7 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v13 +; GFX7-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v12 +; GFX7-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX7-NEXT: v_cmp_eq_u32_e64 
s[14:15], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v13 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 -; GFX7-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX7-NEXT: v_writelane_b32 v31, s34, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_and_b32_e32 v5, 1, v15 -; GFX7-NEXT: v_writelane_b32 v31, s35, 3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31] -; GFX7-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29] -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27] -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25] -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[12:13] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_readlane_b32 s35, v31, 3 -; GFX7-NEXT: v_readlane_b32 s34, v31, 2 -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21] -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX7-NEXT: s_waitcnt 
vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19] -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17] -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15] -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v8, v7, s[10:11] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v13, v8, v7, s[8:9] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[6:7] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v8, v7, s[4:5] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v10, v8, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 
offset:28 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21 +; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v9, v8, v7, s[18:19] +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13] -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[16:17] +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc +; GFX7-NEXT: buffer_load_dword v20, 
off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[14:15] +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9] +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v16, vcc +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v18, v16, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_vselect_v16bf16: @@ -37787,53 
+37770,51 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 ; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v13 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 ; GFX8-NEXT: v_writelane_b32 v31, s31, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v30 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 ; GFX8-NEXT: v_writelane_b32 v31, s34, 2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 ; GFX8-NEXT: v_writelane_b32 v31, s35, 3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21] +; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, 
v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v2, s[20:21] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v29 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[24:25] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v4, s[16:17] ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v27, v19, s[14:15] -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v26, v18, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11] ; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7] -; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v12, v0, v23, s[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, 
s[34:35] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13] @@ -37846,11 +37827,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11 ; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readlane_b32 s35, v31, 3 ; GFX8-NEXT: v_readlane_b32 s34, v31, 2 ; GFX8-NEXT: v_readlane_b32 s31, v31, 1 @@ -37864,81 +37847,81 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX9-LABEL: v_vselect_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v14 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v4 -; GFX9-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[12:13] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[16:17] -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v30, v22, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[18:19] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v20, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v14, v4, v23, s[20:21] 
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[22:23] -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[24:25] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v4, v15, s4 -; GFX9-NEXT: v_perm_b32 v3, v11, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v9, v10, s4 -; GFX9-NEXT: v_perm_b32 v7, v13, v14, s4 +; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4 +; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4 +; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4 +; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: @@ -37955,13 +37938,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29 
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20 ; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 @@ -37970,13 +37953,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v25 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v24 +; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v24 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo @@ -37995,11 +37978,11 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v51, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v30, 
vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo @@ -38012,12 +37995,12 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v16bf16: @@ -39408,219 +39391,206 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-LABEL: v_vselect_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX10-NEXT: s_clause 0xa +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_ushort v35, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 +; GFX10-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; GFX10-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v18 ; GFX10-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v19 ; GFX10-NEXT: v_and_b32_e32 v26, 1, v26 ; GFX10-NEXT: v_and_b32_e32 v24, 1, v24 ; GFX10-NEXT: v_and_b32_e32 v22, 1, v22 ; GFX10-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 ; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX10-NEXT: s_clause 0x14 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_ushort v33, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword 
v54, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v26 -; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v24 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v22 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 -; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v20 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v18 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v16 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 ; 
GFX10-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX10-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX10-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX10-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v10 -; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v8 -; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 -; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 -; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 -; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21 -; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17 -; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15 -; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13 -; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11 -; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9 -; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX10-NEXT: s_waitcnt vmcnt(31) -; 
GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_and_b32_e32 v2, 1, v33 -; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5 -; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7 -; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39 -; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36 -; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10 -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54 -; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9 -; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_cndmask_b32_e64 v11, v64, v36, s8 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55 -; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53 -; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52 -; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11 -; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50 +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12 -; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49 -; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13 -; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68 -; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 
16, v33 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_cndmask_b32_e64 v18, v34, v33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15 -; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16 -; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67 +; GFX10-NEXT: v_and_b32_e32 v35, 1, v35 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17 -; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14 -; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66 -; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18 -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65 +; GFX10-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v35 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v34 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v32, v31, s6 +; GFX10-NEXT: s_clause 0x6 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; GFX10-NEXT: v_cndmask_b32_e64 v30, v50, v30, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 
offset:124 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX10-NEXT: v_and_b32_e32 v28, 1, v29 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s5 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v52 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi -; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4 -; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100 -; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100 -; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100 -; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100 -; GFX10-NEXT: 
v_perm_b32 v13, v66, v13, 0x5040100 -; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100 -; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v29, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v28, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX10-NEXT: v_and_b32_e32 v26, 1, v27 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX10-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX10-NEXT: v_and_b32_e32 v24, 1, v25 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX10-NEXT: v_cndmask_b32_e32 v24, v36, v37, vcc_lo +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX10-NEXT: v_and_b32_e32 v22, 1, v23 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49 +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v23, v49, v36, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v53 +; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v36, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v48 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v39 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; GFX10-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v16, v36, v37, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v38, v39, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX10-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v38, v39, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cndmask_b32_e32 v10, v36, v37, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
1, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v8, v38, v39, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v53, v48, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v34, v52, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v32, v33, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v37, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v49, v48, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v31, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v34, v50, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v30, v12, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v39, vcc_lo +; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v22, v23, 0x5040100 
+; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v13, v18, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v24, v25, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v26, v27, 0x5040100 +; GFX10-NEXT: v_perm_b32 v14, v28, v29, 0x5040100 +; GFX10-NEXT: v_perm_b32 v15, v35, v54, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index dd9c9a3699b4f..05c2e0077f4ae 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s44, s[8:9], 0x2 +; CHECK-NEXT: s_load_dword s27, s[8:9], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] ; CHECK-NEXT: s_add_u32 s96, s96, s15 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_eq_u32 s44, 0 +; CHECK-NEXT: s_cmp_eq_u32 s27, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND @@ -971,10 +971,10 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v1, s98, 3 ; CHECK-NEXT: v_writelane_b32 v0, s92, 61 ; CHECK-NEXT: v_writelane_b32 v1, s99, 4 -; CHECK-NEXT: s_mov_b32 s49, s12 +; CHECK-NEXT: s_mov_b32 s31, s12 ; CHECK-NEXT: v_writelane_b32 v0, s93, 62 ; CHECK-NEXT: v_writelane_b32 v1, s100, 5 -; CHECK-NEXT: s_cmp_eq_u32 s49, 0 +; CHECK-NEXT: s_cmp_eq_u32 s31, 0 ; CHECK-NEXT: v_writelane_b32 v0, s94, 63 ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 0009a84765639..56ecfa298a348 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ 
b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -2487,10 +2487,10 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; SI-NEXT: v_or_b32_e32 v1, v31, v1 ; SI-NEXT: v_or_b32_e32 v5, v27, v5 ; SI-NEXT: v_or_b32_e32 v9, v23, v9 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_and_b32_e32 v17, 3, v28 ; SI-NEXT: v_and_b32_e32 v18, 3, v24 -; SI-NEXT: v_and_b32_e32 v20, 3, v20 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 +; SI-NEXT: v_and_b32_e32 v19, 3, v20 ; SI-NEXT: v_and_b32_e32 v16, 3, v16 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_and_b32_e32 v12, 3, v12 @@ -2502,7 +2502,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; SI-NEXT: v_and_b32_e32 v0, 3, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v3, v18, v5 -; SI-NEXT: v_or_b32_e32 v5, v20, v9 +; SI-NEXT: v_or_b32_e32 v5, v19, v9 ; SI-NEXT: v_or_b32_e32 v7, v16, v13 ; SI-NEXT: v_or_b32_e32 v9, v12, v14 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir index 00eb2b7e1aa8d..4945c7020ca18 100644 --- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -49,39 +49,39 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = 
IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec - ; CHECK-NEXT: dead undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] - ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1 + ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF3]].sub0, [[DEF5]].sub0, 0, implicit $exec + ; CHECK-NEXT: dead undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF3]].sub1, [[DEF5]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF8]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] + ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub1 ; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]].sub0 - ; CHECK-NEXT: dead 
[[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF6]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF7]], 288, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[DEF5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]] + ; CHECK-NEXT: undef [[DEF4:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]] ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir index cdd4c72f3717f..8a1c68b3f6615 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -24,7 +24,7 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -32,10 +32,9 @@ body: | ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF]], implicit $mode, implicit $exec ; CHECK-NEXT: 
[[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -51,33 +50,34 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec - ; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 
[[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF12]], implicit $mode, implicit $exec ; CHECK-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF13:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = COPY [[DEF11]] + ; CHECK-NEXT: $vgpr0 = COPY [[DEF10]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: $vgpr1 = COPY [[DEF7]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF6]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MUL_F32_e32_1]] ; CHECK-NEXT: $vgpr1 = COPY [[V_MUL_F32_e32_2]] ; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_3]] - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 - ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit 
$mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF13]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF11]], [[DEF8]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF3]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF14]], [[DEF9]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index d9182d7ace8bf..59bc7f332bf1e 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -152,38 +152,38 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v14, v30 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v14, 31, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v5, v15, v31 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v15 ; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v3 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v14 ; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v21 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc -; GFX9-NEXT: v_and_b32_e32 v14, v30, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v14, vcc +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 +; GFX9-NEXT: v_and_b32_e32 v6, v30, v0 ; GFX9-NEXT: v_and_b32_e32 v14, v30, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v6, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc -; GFX9-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 ; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 @@ -1227,13 +1227,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, 
vcc, v0, v16 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc ; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc ; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5 ; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17 @@ -1245,8 +1245,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 ; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12 -; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 +; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v10 +; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v11 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 ; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 @@ -1258,15 +1258,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] ; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v8 ; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] -; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9 ; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 +; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v10 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v11 ; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1 ; 
GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7] @@ -1291,10 +1291,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 ; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 @@ -1309,23 +1309,23 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc ; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 -; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] -; GFX9-G-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 -; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v12 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v12, v[10:11] +; GFX9-G-NEXT: v_add_u32_e32 v13, 0xffffffc0, v12 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v12, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] -; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v13, v[8:9] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 ; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 
0, v7, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc -; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GFX9-G-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 @@ -1336,13 +1336,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] ; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -1352,54 +1352,54 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] -; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: 
v_addc_co_u32_e32 v27, vcc, -1, v5, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v13 ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] -; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v11 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v24, v2 ; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc -; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12 -; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12 -; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19 -; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v25, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v26, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v27, v1, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v10 +; GFX9-G-NEXT: v_and_b32_e32 v10, v28, v18 +; GFX9-G-NEXT: v_and_b32_e32 v11, v28, v19 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v10 +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v11, vcc ; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 +; GFX9-G-NEXT: v_and_b32_e32 v3, v28, v5 
; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc -; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5 -; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v3, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8 +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow @@ -1407,9 +1407,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: .LBB0_5: ; %Flow2 ; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 +; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v4 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 ; GFX9-G-NEXT: .LBB0_6: ; %Flow3 @@ -1418,9 +1418,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v12, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, 
v1, v3, vcc -; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v4, v13, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -2439,16 +2439,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshlrev_b64 v[26:27], 1, v[10:11] ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_or_b32_e32 v10, v16, v26 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v10, v16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v11, v17, v27 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v17 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v16 ; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc @@ -2457,20 +2456,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_and_b32_e32 v16, v26, v4 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16 ; GFX9-NEXT: v_and_b32_e32 v16, v26, v5 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v16, v26, v6 -; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 -; GFX9-NEXT: v_and_b32_e32 v12, v26, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v12, vcc +; GFX9-NEXT: v_and_b32_e32 v16, v26, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc ; GFX9-NEXT: 
v_addc_co_u32_e32 v20, vcc, -1, v20, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 ; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 @@ -3506,37 +3506,37 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15 ; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12 ; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13 -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17] -; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v3 -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v2 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v9 -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-G-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 -; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v22, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v23, v11, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v24, v12, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v25, v13, vcc -; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18 -; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5] -; GFX9-G-NEXT: v_and_b32_e32 v10, v3, v4 -; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5] -; GFX9-G-NEXT: v_and_b32_e32 v16, v3, v5 -; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10 -; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5] -; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-G-NEXT: v_and_b32_e32 v17, v3, v6 -; GFX9-G-NEXT: v_and_b32_e32 v26, v3, v7 -; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v11, v16, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[16:17] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, 
v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v22, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v23, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v24, v10, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v11, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v12, 31, v12 +; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v4 +; GFX9-G-NEXT: v_and_b32_e32 v16, v12, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v13 +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v6 +; GFX9-G-NEXT: v_and_b32_e32 v17, v12, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v10, v13, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v11, v17, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 +; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 ; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 -; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 +; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12 ; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 -; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc -; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 691f3d36bc736..8d65fa053eaa4 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -6,430 +6,430 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x 
i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_sdiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; SDAG-NEXT: v_ashrrev_i32_e32 v27, 31, v11 ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v28, v26 -; SDAG-NEXT: v_mov_b32_e32 v29, v27 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v26, v24 +; SDAG-NEXT: v_mov_b32_e32 v27, v25 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v16 -; SDAG-NEXT: v_ffbh_u32_e32 v18, v17 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 0, v8 -; SDAG-NEXT: v_or_b32_e32 v0, v16, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v2 -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v1 +; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v20 +; SDAG-NEXT: v_ffbh_u32_e32 v2, v21 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 +; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 +; SDAG-NEXT: v_or_b32_e32 v1, v21, v17 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v17, v3 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v21 -; SDAG-NEXT: 
v_min_u32_e32 v18, v22, v18 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v3 -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v20, s[4:5] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_min_u32_e32 v1, v21, v22 -; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v9, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v18, v8, v1, vcc -; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v20, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v8, v31, v0 +; SDAG-NEXT: v_min_u32_e32 v2, v19, v2 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] +; SDAG-NEXT: v_min_u32_e32 v1, v19, v22 +; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2 +; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v3, v29 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v28 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7] +; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v9 -; SDAG-NEXT: v_or_b32_e32 v9, v30, v1 +; SDAG-NEXT: 
v_or_b32_e32 v3, v28, v1 +; SDAG-NEXT: v_min_u32_e32 v8, v8, v19 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_min_u32_e32 v20, v20, v21 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v11, v21 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v20 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v10, vcc -; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 -; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v9, v19 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v19, v1 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_min_u32_e32 v2, v11, v19 +; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 +; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2 +; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v18, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v18, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 
+; SDAG-NEXT: v_or_b32_e32 v9, v3, v11 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_and_b32_e32 v10, 1, v20 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v2, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc +; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2 +; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc +; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2 +; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: 
v_lshr_b64 v[10:11], v[20:21], v35 +; SDAG-NEXT: v_or_b32_e32 v3, v3, v11 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc -; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 -; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8 -; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[2:3], v24 -; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 -; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 -; SDAG-NEXT: v_sub_i32_e32 v37, vcc, 64, v32 -; SDAG-NEXT: v_subrev_i32_e32 
v48, vcc, 64, v32 -; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32 -; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_lshl_b64 v[38:39], v[2:3], v37 -; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v48 -; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v11, v39 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v38 -; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v0, vcc -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v25, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v24, s[4:5] -; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v1, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v30 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5] +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30 +; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc +; SDAG-NEXT: s_mov_b64 s[4:5], 
0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v25, 31, v21 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v10 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v24 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v25 -; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v2 -; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v16, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v17, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v10 -; SDAG-NEXT: v_and_b32_e32 v25, v24, v31 -; SDAG-NEXT: v_and_b32_e32 v48, v24, v30 -; SDAG-NEXT: v_and_b32_e32 v49, v24, v0 -; SDAG-NEXT: v_and_b32_e32 v10, 1, v24 -; SDAG-NEXT: v_and_b32_e32 v50, v24, v1 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v25 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v16, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v17, v50, vcc -; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_or_b32_e32 v19, v17, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v16, v18 +; SDAG-NEXT: v_or_b32_e32 v16, v22, v38 +; SDAG-NEXT: v_or_b32_e32 v17, v20, v39 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v8 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17 +; 
SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; SDAG-NEXT: v_and_b32_e32 v20, v8, v29 +; SDAG-NEXT: v_and_b32_e32 v22, v8, v28 +; SDAG-NEXT: v_and_b32_e32 v38, v8, v0 +; SDAG-NEXT: v_and_b32_e32 v39, v8, v1 +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20 +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v32, v34 -; SDAG-NEXT: v_or_b32_e32 v17, v33, v35 +; SDAG-NEXT: v_or_b32_e32 v16, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v17, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 -; SDAG-NEXT: v_mov_b32_e32 v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v22, v10 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v3, v11, v3 +; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v2, v10, v2 +; SDAG-NEXT: v_mov_b32_e32 v17, v9 +; SDAG-NEXT: v_mov_b32_e32 v16, v8 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v0, 
v0, v8 -; SDAG-NEXT: v_or_b32_e32 v20, v19, v1 -; SDAG-NEXT: v_or_b32_e32 v22, v11, v3 -; SDAG-NEXT: v_or_b32_e32 v21, v18, v0 -; SDAG-NEXT: v_or_b32_e32 v23, v10, v2 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v11, v1 +; SDAG-NEXT: v_or_b32_e32 v19, v9, v3 +; SDAG-NEXT: v_or_b32_e32 v22, v10, v0 +; SDAG-NEXT: v_or_b32_e32 v23, v8, v2 ; SDAG-NEXT: .LBB0_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v18, v16 -; SDAG-NEXT: v_mov_b32_e32 v19, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, v16 +; SDAG-NEXT: v_mov_b32_e32 v21, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v6, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12 -; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 -; SDAG-NEXT: v_ffbh_u32_e32 v8, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v4, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12 +; SDAG-NEXT: v_or_b32_e32 v0, v2, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v6 ; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 -; SDAG-NEXT: 
v_add_i32_e64 v8, s[4:5], 32, v8 -; SDAG-NEXT: v_ffbh_u32_e32 v30, v5 -; SDAG-NEXT: v_min_u32_e32 v6, v10, v6 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v7 +; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v7 +; SDAG-NEXT: v_min_u32_e32 v4, v10, v4 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e64 v24, v13, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v25, v12, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_min_u32_e32 v1, v8, v30 -; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6 -; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v15, vcc +; SDAG-NEXT: v_min_u32_e32 v1, v9, v30 +; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4 +; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v10, v25 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v24 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v7, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v13, v6, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v8, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v6, v25, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v8, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v10, v29 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v28 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v4, v29, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v0 ; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 -; SDAG-NEXT: v_or_b32_e32 v7, v24, v1 -; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v8 +; SDAG-NEXT: v_or_b32_e32 v5, v28, v1 +; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9 ; SDAG-NEXT: v_ffbh_u32_e32 
v14, v1 ; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; SDAG-NEXT: v_min_u32_e32 v6, v8, v14 -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_min_u32_e32 v4, v9, v14 +; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 -; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4 +; SDAG-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v8, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v8, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v9, v10 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v7, v9 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_or_b32_e32 v9, v5, v11 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_and_b32_e32 v10, 1, v12 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_and_b32_e32 
v8, 1, v12 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 -; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 -; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4 +; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc ; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v7, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v9, vcc, 0x7f, v6 -; SDAG-NEXT: v_or_b32_e32 v8, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v9 -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, 64, v9 -; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v9 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] -; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6 -; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 -; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9 -; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, 
s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4 +; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35 +; SDAG-NEXT: v_or_b32_e32 v5, v5, v11 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 -; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v25 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35 -; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36 -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v11, v49 -; SDAG-NEXT: v_or_b32_e32 v10, 
v10, v48 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35 +; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v9, v49 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v7 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v8 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 -; SDAG-NEXT: v_or_b32_e32 v9, v13, v9 -; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 -; SDAG-NEXT: 
v_or_b32_e32 v8, v12, v8 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v2 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v4, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v5, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v15, 31, v10 -; SDAG-NEXT: v_and_b32_e32 v10, 1, v15 -; SDAG-NEXT: v_and_b32_e32 v38, v15, v1 -; SDAG-NEXT: v_and_b32_e32 v39, v15, v0 -; SDAG-NEXT: v_and_b32_e32 v48, v15, v24 -; SDAG-NEXT: v_and_b32_e32 v15, v15, v25 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v39 +; SDAG-NEXT: v_or_b32_e32 v5, v13, v5 +; SDAG-NEXT: v_or_b32_e32 v11, v15, v11 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2 +; SDAG-NEXT: v_or_b32_e32 v4, v12, v4 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; SDAG-NEXT: v_and_b32_e32 v15, v8, v29 +; SDAG-NEXT: v_and_b32_e32 v38, v8, v28 +; SDAG-NEXT: v_and_b32_e32 v39, v8, v0 +; SDAG-NEXT: v_and_b32_e32 v48, v8, v1 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 -; SDAG-NEXT: v_mov_b32_e32 v15, v11 -; SDAG-NEXT: v_mov_b32_e32 v14, v10 +; SDAG-NEXT: v_or_b32_e32 v10, v14, v10 +; 
SDAG-NEXT: v_mov_b32_e32 v15, v9 +; SDAG-NEXT: v_mov_b32_e32 v14, v8 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 -; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 -; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 +; SDAG-NEXT: v_or_b32_e32 v14, v9, v3 +; SDAG-NEXT: v_or_b32_e32 v9, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 ; SDAG-NEXT: .LBB0_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_xor_b32_e32 v3, v29, v28 -; SDAG-NEXT: v_xor_b32_e32 v2, v27, v26 -; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 +; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 +; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20 ; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16 -; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3 -; SDAG-NEXT: v_xor_b32_e32 v5, v21, v2 -; SDAG-NEXT: v_xor_b32_e32 v1, v22, v3 +; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3 +; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 -; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7 -; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6 +; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7 +; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6 ; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6 +; SDAG-NEXT: v_xor_b32_e32 v4, v8, v6 ; SDAG-NEXT: 
v_sub_i32_e32 v4, vcc, v4, v6 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: v_sdiv_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0 ; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1 ; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2 @@ -438,71 +438,71 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v9, v25, v9 ; GISEL-NEXT: v_xor_b32_e32 v10, v25, v10 ; GISEL-NEXT: v_xor_b32_e32 v11, v25, v11 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v24 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v24, vcc +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v24 +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v24, vcc ; GISEL-NEXT: v_sub_i32_e64 v26, s[4:5], v8, v25 ; GISEL-NEXT: v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v24, vcc -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v3, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v24, vcc ; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v25, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v25, vcc ; GISEL-NEXT: v_ffbh_u32_e32 v8, v27 ; GISEL-NEXT: v_ffbh_u32_e32 v9, v26 -; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v18 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v19 ; GISEL-NEXT: v_or_b32_e32 v0, v26, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v27, v11 
-; GISEL-NEXT: v_or_b32_e32 v2, v16, v18 -; GISEL-NEXT: v_or_b32_e32 v3, v17, v19 +; GISEL-NEXT: v_or_b32_e32 v2, v18, v20 +; GISEL-NEXT: v_or_b32_e32 v3, v19, v21 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v9 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 -; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v30, v19 -; GISEL-NEXT: v_ffbh_u32_e32 v31, v18 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, 32, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v30, v20 +; GISEL-NEXT: v_ffbh_u32_e32 v31, v21 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] ; GISEL-NEXT: v_min_u32_e32 v0, v8, v9 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v29 -; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 -; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v31 +; GISEL-NEXT: v_min_u32_e32 v1, v23, v22 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v28 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v30 +; GISEL-NEXT: v_min_u32_e32 v2, v29, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v31, v3 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 -; GISEL-NEXT: v_min_u32_e32 v1, v28, v1 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v30, v3 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: 
v_cmp_gt_u64_e32 vcc, v[2:3], v[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 ; GISEL-NEXT: v_or_b32_e32 v9, v3, v1 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v9, v22, v20 -; GISEL-NEXT: v_and_b32_e32 v20, 1, v9 +; GISEL-NEXT: v_or_b32_e32 v9, v22, v16 ; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: v_and_b32_e32 v9, 1, v9 +; GISEL-NEXT: v_and_b32_e32 v8, 1, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v2 @@ -511,110 +511,111 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; 
GISEL-NEXT: v_not_b32_e32 v2, 63 ; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v32, v2 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v32, v2 ; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], v32 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], v32 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 +; GISEL-NEXT: v_lshr_b64 v[8:9], v[18:19], v8 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v16 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32 -; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v8, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v21, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_add_i32_e32 v34, vcc, 0xffffffc0, v28 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v28 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 -; GISEL-NEXT: 
v_lshr_b64 v[0:1], v[18:19], v28 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[20:21], v28 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[18:19], v28 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], v22 +; GISEL-NEXT: v_or_b32_e32 v22, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v23, v3, v23 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[20:21], v32 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v22, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v23, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, v2, v18, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v19, v3, v19, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26 -; GISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v28 ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc -; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v22 -; GISEL-NEXT: v_lshr_b64 v[36:37], v[18:19], v34 -; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v2, v22 -; GISEL-NEXT: v_or_b32_e32 v1, v3, v23 ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, v36, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v1, v37, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v22, v0, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v23, v1, v17, s[6:7] -; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v23, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 ; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: 
v_lshrrev_b32_e32 v16, 31, v21 +; GISEL-NEXT: v_lshrrev_b32_e32 v36, 31, v17 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 +; GISEL-NEXT: v_or_b32_e32 v16, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v17, v1, v3 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v19 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshl_b64 v[36:37], v[22:23], 1 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 -; GISEL-NEXT: v_lshrrev_b32_e32 v23, 31, v9 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v32, v0 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v33, v1, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v34, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v35, v3, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v22, 31, v18 +; GISEL-NEXT: v_and_b32_e32 v18, v22, v26 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v18 +; GISEL-NEXT: v_and_b32_e32 v0, v22, v27 +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, v22, v10 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, v22, v11 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28 ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v18, v22 -; GISEL-NEXT: v_or_b32_e32 v3, v36, v23 ; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc -; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v32, v3 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v37, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v28, v30 ; GISEL-NEXT: v_or_b32_e32 v1, v29, v31 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v34, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 
v35, v19, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v16 +; GISEL-NEXT: v_and_b32_e32 v22, 1, v22 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v1, v0, v26 -; GISEL-NEXT: v_and_b32_e32 v18, v0, v27 -; GISEL-NEXT: v_and_b32_e32 v16, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v36, v0, v10 -; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 -; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1 -; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v16 -; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v36 +; GISEL-NEXT: v_mov_b32_e32 v0, v22 +; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB0_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21 +; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v17 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v10 -; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v22, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v23, v1, v3 ; GISEL-NEXT: .LBB0_6: ; %Flow16 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 @@ -630,18 +631,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 ; GISEL-NEXT: v_subb_u32_e32 v7, 
vcc, v1, v18, vcc -; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19 -; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19 +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc ; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc ; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v14, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v14, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v20 ; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 ; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 -; GISEL-NEXT: v_or_b32_e32 v0, v22, v4 -; GISEL-NEXT: v_or_b32_e32 v1, v23, v5 +; GISEL-NEXT: v_or_b32_e32 v0, v20, v4 +; GISEL-NEXT: v_or_b32_e32 v1, v21, v5 ; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 ; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 @@ -732,8 +733,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22 -; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc ; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 ; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc @@ -782,8 +783,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v12, v0, v22 -; GISEL-NEXT: v_and_b32_e32 v13, v0, v23 +; GISEL-NEXT: v_and_b32_e32 v12, v0, v20 +; GISEL-NEXT: v_and_b32_e32 v13, v0, v21 ; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 ; 
GISEL-NEXT: v_mov_b32_e32 v0, v6 @@ -808,8 +809,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 ; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 -; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v21, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v22, v3 +; GISEL-NEXT: v_xor_b32_e32 v1, v23, v3 ; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3 ; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 ; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 @@ -853,11 +854,11 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 ; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 ; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 ; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 ; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 ; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 @@ -868,146 +869,146 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23 -; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24] +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22 +; SDAG-NEXT: v_subbrev_u32_e32 v24, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 
0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v25 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_or_b32_e32 v17, v23, v25 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v24, v26 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] -; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc ; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23 -; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23 +; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v22 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v22 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v23, vcc ; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc -; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v18, v28 -; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23 -; SDAG-NEXT: v_or_b32_e32 v20, v27, v29 -; SDAG-NEXT: v_lshl_b64 v[23:24], 
v[2:3], v30 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v24, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v26, v28 +; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22 +; SDAG-NEXT: v_or_b32_e32 v19, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 -; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] -; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31 -; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v30 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v31 ; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v16, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v24, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18 -; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18 -; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18 -; SDAG-NEXT: v_lshr_b64 v[32:33], 
v[2:3], v18 +; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v22 +; SDAG-NEXT: v_or_b32_e32 v23, v21, v23 +; SDAG-NEXT: v_or_b32_e32 v22, v20, v22 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; SDAG-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v26 +; SDAG-NEXT: v_lshr_b64 v[20:21], v[2:3], v20 +; SDAG-NEXT: v_cndmask_b32_e32 v21, v21, v23, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v21, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v20, v0, s[4:5] +; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v26 +; SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8 -; SDAG-NEXT: s_mov_b64 s[12:13], 0 -; SDAG-NEXT: v_mov_b32_e32 v25, 0 -; SDAG-NEXT: v_mov_b32_e32 v26, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 -; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31 -; SDAG-NEXT: v_lshr_b64 v[36:37], v[2:3], v36 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v22, v22, v35 -; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7] +; SDAG-NEXT: s_mov_b64 s[4:5], 0 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24 -; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v35, 31, v17 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v24, v26, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v34 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v35 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v21 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0 -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21 -; SDAG-NEXT: v_and_b32_e32 v25, v21, v8 -; SDAG-NEXT: v_and_b32_e32 v26, v21, v9 -; SDAG-NEXT: v_and_b32_e32 v34, v21, v10 -; SDAG-NEXT: v_and_b32_e32 v35, v21, v11 -; SDAG-NEXT: v_and_b32_e32 v21, 1, v21 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v35, vcc -; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v19 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v20 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v24, v20, v8 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v20, v9 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v24, vcc +; 
SDAG-NEXT: v_and_b32_e32 v24, v20, v10 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v24, vcc +; SDAG-NEXT: v_and_b32_e32 v24, v20, v11 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v24, vcc +; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 -; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] -; SDAG-NEXT: v_or_b32_e32 v17, v20, v17 -; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13] -; SDAG-NEXT: v_or_b32_e32 v16, v19, v16 -; SDAG-NEXT: v_mov_b32_e32 v26, v22 +; SDAG-NEXT: v_or_b32_e32 v24, v26, v28 +; SDAG-NEXT: v_or_b32_e32 v25, v27, v29 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] +; SDAG-NEXT: v_and_b32_e32 v20, 1, v20 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v34 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 ; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13] +; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SDAG-NEXT: s_cbranch_execnz .LBB1_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v24 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v16, v20, v1 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v19, v0 -; SDAG-NEXT: v_or_b32_e32 v19, v21, v2 +; SDAG-NEXT: v_or_b32_e32 v16, 
v23, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v21, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v22, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v20, v2 ; SDAG-NEXT: .LBB1_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 ; SDAG-NEXT: v_or_b32_e32 v0, v12, v14 ; SDAG-NEXT: v_or_b32_e32 v3, v5, v7 @@ -1045,20 +1046,20 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0 -; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc +; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0 +; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v24, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v3, v1, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v1, v3 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v10 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: v_and_b32_e32 v2, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -1069,118 +1070,118 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> 
%lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0 +; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0 +; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc +; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 +; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc +; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v22, v24 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 +; SDAG-NEXT: v_or_b32_e32 v11, v23, v25 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27 +; SDAG-NEXT: v_or_b32_e32 v1, v1, v11 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v10 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; SDAG-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 -; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc -; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v8, v24 -; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0x7f, v0 -; SDAG-NEXT: v_or_b32_e32 v2, v11, v25 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v3 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v3 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v3 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[1:2] 
-; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0 -; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 -; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8 -; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 -; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v8 +; SDAG-NEXT: v_lshr_b64 v[2:3], v[4:5], v22 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v22 +; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v22 +; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v22 ; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27 ; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v28 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v21, v21, v32 -; SDAG-NEXT: v_or_b32_e32 v20, v20, v31 +; SDAG-NEXT: v_or_b32_e32 v3, v3, v32 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v8 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v7, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v6, v20, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, 
v22 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: .LBB1_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v5 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v5 ; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v20 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v30 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 -; SDAG-NEXT: v_or_b32_e32 v3, v10, v3 -; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 -; SDAG-NEXT: v_or_b32_e32 v2, v9, v2 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v26, v4 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v27, v5, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v28, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v29, v7, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v23, 31, v20 -; SDAG-NEXT: v_and_b32_e32 v20, 1, v23 -; SDAG-NEXT: v_and_b32_e32 v30, v23, v15 -; SDAG-NEXT: v_and_b32_e32 v31, v23, v14 -; SDAG-NEXT: v_and_b32_e32 v32, v23, v13 -; SDAG-NEXT: v_and_b32_e32 v23, v23, v12 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v23 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v32, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v31, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v30, vcc -; SDAG-NEXT: v_add_i32_e32 v8, 
vcc, -1, v8 -; SDAG-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v2 +; SDAG-NEXT: v_or_b32_e32 v2, v4, v30 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v31 +; SDAG-NEXT: v_or_b32_e32 v1, v21, v1 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v26, v2 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v27, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v28, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v29, v7, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v4 +; SDAG-NEXT: v_and_b32_e32 v31, v30, v13 +; SDAG-NEXT: v_and_b32_e32 v4, v30, v12 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v2, v4 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v31, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 +; SDAG-NEXT: v_or_b32_e32 v0, v20, v0 +; SDAG-NEXT: v_and_b32_e32 v2, 1, v30 +; SDAG-NEXT: v_and_b32_e32 v11, v30, v15 +; SDAG-NEXT: v_and_b32_e32 v30, v30, v14 +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v30, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22 +; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc ; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc ; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc -; SDAG-NEXT: v_or_b32_e32 v31, v11, v25 -; SDAG-NEXT: v_or_b32_e32 v30, v8, v24 +; SDAG-NEXT: v_or_b32_e32 v31, v23, v25 +; SDAG-NEXT: v_or_b32_e32 v30, v22, v24 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 -; SDAG-NEXT: v_mov_b32_e32 v23, v21 -; SDAG-NEXT: v_mov_b32_e32 v22, v20 +; SDAG-NEXT: v_or_b32_e32 v8, v10, v8 +; SDAG-NEXT: v_mov_b32_e32 v11, v3 +; SDAG-NEXT: v_mov_b32_e32 v10, v2 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB1_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: 
v_lshrrev_b32_e32 v4, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v4 -; SDAG-NEXT: v_or_b32_e32 v8, v10, v3 -; SDAG-NEXT: v_or_b32_e32 v10, v21, v1 -; SDAG-NEXT: v_or_b32_e32 v9, v9, v2 -; SDAG-NEXT: v_or_b32_e32 v11, v20, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v21, v1 +; SDAG-NEXT: v_or_b32_e32 v10, v3, v5 +; SDAG-NEXT: v_or_b32_e32 v9, v20, v0 +; SDAG-NEXT: v_or_b32_e32 v11, v2, v4 ; SDAG-NEXT: .LBB1_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_mov_b32_e32 v0, v19 @@ -1198,7 +1199,6 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v16, v2 ; GISEL-NEXT: v_mov_b32_e32 v17, v3 -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v18, v0, v16 @@ -1209,20 +1209,21 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ffbh_u32_e32 v23, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v16 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v16 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v17 ; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23 ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 -; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28 ; GISEL-NEXT: v_min_u32_e32 v2, v20, v2 ; GISEL-NEXT: v_min_u32_e32 v3, v22, v3 ; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 -; GISEL-NEXT: v_min_u32_e32 v19, 
v28, v19 +; GISEL-NEXT: v_min_u32_e32 v19, v29, v19 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2 @@ -1235,28 +1236,28 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 ; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v2, v20 ; GISEL-NEXT: v_or_b32_e32 v3, v23, v21 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v26, v18 -; GISEL-NEXT: v_and_b32_e32 v18, 1, v3 ; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_and_b32_e32 v3, 1, v3 +; GISEL-NEXT: v_and_b32_e32 v2, 1, v2 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v24, 1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 @@ -1560,12 +1561,12 @@ define <2 x 
i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] @@ -1574,106 +1575,106 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 ; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc ; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 -; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v24, v1 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v10, vcc +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v3, v22, v24 +; SDAG-NEXT: v_min_u32_e32 v3, v20, v22 ; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: 
v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v20, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v10, v31 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v30 +; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v22, v8, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v2 -; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 ; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 -; SDAG-NEXT: v_min_u32_e32 v10, v10, v20 +; SDAG-NEXT: v_min_u32_e32 v11, v11, v21 +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v11, v18 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v8, v20, v21 +; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11 +; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 
v8, v22 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v10, v21, vcc -; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: v_or_b32_e32 v9, v11, v19 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v9, v19 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_and_b32_e32 v10, 1, v20 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v21, v20, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc ; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 -; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; 
SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc ; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 ; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10 ; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], v24 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24 ; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 -; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v32 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v32 ; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, 
vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 @@ -1686,73 +1687,73 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26 ; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc -; SDAG-NEXT: v_or_b32_e32 v11, v11, v27 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v26 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v27 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v26 ; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v49, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v48, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v49, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v48, v8, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v25, v11, v17, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v24, v10, v16, vcc -; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v25, v9, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v24, v8, v16, vcc +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v25 +; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v26, v26, v10 -; SDAG-NEXT: v_or_b32_e32 v24, v24, v48 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v49 -; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v24 -; SDAG-NEXT: 
v_or_b32_e32 v8, v18, v8 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v25, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v26, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v27, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v10, 31, v10 -; SDAG-NEXT: v_and_b32_e32 v48, v10, v31 -; SDAG-NEXT: v_and_b32_e32 v49, v10, v30 -; SDAG-NEXT: v_and_b32_e32 v50, v10, v2 -; SDAG-NEXT: v_and_b32_e32 v51, v10, v3 -; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v48 -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v50, vcc -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v51, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_or_b32_e32 v22, v26, v48 +; SDAG-NEXT: v_or_b32_e32 v23, v24, v49 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v39, v27, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; SDAG-NEXT: v_and_b32_e32 v24, v8, v31 +; SDAG-NEXT: v_and_b32_e32 v26, v8, v30 +; SDAG-NEXT: v_and_b32_e32 v48, v8, v2 +; SDAG-NEXT: v_and_b32_e32 v49, v8, v3 +; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v22, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v49, vcc ; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc -; SDAG-NEXT: v_or_b32_e32 v48, v32, v34 -; SDAG-NEXT: v_or_b32_e32 v49, v33, v35 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49] -; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_or_b32_e32 v22, v32, v34 
+; SDAG-NEXT: v_or_b32_e32 v23, v33, v35 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 -; SDAG-NEXT: v_mov_b32_e32 v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v22, v10 +; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 +; SDAG-NEXT: v_mov_b32_e32 v23, v9 +; SDAG-NEXT: v_mov_b32_e32 v22, v8 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v22 -; SDAG-NEXT: v_or_b32_e32 v35, v19, v9 -; SDAG-NEXT: v_or_b32_e32 v27, v11, v21 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v8 -; SDAG-NEXT: v_or_b32_e32 v33, v10, v20 +; SDAG-NEXT: v_or_b32_e32 v35, v19, v11 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 +; SDAG-NEXT: v_or_b32_e32 v27, v9, v21 +; SDAG-NEXT: v_or_b32_e32 v33, v8, v20 ; SDAG-NEXT: .LBB2_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 @@ -2025,28 +2026,28 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_srem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 -; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11 -; GISEL-NEXT: v_mov_b32_e32 v18, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v19, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v11 +; GISEL-NEXT: v_mov_b32_e32 v19, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v20, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28 ; GISEL-NEXT: 
v_xor_b32_e32 v1, v1, v28 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v20 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v20 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v20 -; GISEL-NEXT: v_xor_b32_e32 v11, v11, v20 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v18 +; GISEL-NEXT: v_xor_b32_e32 v9, v9, v18 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v18 +; GISEL-NEXT: v_xor_b32_e32 v11, v11, v18 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28 ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc -; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v20 -; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v20, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v18 +; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v18, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc ; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc -; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v20, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v20, v29 +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v18, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v18, v29 ; GISEL-NEXT: v_ffbh_u32_e32 v21, v30 ; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 ; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 @@ -2055,53 +2056,53 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v2, v16, v8 ; GISEL-NEXT: v_or_b32_e32 v3, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 -; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 -; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 ; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v26, v9 -; GISEL-NEXT: v_ffbh_u32_e32 v27, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v9 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v20, v21 -; GISEL-NEXT: 
v_add_i32_e64 v1, s[6:7], 32, v25 -; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 -; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v27 +; GISEL-NEXT: v_min_u32_e32 v0, v18, v21 +; GISEL-NEXT: v_min_u32_e32 v1, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v24 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v26 +; GISEL-NEXT: v_min_u32_e32 v2, v25, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v27, v3 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 -; GISEL-NEXT: v_min_u32_e32 v1, v24, v1 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 -; GISEL-NEXT: v_min_u32_e32 v3, v26, v3 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v2 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[19:20] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 ; GISEL-NEXT: v_or_b32_e32 v19, v3, v1 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e64 
v18, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v19, v20, v21 -; GISEL-NEXT: v_and_b32_e32 v20, 1, v19 +; GISEL-NEXT: v_or_b32_e32 v19, v21, v20 ; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_and_b32_e32 v19, 1, v19 +; GISEL-NEXT: v_and_b32_e32 v18, 1, v18 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v20, 1, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 @@ -2154,11 +2155,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v31 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, v3, v17, vcc ; GISEL-NEXT: v_mov_b32_e32 v23, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -2166,40 +2167,40 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; 
GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 -; GISEL-NEXT: v_lshl_b64 v[48:49], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v27 ; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 -; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v19 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v48, 31, v19 ; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 -; GISEL-NEXT: v_or_b32_e32 v3, v48, v25 -; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v2, v24, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v26, v48 ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v35, v3 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v49, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v27, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v31, v33 ; GISEL-NEXT: v_or_b32_e32 v1, v32, v34 ; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v37, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v27, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v25, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 ; GISEL-NEXT: v_and_b32_e32 v1, v0, v30 -; GISEL-NEXT: v_and_b32_e32 v25, v0, v29 -; GISEL-NEXT: v_and_b32_e32 v26, v0, v10 -; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 -; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v24, v0, v29 +; GISEL-NEXT: 
v_and_b32_e32 v48, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v49, v0, v11 +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v2, v48, vcc +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v39 ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -2486,11 +2487,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 ; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 ; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 ; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 ; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 ; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 @@ -2501,65 +2502,65 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18 ; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: 
v_or_b32_e32 v17, v19, v21 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v23, v22, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB3_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16 -; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v17, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v18 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v19, vcc ; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc -; SDAG-NEXT: v_or_b32_e32 v20, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 -; SDAG-NEXT: v_or_b32_e32 v21, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[2:3], v26 -; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 -; SDAG-NEXT: v_lshl_b64 
v[24:25], v[0:1], v26 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27 -; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v19, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0x7f, v18 +; SDAG-NEXT: v_or_b32_e32 v20, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v21 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v21 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v21 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v18 +; SDAG-NEXT: v_or_b32_e32 v19, v25, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v24, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 +; SDAG-NEXT: v_lshr_b64 v[16:17], v[0:1], v30 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30 @@ -2572,73 +2573,73 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x 
i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v28 ; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v29 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v28 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v38, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v37, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v38, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v37, v16, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v17, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v16, v0, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 ; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v17 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v23 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_or_b32_e32 v28, v28, v18 -; SDAG-NEXT: v_or_b32_e32 v26, v26, v38 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v39 -; SDAG-NEXT: v_or_b32_e32 v17, v21, v17 -; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v34, v26 -; SDAG-NEXT: v_or_b32_e32 v16, v20, v16 -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v35, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 
v18, vcc, v36, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v37, v29, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v18 -; SDAG-NEXT: v_and_b32_e32 v39, v38, v8 -; SDAG-NEXT: v_and_b32_e32 v48, v38, v9 -; SDAG-NEXT: v_and_b32_e32 v49, v38, v10 -; SDAG-NEXT: v_and_b32_e32 v18, 1, v38 -; SDAG-NEXT: v_and_b32_e32 v38, v38, v11 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v39 -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v38, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 +; SDAG-NEXT: v_or_b32_e32 v24, v28, v38 +; SDAG-NEXT: v_or_b32_e32 v25, v26, v39 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v16 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v25 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v26, v16, v8 +; SDAG-NEXT: v_and_b32_e32 v28, v16, v9 +; SDAG-NEXT: v_and_b32_e32 v38, v16, v10 +; SDAG-NEXT: v_and_b32_e32 v39, v16, v11 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v25, v26 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v24, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v39, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 -; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] -; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: v_or_b32_e32 v24, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v25, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] +; 
SDAG-NEXT: v_or_b32_e32 v19, v21, v19 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 -; SDAG-NEXT: v_mov_b32_e32 v25, v19 -; SDAG-NEXT: v_mov_b32_e32 v24, v18 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 +; SDAG-NEXT: v_mov_b32_e32 v25, v17 +; SDAG-NEXT: v_mov_b32_e32 v24, v16 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 -; SDAG-NEXT: v_or_b32_e32 v33, v21, v17 -; SDAG-NEXT: v_or_b32_e32 v30, v19, v23 -; SDAG-NEXT: v_or_b32_e32 v31, v20, v16 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v22 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 +; SDAG-NEXT: v_or_b32_e32 v33, v21, v19 +; SDAG-NEXT: v_or_b32_e32 v30, v17, v23 +; SDAG-NEXT: v_or_b32_e32 v31, v20, v18 +; SDAG-NEXT: v_or_b32_e32 v32, v16, v22 ; SDAG-NEXT: .LBB3_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 @@ -2678,63 +2679,63 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16 -; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc +; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v20, v20, v18 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc +; 
SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v21, v17, v19 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_and_b32_e32 v20, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 ; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc ; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22 -; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v18, vcc -; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v34, v36 -; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16 -; SDAG-NEXT: v_or_b32_e32 v18, v35, v37 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v19 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v19 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18] -; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 -; SDAG-NEXT: 
v_or_b32_e32 v17, v25, v17 -; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 -; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v34, v36 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v35, v37 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v27 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 ; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34 @@ -2747,100 +2748,100 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> 
%rhs) { ; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28 ; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39 ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v21, v21, v29 -; SDAG-NEXT: v_or_b32_e32 v20, v20, v28 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 ; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v50, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v49, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v50, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v49, v18, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v21, v5, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v26, v20, v4, vcc -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: .LBB3_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v17 +; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v28, v28, v20 +; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v28, v18 ; SDAG-NEXT: v_or_b32_e32 v26, v26, v50 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v51 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 -; SDAG-NEXT: v_sub_i32_e32 
v20, vcc, v38, v26 -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v39, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v48, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v49, v29, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v20 -; SDAG-NEXT: v_and_b32_e32 v20, 1, v25 -; SDAG-NEXT: v_and_b32_e32 v50, v25, v15 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v51 +; SDAG-NEXT: v_or_b32_e32 v17, v23, v17 +; SDAG-NEXT: v_or_b32_e32 v21, v25, v21 +; SDAG-NEXT: v_sub_i32_e32 v25, vcc, v38, v26 +; SDAG-NEXT: v_or_b32_e32 v16, v22, v16 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v39, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v49, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v25 +; SDAG-NEXT: v_and_b32_e32 v28, v25, v12 +; SDAG-NEXT: v_and_b32_e32 v50, v25, v13 ; SDAG-NEXT: v_and_b32_e32 v51, v25, v14 -; SDAG-NEXT: v_and_b32_e32 v52, v25, v13 -; SDAG-NEXT: v_and_b32_e32 v25, v25, v12 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v25 -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v51, vcc -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v50, vcc +; SDAG-NEXT: v_and_b32_e32 v52, v25, v15 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v28 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v50, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v18, v51, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v52, vcc ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc -; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 ; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 +; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v25 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 -; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 +; 
SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: v_mov_b32_e32 v24, v18 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v21, v21, v17 -; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 -; SDAG-NEXT: v_or_b32_e32 v20, v20, v16 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v23, v17 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v21 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11 +; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 ; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 ; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 ; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mul_lo_u32 v34, v20, v15 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v35, v21, v14 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v18, v15 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v18, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v35, v19, v14 ; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 ; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0 -; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; SDAG-NEXT: v_mov_b32_e32 v18, v11 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19] +; SDAG-NEXT: v_mad_u64_u32 
v[14:15], s[4:5], v12, v18, 0 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; SDAG-NEXT: v_mov_b32_e32 v20, v11 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21] ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v25, v34 +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v34 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28 ; SDAG-NEXT: v_mov_b32_e32 v28, v27 -; SDAG-NEXT: v_mov_b32_e32 v27, v19 +; SDAG-NEXT: v_mov_b32_e32 v27, v21 ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27] -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v18, v35 -; SDAG-NEXT: v_mov_b32_e32 v18, v15 -; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v20, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v20, v35 +; SDAG-NEXT: v_mov_b32_e32 v20, v15 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v18, v[20:21] ; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17] ; SDAG-NEXT: v_mov_b32_e32 v8, v11 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v8 @@ -2849,24 +2850,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] ; SDAG-NEXT: v_mov_b32_e32 v22, v27 -; SDAG-NEXT: v_mov_b32_e32 v27, v19 -; SDAG-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v12, v21, v[26:27] +; SDAG-NEXT: v_mov_b32_e32 v27, v21 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v19, v[26:27] ; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v29, v16 ; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18] ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v11 -; SDAG-NEXT: v_mov_b32_e32 v11, v20 +; SDAG-NEXT: v_mov_b32_e32 v11, v21 ; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v22, v11 ; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v33, v16 ; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v17 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v21, v[11:12] +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v19, 
v[11:12] ; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 ; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v10 ; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v12, v17, vcc -; SDAG-NEXT: v_mov_b32_e32 v10, v19 +; SDAG-NEXT: v_mov_b32_e32 v10, v20 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc @@ -2876,7 +2877,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_urem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v18, v0, v2 @@ -2887,20 +2887,21 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v3 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v2 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v2 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v3 ; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23 ; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25 ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 -; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v28 ; GISEL-NEXT: v_min_u32_e32 v16, v22, v16 ; GISEL-NEXT: v_min_u32_e32 v17, v24, v17 ; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 -; GISEL-NEXT: v_min_u32_e32 v19, v28, v19 +; GISEL-NEXT: v_min_u32_e32 v19, v29, v19 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; 
GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16 @@ -2913,28 +2914,28 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v23, 0x7f, v18 ; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v18 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v20, v20, v16 +; GISEL-NEXT: v_or_b32_e32 v20, v23, v16 ; GISEL-NEXT: v_or_b32_e32 v21, v19, v17 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v21, v22, v23 -; GISEL-NEXT: v_and_b32_e32 v22, 1, v21 ; GISEL-NEXT: v_or_b32_e32 v20, v21, v20 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: v_and_b32_e32 v21, 1, v21 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 @@ -2987,11 
+2988,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v29, 0, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v17, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v25, 0 ; GISEL-NEXT: v_mov_b32_e32 v19, s7 ; GISEL-NEXT: v_mov_b32_e32 v18, s6 @@ -2999,40 +3000,40 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v16, s4 ; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v38, 31, v23 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v23 -; GISEL-NEXT: v_lshl_b64 v[38:39], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v29 ; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v27 -; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v21 -; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; GISEL-NEXT: v_or_b32_e32 v22, v16, v18 ; GISEL-NEXT: v_or_b32_e32 v23, v17, v19 -; GISEL-NEXT: v_or_b32_e32 v18, v28, v26 -; GISEL-NEXT: v_or_b32_e32 v19, v38, v27 -; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 +; GISEL-NEXT: v_or_b32_e32 v18, v26, v24 +; GISEL-NEXT: v_or_b32_e32 v19, v28, v39 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, 
-1, v32, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v34, v19 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v39, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v29, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v30, v32 ; GISEL-NEXT: v_or_b32_e32 v17, v31, v33 ; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v29, vcc +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v27, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v24 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 ; GISEL-NEXT: v_and_b32_e32 v17, v16, v8 -; GISEL-NEXT: v_and_b32_e32 v27, v16, v9 -; GISEL-NEXT: v_and_b32_e32 v28, v16, v10 -; GISEL-NEXT: v_and_b32_e32 v16, v16, v11 -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17 -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc -; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc -; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc +; GISEL-NEXT: v_and_b32_e32 v26, v16, v9 +; GISEL-NEXT: v_and_b32_e32 v39, v16, v10 +; GISEL-NEXT: v_and_b32_e32 v48, v16, v11 +; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v26, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v18, v39, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v20, v38 ; GISEL-NEXT: v_mov_b32_e32 v16, v24 ; GISEL-NEXT: v_mov_b32_e32 v17, v25 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index c3c1540383ec6..a442566676561 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -694,18 +694,14 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 
0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 -; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; SI-NEXT: v_mov_b32_e32 v9, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 -; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 -; SI-NEXT: v_mov_b32_e32 v10, v6 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 -; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -721,61 +717,64 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 1, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, 
v18 +; SI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 9 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 10 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 11 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 12 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 13 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 14 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 15 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[9:10], 
s[0:3], 0 addr64 +; SI-NEXT: buffer_store_short v0, v[8:9], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 3199b76d279fa..3c70883f09d2c 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3030,50 +3030,50 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v32f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v20 -; VI-NEXT: v_max_f16_sdwa v20, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v0, v19 +; VI-NEXT: v_max_f16_sdwa v19, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v20 -; VI-NEXT: v_max_f16_sdwa v20, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v1, v19 +; VI-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v20 -; VI-NEXT: v_max_f16_sdwa v20, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v2, v19 +; VI-NEXT: v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v20 -; VI-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: 
v_or_b32_e32 v3, v3, v19 +; VI-NEXT: v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v20 -; VI-NEXT: v_max_f16_sdwa v20, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v4, v4, v19 +; VI-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v20 -; VI-NEXT: v_max_f16_sdwa v20, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v5, v19 +; VI-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v6, v6, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v20 -; VI-NEXT: v_max_f16_sdwa v20, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v6, v6, v19 +; VI-NEXT: v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v7, v7, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v20 -; VI-NEXT: v_max_f16_sdwa v20, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v7, v7, v19 +; VI-NEXT: v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v8, v8, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v20 -; VI-NEXT: v_max_f16_sdwa v20, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v8, v8, v19 +; VI-NEXT: v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v9, v9, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v20 -; VI-NEXT: v_max_f16_sdwa v20, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v9, v9, v19 +; VI-NEXT: 
v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v10, v10, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v19 +; VI-NEXT: v_max_f16_sdwa v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v11, v11, v11 ; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v11, v11, v19 ; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v10, v10, v20 -; VI-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_f16_e32 v15, v15, v15 ; VI-NEXT: v_max_f16_e32 v14, v14, v14 ; VI-NEXT: v_max_f16_e32 v13, v13, v13 ; VI-NEXT: v_max_f16_e32 v12, v12, v12 -; VI-NEXT: v_max_f16_e32 v11, v11, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v20 ; VI-NEXT: v_or_b32_e32 v12, v12, v19 ; VI-NEXT: v_or_b32_e32 v13, v13, v18 ; VI-NEXT: v_or_b32_e32 v14, v14, v17 @@ -3342,11 +3342,11 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v64f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -3358,7 +3358,7 @@ 
define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -3370,341 +3370,344 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v28 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 
v4, v5, v4 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; CI-NEXT: v_or_b32_e32 v5, v7, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; CI-NEXT: v_or_b32_e32 v6, v7, v6 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v19 -; CI-NEXT: v_or_b32_e32 v7, v9, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v18 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; 
CI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x7c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_or_b32_e32 v7, v8, v7 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v18 ; CI-NEXT: v_or_b32_e32 v8, v10, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; CI-NEXT: v_or_b32_e32 v9, v11, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_or_b32_e32 v9, v10, v9 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v25 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 -; CI-NEXT: v_or_b32_e32 v10, v12, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_or_b32_e32 
v10, v14, v10 +; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; CI-NEXT: v_or_b32_e32 v17, v18, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v30 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_or_b32_e32 v11, v13, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_or_b32_e32 v12, v15, v12 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v31 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 -; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v33 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v32 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:12 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; CI-NEXT: v_or_b32_e32 v15, v25, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v21 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v16 -; CI-NEXT: v_or_b32_e32 v16, v24, v25 -; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; CI-NEXT: v_or_b32_e32 v25, v28, v24 -; CI-NEXT: s_waitcnt vmcnt(9) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; CI-NEXT: v_or_b32_e32 v20, v19, v20 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_or_b32_e32 v19, v20, v19 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 +; CI-NEXT: v_or_b32_e32 v20, v22, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v34 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v17, v17, v26 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 -; CI-NEXT: v_or_b32_e32 v18, v27, v18 -; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 -; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0 -; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: s_waitcnt vmcnt(12) -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; CI-NEXT: v_or_b32_e32 v20, v21, v20 -; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; CI-NEXT: s_waitcnt vmcnt(13) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: s_waitcnt 
vmcnt(12) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v24 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; CI-NEXT: v_or_b32_e32 v20, v23, v20 -; CI-NEXT: s_waitcnt vmcnt(9) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_or_b32_e32 v23, v27, v23 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 -; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v17, v17, v18 -; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0 -; CI-NEXT: v_or_b32_e32 v25, v25, v26 -; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0 ; CI-NEXT: 
s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_or_b32_e32 v19, v24, v19 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: 
v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v22 -; CI-NEXT: v_or_b32_e32 v22, v23, v27 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; CI-NEXT: v_or_b32_e32 v23, v28, v23 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v24, v24, v27 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; CI-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:96 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v27, v28, v27 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; CI-NEXT: v_or_b32_e32 v28, v29, v28 -; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 -; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 -; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0 -; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen -; CI-NEXT: 
v_add_i32_e32 v17, vcc, 60, v0 -; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 -; CI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; CI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 -; CI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; CI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; CI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x64, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; 
CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; 
CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; CI-NEXT: v_or_b32_e32 v31, v32, v31 +; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 +; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; 
CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 +; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 +; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; CI-NEXT: buffer_store_dword v21, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v20, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; CI-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; CI-NEXT: buffer_store_dword v17, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; CI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index a0fe9d88e31cf..3a7f3e41002d2 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -172,52 +172,52 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; 
GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; 
GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -331,34 +331,34 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 
v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB0_9: ; %Flow3 @@ -540,52 +540,52 @@ define i128 
@fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -699,34 +699,34 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 
v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 
0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB1_9: ; %Flow3 @@ -900,52 +900,52 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v5 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; 
GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -1054,34 +1054,34 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, 
v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 
+; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB2_9: ; %Flow3 @@ -1255,52 +1255,52 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 -; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v5 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v9 ; GISEL-NEXT: 
v_or_b32_e32 v1, v1, v10 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v11 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v12 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v13 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v15 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v17 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 -; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 @@ -1409,34 +1409,34 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; 
GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; 
GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB3_9: ; %Flow3 @@ -1786,34 +1786,34 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, 
v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB6_9: ; %Flow3 @@ -2135,34 +2135,34 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 ; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 ; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 19, v1 ; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 -; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 -; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 -; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 -; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 -; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 -; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 -; GISEL-NEXT: v_lshlrev_b32_e32 
v11, 28, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 -; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 -; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v3 +; GISEL-NEXT: v_or3_b32 v0, v0, v4, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v4, v5 +; GISEL-NEXT: v_or3_b32 v0, v0, v6, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v6, v7 +; GISEL-NEXT: v_or3_b32 v0, v0, v8, v9 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v8, v9 +; GISEL-NEXT: v_or3_b32 v0, v0, v10, v11 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v10, v11 +; GISEL-NEXT: v_or3_b32 v0, v0, v12, v13 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 30, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 -; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 -; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v12, v13 +; GISEL-NEXT: v_or3_b32 v0, v0, v14, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v14, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: .LBB7_9: ; %Flow3 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 4f3086a9eb1f9..34ee90c68569f 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1209,50 +1209,50 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] ; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, 
s[6:7] -; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] +; SDAG-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10 ; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] -; SDAG-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] -; SDAG-NEXT: v_mul_f64 v[12:13], v[0:1], v[6:7] +; SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] +; SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 ; SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9] ; SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5 -; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[10:11] -; SDAG-NEXT: v_mul_f64 v[10:11], v[10:11], 0.5 -; SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5 -; SDAG-NEXT: v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5 -; SDAG-NEXT: v_fma_f64 v[22:23], -v[10:11], v[16:17], 0.5 +; SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5 +; SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5 +; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11] +; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13] +; SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5 +; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5 +; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] ; SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] -; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] -; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] -; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] -; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] -; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] -; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] -; SDAG-NEXT: v_fma_f64 
v[22:23], -v[16:17], v[16:17], v[4:5] -; SDAG-NEXT: v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13] -; SDAG-NEXT: v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15] -; SDAG-NEXT: v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17] -; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1] -; SDAG-NEXT: v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3] -; SDAG-NEXT: v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5] -; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13] -; SDAG-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; SDAG-NEXT: v_mov_b32_e32 v13, 0x260 -; SDAG-NEXT: v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15] -; SDAG-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] -; SDAG-NEXT: v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] -; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 -; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 -; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 -; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 -; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 -; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1] +; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11] +; SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3] +; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5] +; SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15] +; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17] +; SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SDAG-NEXT: v_mov_b32_e32 v15, 0x260 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; 
SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] +; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12 +; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 +; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 +; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 ; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] @@ -1266,61 +1266,61 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: s_brev_b32 s5, 8 -; GISEL-NEXT: v_mov_b32_e32 v6, s4 ; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 +; GISEL-NEXT: v_mov_b32_e32 v6, s4 ; GISEL-NEXT: v_mov_b32_e32 v7, s5 ; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] ; GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 +; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] -; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3] -; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[4:5] -; GISEL-NEXT: v_mul_f64 v[12:13], v[6:7], 0.5 -; GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] -; GISEL-NEXT: v_mul_f64 v[14:15], v[8:9], 0.5 -; GISEL-NEXT: v_mul_f64 v[8:9], 
v[2:3], v[8:9] -; GISEL-NEXT: v_mul_f64 v[16:17], v[10:11], 0.5 -; GISEL-NEXT: v_mul_f64 v[10:11], v[4:5], v[10:11] -; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5 -; GISEL-NEXT: v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5 -; GISEL-NEXT: v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5 -; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5] +; GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5 +; GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9] +; GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5 +; GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11] +; GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5 +; GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5 +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] +; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5 +; GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13] +; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5 ; GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13] -; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9] -; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15] -; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11] -; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] -; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17] -; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] -; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5] -; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] -; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] -; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11] -; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1] -; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3] -; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], 
v[10:11], v[4:5] -; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7] -; GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; GISEL-NEXT: v_mov_b32_e32 v13, 0x260 -; GISEL-NEXT: v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc -; GISEL-NEXT: v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, v12, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[6:7] -; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v14 -; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 -; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v13 -; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v15 -; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v13 -; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v12 +; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] +; GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3] +; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1] +; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9] +; GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3] +; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5] +; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11] +; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13] +; GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7] +; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12 +; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13 +; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15 +; 
GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 +; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3b2f15c8340a6..78e521aba120e 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -671,17 +671,17 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 { ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -692,17 +692,17 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 { ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], 
s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -713,19 +713,19 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 { ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1388,137 +1388,137 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; CI-LABEL: void_func_v32i8: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 
v2, 0xff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_or_b32_e32 v2, v3, v2 -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; CI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; CI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; CI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; CI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; CI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; CI-NEXT: v_or_b32_e32 v5, v5, v6 -; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_or_b32_e32 v8, v8, v9 ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; CI-NEXT: v_or_b32_e32 v12, v12, v13 -; CI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; CI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; CI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; CI-NEXT: v_and_b32_e32 v9, 0xff, v14 ; CI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; CI-NEXT: v_or_b32_e32 v7, v4, v5 +; CI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; CI-NEXT: v_or_b32_e32 v12, v12, v13 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v28 -; CI-NEXT: v_and_b32_e32 v6, 0xff, v26 -; CI-NEXT: v_or_b32_e32 v8, v8, v9 -; CI-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; CI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 ; CI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; CI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; CI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; CI-NEXT: v_or_b32_e32 v1, v4, v1 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v30 -; CI-NEXT: v_lshlrev_b32_e32 v5, 24, v27 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v29 +; CI-NEXT: v_and_b32_e32 v14, 0xff, v28 +; CI-NEXT: 
v_and_b32_e32 v26, 0xff, v26 +; CI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; CI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v9 +; CI-NEXT: v_or_b32_e32 v9, v11, v10 +; CI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; CI-NEXT: v_or_b32_e32 v6, v7, v6 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_lshlrev_b32_e32 v15, 24, v27 +; CI-NEXT: v_and_b32_e32 v27, 0xff, v30 +; CI-NEXT: v_or_b32_e32 v13, v14, v13 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; CI-NEXT: v_or_b32_e32 v7, v3, v2 +; CI-NEXT: v_or_b32_e32 v3, v10, v1 +; CI-NEXT: v_or_b32_e32 v1, v4, v6 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; CI-NEXT: v_or_b32_e32 v11, v15, v14 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v9, v9, v13 -; CI-NEXT: v_or_b32_e32 v10, v11, v10 -; CI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; CI-NEXT: v_and_b32_e32 v12, 0xffff, v13 ; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v5, v5, v6 -; CI-NEXT: v_or_b32_e32 v6, v0, v2 -; CI-NEXT: v_or_b32_e32 v9, v11, v9 -; CI-NEXT: v_or_b32_e32 v8, v8, v10 -; CI-NEXT: v_lshlrev_b32_e32 v10, 8, v25 -; CI-NEXT: v_and_b32_e32 v11, 0xff, v24 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v7 +; CI-NEXT: v_or_b32_e32 v2, v8, v9 +; CI-NEXT: v_and_b32_e32 v8, 0xff, v20 +; CI-NEXT: v_and_b32_e32 v9, 0xff, v16 ; CI-NEXT: s_mov_b32 s5, 0 ; CI-NEXT: s_mov_b32 s4, 16 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; CI-NEXT: v_or_b32_e32 v0, v0, v4 -; CI-NEXT: v_or_b32_e32 v3, v1, v0 -; CI-NEXT: v_or_b32_e32 v0, v11, v10 -; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; CI-NEXT: v_or_b32_e32 v2, v0, v5 -; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v23 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; CI-NEXT: 
v_and_b32_e32 v4, 0xff, v20 -; CI-NEXT: v_or_b32_e32 v1, v4, v1 -; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; CI-NEXT: v_or_b32_e32 v1, v1, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_or_b32_e32 v0, v0, v4 -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v17 -; CI-NEXT: v_and_b32_e32 v5, 0xff, v16 -; CI-NEXT: v_or_b32_e32 v4, v5, v4 -; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; CI-NEXT: v_or_b32_e32 v0, v4, v0 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; CI-NEXT: v_or_b32_e32 v5, v24, v25 +; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; CI-NEXT: v_or_b32_e32 v4, v4, v26 +; CI-NEXT: v_or_b32_e32 v6, v5, v11 +; CI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; CI-NEXT: v_or_b32_e32 v7, v12, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_or_b32_e32 v4, v4, v5 +; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; CI-NEXT: v_or_b32_e32 v5, v8, v5 +; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; CI-NEXT: v_and_b32_e32 v8, 0xff, v18 +; CI-NEXT: v_or_b32_e32 v5, v5, v4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v19 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; CI-NEXT: v_or_b32_e32 v4, v4, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; CI-NEXT: v_or_b32_e32 v8, v9, v8 +; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; CI-NEXT: v_or_b32_e32 v4, v8, v4 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s4, s5 -; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: void_func_v32i8: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX89-NEXT: v_or_b32_sdwa v9, 
v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 ; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX89-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_lshlrev_b16_e32 v13, 8, v15 -; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 ; GFX89-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX89-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; GFX89-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GFX89-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX89-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; GFX89-NEXT: v_lshlrev_b16_e32 v14, 8, v25 +; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v27 +; GFX89-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX89-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX89-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX89-NEXT: v_lshlrev_b16_e32 v19, 8, v19 ; GFX89-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v5, 8, v7 -; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; GFX89-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX89-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v9, 8, v11 -; GFX89-NEXT: v_or_b32_sdwa v7, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v25 -; 
GFX89-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v10, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX89-NEXT: v_lshlrev_b16_e32 v2, 8, v23 -; GFX89-NEXT: v_or_b32_sdwa v11, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; GFX89-NEXT: v_lshlrev_b16_e32 v3, 8, v17 -; GFX89-NEXT: v_lshlrev_b16_e32 v15, 8, v19 -; GFX89-NEXT: v_or_b32_sdwa v19, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v17, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v16, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v11, v24, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v14, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v15, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v20, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v16, 
v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: s_mov_b32 s5, 0 ; GFX89-NEXT: s_mov_b32 s4, 16 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v6, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v5, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v4, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v6, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v5, v15, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_lshlrev_b16_e32 v8, 8, v14 +; GFX89-NEXT: v_lshlrev_b16_e32 v8, 8, v10 ; GFX89-NEXT: v_or_b32_sdwa v8, v30, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 @@ -2622,102 +2622,37 @@ define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, pt } define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { -; CI-LABEL: void_func_v32i32_i32_i64: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_i32_i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v20, 
off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_i32_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v32i32_i32_i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(3) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dword v34, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i32_i64: ; GFX11: ; %bb.0: @@ -2765,129 +2700,86 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:4 +; 
CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_mul_f32_e32 v12, 1.0, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v33 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; CI-NEXT: v_and_b32_e32 v0, 1, v17 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; CI-NEXT: v_and_b32_e32 v0, 1, v34 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 ; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 
; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: void_func_v32i32_i1_i8_i16_bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 1, v20 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v17, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v18, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v19, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_i1_i8_i16_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v19, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: void_func_v32i32_i1_i8_i16_bf16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX89-NEXT: buffer_load_ubyte v32, 
off, s[0:3], s32 offset:4 +; GFX89-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX89-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX89-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; GFX89-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:20 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt vmcnt(5) +; GFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v0, 1, v32 +; GFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_byte v33, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v34, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v35, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v36, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16: ; GFX11: ; %bb.0: @@ -2945,105 +2837,38 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg } define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { -; CI-LABEL: void_func_v32i32_v2i32_v2f32: -; CI: ; %bb.0: -; 
CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_v2i32_v2f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v2i32_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_v32i32_v2i32_v2f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(4) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx2 v[34:35], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32: ; GFX11: ; %bb.0: @@ -3093,54 +2918,54 @@ define void 
@void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 ; 
CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v10, v38 +; CI-NEXT: v_mul_f32_e32 v4, 1.0, v32 +; CI-NEXT: v_mul_f32_e32 v5, 1.0, v33 +; CI-NEXT: v_mul_f32_e32 v6, 1.0, v34 +; CI-NEXT: v_mul_f32_e32 v7, 1.0, v35 +; CI-NEXT: v_mul_f32_e32 v8, 1.0, v36 +; CI-NEXT: v_mul_f32_e32 v9, 1.0, v37 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v15, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v8, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_mul_f32_e32 v9, 1.0, v20 -; CI-NEXT: v_mul_f32_e32 v10, 1.0, v16 -; CI-NEXT: v_mul_f32_e32 v11, 1.0, v17 -; CI-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; CI-NEXT: v_mul_f32_e32 v17, 1.0, v19 -; CI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v17 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; CI-NEXT: buffer_store_short v14, off, s[4:7], 0 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; CI-NEXT: buffer_store_short v11, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v10, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v5, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3156,82 +2981,43 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, 
<2 x i ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX89-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; GFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; GFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; 
GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt vmcnt(5) +; GFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dword v34, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dword v35, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dword v36, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: ; GFX11: ; %bb.0: @@ -3277,284 +3063,132 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i16> %arg1, ptr addrspace(1) undef - store volatile <2 x half> %arg2, ptr addrspace(1) undef - store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef - store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef - ret void -} - -define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { -; CI-LABEL: void_func_v32i32_v2i64_v2f64: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_v2i64_v2f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v2i64_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:12 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_v32i32_v2i64_v2f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
buffer_store_b128 v[20:23], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - store volatile <32 x i32> %arg0, ptr addrspace(1) undef - store volatile <2 x i64> %arg1, ptr addrspace(1) undef - store volatile <2 x double> %arg2, ptr addrspace(1) undef - ret void -} - -define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { -; CI-LABEL: void_func_v32i32_v4i32_v4f32: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: void_func_v32i32_v4i32_v4f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 
offset:24 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: void_func_v32i32_v4i32_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] + store volatile <2 x half> %arg2, ptr addrspace(1) undef + store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef + store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef + ret void +} + +define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { +; CIGFX89-LABEL: void_func_v32i32_v2i64_v2f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(8) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 
v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i64_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], 
off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + store volatile <32 x i32> %arg0, ptr addrspace(1) undef + store volatile <2 x i64> %arg1, ptr addrspace(1) undef + store volatile <2 x double> %arg2, ptr addrspace(1) undef + ret void +} + +define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { +; CIGFX89-LABEL: void_func_v32i32_v4i32_v4f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(8) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v4i32_v4f32: ; GFX11: ; %bb.0: @@ -3608,7 +3242,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3617,37 +3258,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; CI-NEXT: s_waitcnt 
vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3657,7 +3291,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3666,37 +3307,30 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3706,7 +3340,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], 
s32 offset:64 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3715,41 +3356,31 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 
offset:36 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3817,9 +3448,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; 
CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3828,61 +3466,54 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v8, off, 
s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; CI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3890,9 +3521,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; VI-NEXT: s_waitcnt 
vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3901,61 +3539,54 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 
VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; VI-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,9 +3594,16 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -3974,69 +3612,57 @@ define void 
@void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 
GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dwordx4 v[32:35], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword 
v21, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[36:39], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4323,7 +3949,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: 
buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -4332,61 +3965,54 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v5, 
off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -4396,7 +4022,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -4405,61 +4038,54 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ubyte 
v13, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4469,7 +4095,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ubyte v35, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ubyte v38, 
off, s[0:3], s32 offset:40 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 @@ -4478,65 +4111,56 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:44 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 -; 
GFX9-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v32, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v36, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v35, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v34, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v38, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v37, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte 
v9, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 401cbce00ac9a..ac9f56d1ee7b1 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1497,8 +1497,8 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 @@ -1519,13 +1519,13 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; 
GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 @@ -1780,8 +1780,8 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 @@ -1802,13 +1802,13 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: 
buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 @@ -2063,8 +2063,8 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 @@ -2085,13 +2085,13 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:204 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:196 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192 ; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:188 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:180 @@ -2616,21 +2616,21 @@ define <32 x bfloat> @v32bf16_func_void() #0 { ; CI-NEXT: v_mov_b32_e32 v9, v1 ; CI-NEXT: v_mov_b32_e32 v10, v2 ; CI-NEXT: v_mov_b32_e32 v11, v3 -; CI-NEXT: v_mov_b32_e32 
v12, v4 -; CI-NEXT: v_mov_b32_e32 v13, v5 -; CI-NEXT: v_mov_b32_e32 v14, v6 ; CI-NEXT: v_mov_b32_e32 v16, v0 ; CI-NEXT: v_mov_b32_e32 v17, v1 ; CI-NEXT: v_mov_b32_e32 v18, v2 ; CI-NEXT: v_mov_b32_e32 v19, v3 -; CI-NEXT: v_mov_b32_e32 v20, v4 -; CI-NEXT: v_mov_b32_e32 v21, v5 ; CI-NEXT: v_mov_b32_e32 v24, v0 ; CI-NEXT: v_mov_b32_e32 v25, v1 ; CI-NEXT: v_mov_b32_e32 v26, v2 ; CI-NEXT: v_mov_b32_e32 v27, v3 +; CI-NEXT: v_mov_b32_e32 v12, v4 +; CI-NEXT: v_mov_b32_e32 v20, v4 ; CI-NEXT: v_mov_b32_e32 v28, v4 +; CI-NEXT: v_mov_b32_e32 v13, v5 +; CI-NEXT: v_mov_b32_e32 v21, v5 ; CI-NEXT: v_mov_b32_e32 v29, v5 +; CI-NEXT: v_mov_b32_e32 v14, v6 ; CI-NEXT: v_mov_b32_e32 v22, v6 ; CI-NEXT: v_mov_b32_e32 v30, v6 ; CI-NEXT: v_mov_b32_e32 v15, v7 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 545a9af3f9a0b..5ccbc85f46dd4 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -5227,19 +5227,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v4, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 7d07641f455e3..c3ab9c23d1950 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2379,140 +2379,128 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-LABEL: return_72xi32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Spill -; GFX10-NEXT: s_clause 0x14 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92 ; GFX10-NEXT: 
buffer_store_dword v31, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:84 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:112 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:88 ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:108 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:136 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:144 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:80 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], 
s32 offset:56 ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:76 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; GFX10-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 +; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:284 -; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:280 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:276 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:272 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:268 -; GFX10-NEXT: 
buffer_store_dword v49, v0, s[0:3], 0 offen offset:264 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:256 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252 -; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:220 -; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:188 -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:180 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:168 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: 
buffer_store_dword v27, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:284 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:280 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:276 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:272 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:268 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:264 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:260 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:256 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:252 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:236 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:216 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:204 +; GFX10-NEXT: 
buffer_store_dword v35, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:184 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:168 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:156 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:124 ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:176 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: return_72xi32: diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index fbb54893d9b2a..a2fca33af1046 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1253,57 +1253,57 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: s_addc_u32 s5, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; CI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; CI-NEXT: v_mov_b32_e32 v7, s3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 -; CI-NEXT: v_mov_b32_e32 v6, s2 -; CI-NEXT: s_add_u32 s2, s0, 48 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; CI-NEXT: 
v_cvt_f32_f16_e32 v12, v2 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; CI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 ; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v1 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_mov_b32_e32 v21, s3 -; CI-NEXT: v_mov_b32_e32 v23, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_mov_b32_e32 v20, s2 -; CI-NEXT: v_mov_b32_e32 v22, s0 -; CI-NEXT: flat_store_dwordx4 v[6:7], v[16:19] +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; CI-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: @@ -1312,26 +1312,24 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v23, s3 -; VI-NEXT: v_mov_b32_e32 v22, s2 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: v_mov_b32_e32 v21, s1 +; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v20, s0 +; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v25, s3 -; VI-NEXT: v_mov_b32_e32 v27, s1 -; VI-NEXT: v_mov_b32_e32 v24, s2 -; VI-NEXT: v_mov_b32_e32 v26, s0 +; VI-NEXT: v_mov_b32_e32 v21, s3 +; VI-NEXT: v_mov_b32_e32 v20, s2 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 ; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 @@ -1341,19 +1339,21 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v18, v7 -; VI-NEXT: v_cvt_f32_f16_e32 v16, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v19, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; VI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; VI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[22:23], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; VI-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: @@ -1665,43 +1665,43 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v7, s3 +; CI-NEXT: v_mov_b32_e32 v6, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 -; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v15, s3 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v21, s3 -; CI-NEXT: v_mov_b32_e32 v23, s1 -; CI-NEXT: v_mov_b32_e32 v20, s2 -; CI-NEXT: v_mov_b32_e32 v22, s0 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; 
CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v10 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; CI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v8f16_to_v8f64: @@ -1713,39 +1713,39 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: v_mov_b32_e32 v8, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: 
v_mov_b32_e32 v17, s1 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v16, s0 +; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v21, s3 -; VI-NEXT: v_mov_b32_e32 v23, s1 -; VI-NEXT: v_mov_b32_e32 v20, s2 -; VI-NEXT: v_mov_b32_e32 v22, s0 +; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 -; VI-NEXT: 
flat_store_dwordx4 v[18:19], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 +; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: v_mov_b32_e32 v16, s0 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: global_extload_v8f16_to_v8f64: @@ -1794,92 +1794,91 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v16, s3 -; CI-NEXT: v_mov_b32_e32 v15, s2 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v18, s3 -; CI-NEXT: v_mov_b32_e32 v17, s2 +; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v18, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v12, s1 -; CI-NEXT: v_mov_b32_e32 v11, s0 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v19 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; CI-NEXT: v_mov_b32_e32 v16, s3 -; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: 
v_mov_b32_e32 v15, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x50 -; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; CI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] -; CI-NEXT: v_cvt_f32_f16_e32 v12, v18 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v17 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v20, s3 +; CI-NEXT: v_mov_b32_e32 v19, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: 
v_mov_b32_e32 v19, s2 +; CI-NEXT: v_mov_b32_e32 v18, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -1897,76 +1896,77 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v13, s3 -; VI-NEXT: v_mov_b32_e32 v12, s2 +; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: v_mov_b32_e32 v18, s3 +; VI-NEXT: v_mov_b32_e32 v17, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 +; VI-NEXT: v_mov_b32_e32 v12, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v17, s3 -; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: v_mov_b32_e32 v11, s0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: s_add_u32 s2, s0, 64 ; VI-NEXT: 
s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v11, s1 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x70 -; VI-NEXT: v_mov_b32_e32 v10, s0 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 ; VI-NEXT: s_add_u32 s0, s0, 0x60 +; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v22, v4 -; VI-NEXT: v_cvt_f32_f16_sdwa v23, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v24, v5 -; VI-NEXT: 
v_cvt_f32_f16_sdwa v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v21, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v26, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 -; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30 -; VI-NEXT: v_mov_b32_e32 v21, s3 -; VI-NEXT: v_mov_b32_e32 v23, s1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v27 -; VI-NEXT: v_mov_b32_e32 v20, s2 -; VI-NEXT: v_mov_b32_e32 v22, s0 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; VI-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; 
VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 +; VI-NEXT: v_mov_b32_e32 v20, s3 +; VI-NEXT: v_mov_b32_e32 v14, s1 +; VI-NEXT: v_mov_b32_e32 v19, s2 +; VI-NEXT: v_mov_b32_e32 v13, s0 +; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: @@ -2368,52 +2368,51 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v13, s3 -; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CI-NEXT: v_mov_b32_e32 v12, s2 +; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, 
v7 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_or_b32_e32 v0, v0, v18 ; CI-NEXT: v_or_b32_e32 v3, v6, v2 -; CI-NEXT: v_or_b32_e32 v2, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_or_b32_e32 v1, v10, v4 -; CI-NEXT: v_or_b32_e32 v0, v8, v5 +; CI-NEXT: v_or_b32_e32 v2, v17, v7 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_or_b32_e32 v3, v14, v6 -; CI-NEXT: v_or_b32_e32 v2, v12, v7 +; CI-NEXT: v_or_b32_e32 v1, v10, v6 +; CI-NEXT: v_or_b32_e32 v0, v8, v7 +; CI-NEXT: v_or_b32_e32 v3, v14, v9 +; CI-NEXT: v_or_b32_e32 v2, v12, v11 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -2429,31 +2428,29 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: s_add_u32 s4, s2, 48 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v17, s3 -; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_waitcnt 
vmcnt(3) ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v18, v4 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -2464,17 +2461,19 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v18 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 ; VI-NEXT: v_or_b32_e32 v3, v6, v7 -; VI-NEXT: v_or_b32_e32 v2, v4, v5 +; VI-NEXT: v_or_b32_e32 v2, v18, v17 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_or_b32_e32 v1, v10, v11 ; VI-NEXT: v_or_b32_e32 v0, v8, v9 ; VI-NEXT: v_or_b32_e32 v3, v14, v15 ; VI-NEXT: v_or_b32_e32 v2, v12, v13 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 
add62a5c39cb1..b9d3763e7def1 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2678,7 +2678,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18 +; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 +; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18 ; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 @@ -2686,8 +2687,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 +; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index e71c6cf71c882..74020c43a3ca3 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1111,16 +1111,13 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; GENERIC-LABEL: 
extract_neg_offset_sgpr_loaded: ; GENERIC: ; %bb.0: ; %entry ; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39 ; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29 -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 -; GENERIC-NEXT: s_or_b32 s4, s23, s51 -; GENERIC-NEXT: s_or_b32 s5, s22, s50 -; GENERIC-NEXT: s_or_b32 s6, s21, s49 -; GENERIC-NEXT: s_or_b32 s7, s20, s48 +; GENERIC-NEXT: s_or_b32 s6, s23, s51 +; GENERIC-NEXT: s_or_b32 s7, s22, s50 +; GENERIC-NEXT: s_or_b32 s21, s21, s49 +; GENERIC-NEXT: s_or_b32 s20, s20, s48 ; GENERIC-NEXT: s_or_b32 s19, s19, s47 ; GENERIC-NEXT: s_or_b32 s18, s18, s46 ; GENERIC-NEXT: s_or_b32 s17, s17, s45 @@ -1133,38 +1130,42 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; GENERIC-NEXT: s_or_b32 s10, s10, s38 ; GENERIC-NEXT: s_or_b32 s8, s8, s36 ; GENERIC-NEXT: s_or_b32 s9, s9, s37 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 -; GENERIC-NEXT: s_cselect_b32 s8, s9, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s9, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 2 -; GENERIC-NEXT: s_cselect_b32 s8, s10, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s10, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 3 -; GENERIC-NEXT: s_cselect_b32 s8, s11, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s11, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 4 -; GENERIC-NEXT: s_cselect_b32 s8, s12, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s12, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 5 -; GENERIC-NEXT: s_cselect_b32 s8, s13, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s13, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 6 -; GENERIC-NEXT: s_cselect_b32 s8, s14, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s14, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 7 -; GENERIC-NEXT: s_cselect_b32 
s8, s15, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s15, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 8 -; GENERIC-NEXT: s_cselect_b32 s8, s16, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s16, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 9 -; GENERIC-NEXT: s_cselect_b32 s8, s17, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s17, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 10 -; GENERIC-NEXT: s_cselect_b32 s8, s18, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s18, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 11 -; GENERIC-NEXT: s_cselect_b32 s8, s19, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s19, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 12 -; GENERIC-NEXT: s_cselect_b32 s7, s7, s8 +; GENERIC-NEXT: s_cselect_b32 s4, s20, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 13 -; GENERIC-NEXT: s_cselect_b32 s6, s6, s7 +; GENERIC-NEXT: s_cselect_b32 s4, s21, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 14 -; GENERIC-NEXT: s_cselect_b32 s5, s5, s6 +; GENERIC-NEXT: s_cselect_b32 s4, s7, s4 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 15 -; GENERIC-NEXT: s_cselect_b32 s4, s4, s5 +; GENERIC-NEXT: s_cselect_b32 s4, s6, s4 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, s4 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GENERIC-NEXT: s_endpgm ; @@ -1278,9 +1279,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: s_or_b32 s8, s8, s36 -; SI-MOVREL-NEXT: s_or_b32 s5, s23, s51 -; SI-MOVREL-NEXT: s_or_b32 s6, s22, s50 -; SI-MOVREL-NEXT: s_or_b32 s7, s21, s49 +; SI-MOVREL-NEXT: s_or_b32 s6, s23, s51 +; SI-MOVREL-NEXT: s_or_b32 s7, s22, s50 +; SI-MOVREL-NEXT: s_or_b32 s21, s21, s49 ; SI-MOVREL-NEXT: s_or_b32 s20, s20, s48 ; SI-MOVREL-NEXT: s_or_b32 s19, s19, s47 ; SI-MOVREL-NEXT: s_or_b32 s18, s18, s46 @@ -1307,9 +1308,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 ; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 ; 
SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s6 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s6 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm @@ -5699,94 +5700,94 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 ; GENERIC-NEXT: s_mov_b32 s27, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc +; GENERIC-NEXT: buffer_load_dword v14, v[1:2], s[24:27], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: ;;#ASMSTART ; GENERIC-NEXT: v_mov_b32 v1, 62 ; GENERIC-NEXT: ;;#ASMEND -; GENERIC-NEXT: v_mov_b32_e32 v3, s20 -; GENERIC-NEXT: v_mov_b32_e32 v4, s21 -; GENERIC-NEXT: v_mov_b32_e32 v5, s22 -; GENERIC-NEXT: v_mov_b32_e32 v6, s23 -; GENERIC-NEXT: v_mov_b32_e32 v7, s16 -; GENERIC-NEXT: v_mov_b32_e32 v8, s17 -; GENERIC-NEXT: v_mov_b32_e32 v9, s18 -; GENERIC-NEXT: v_mov_b32_e32 v10, s19 -; GENERIC-NEXT: v_mov_b32_e32 v11, s12 -; GENERIC-NEXT: v_mov_b32_e32 v12, s13 -; GENERIC-NEXT: v_mov_b32_e32 v13, s14 -; GENERIC-NEXT: v_mov_b32_e32 v14, s15 -; GENERIC-NEXT: v_mov_b32_e32 v15, s8 -; GENERIC-NEXT: v_mov_b32_e32 v16, s9 -; GENERIC-NEXT: v_mov_b32_e32 v17, s10 -; GENERIC-NEXT: v_mov_b32_e32 v18, s11 -; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 -; 
GENERIC-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; GENERIC-NEXT: v_mov_b32_e32 v10, s22 +; GENERIC-NEXT: v_mov_b32_e32 v11, s23 +; GENERIC-NEXT: v_mov_b32_e32 v15, s16 +; GENERIC-NEXT: v_mov_b32_e32 v2, s18 +; GENERIC-NEXT: v_mov_b32_e32 v3, s19 +; GENERIC-NEXT: v_mov_b32_e32 v4, s12 +; GENERIC-NEXT: v_mov_b32_e32 v5, s13 +; GENERIC-NEXT: v_mov_b32_e32 v6, s14 +; GENERIC-NEXT: v_mov_b32_e32 v7, s15 +; GENERIC-NEXT: v_mov_b32_e32 v8, s8 +; GENERIC-NEXT: v_mov_b32_e32 v9, s9 +; GENERIC-NEXT: v_mov_b32_e32 v12, s10 +; GENERIC-NEXT: v_mov_b32_e32 v13, s11 +; GENERIC-NEXT: v_add_i32_e32 v18, vcc, 1, v14 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 
v16, v2, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 ; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 
v11, 63, v25, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc +; GENERIC-NEXT: v_mov_b32_e32 v16, s17 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; GENERIC-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; 
GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc +; GENERIC-NEXT: v_mov_b32_e32 v15, s21 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; GENERIC-NEXT: v_mov_b32_e32 v19, s20 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; GENERIC-NEXT: s_waitcnt vmcnt(0) @@ -6257,97 +6258,98 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc +; SI-MOVREL-NEXT: buffer_load_dword v14, v[1:2], s[8:11], 0 addr64 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: ;;#ASMSTART ; SI-MOVREL-NEXT: v_mov_b32 v1, 62 ; SI-MOVREL-NEXT: ;;#ASMEND ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s21 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s22 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s23 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s16 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s17 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s19 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s8 -; 
SI-MOVREL-NEXT: v_mov_b32_e32 v16, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s11 -; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2 -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s18 +; 
SI-MOVREL-NEXT: v_mov_b32_e32 v3, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s22 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s23 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s16 +; SI-MOVREL-NEXT: v_add_i32_e32 v18, vcc, 1, v14 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 
63, v11, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 
vcc, 14, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s17 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s21 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; SI-MOVREL-NEXT: v_mov_b32_e32 v19, s20 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 @@ -6368,104 +6370,104 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s1 ; 
VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: flat_load_dword v2, v[1:2] glc +; VI-NEXT: flat_load_dword v14, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: v_mov_b32 v1, 62 ; VI-NEXT: ;;#ASMEND +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v9, s9 +; VI-NEXT: v_mov_b32_e32 v12, s10 +; VI-NEXT: v_mov_b32_e32 v13, s11 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 1, v14 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; VI-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; VI-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; VI-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; VI-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 +; VI-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; VI-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 +; VI-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 +; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc 
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; VI-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; VI-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; VI-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; VI-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; VI-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; VI-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 +; VI-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; VI-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; VI-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; VI-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 +; VI-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; VI-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s20 -; VI-NEXT: v_mov_b32_e32 v4, s21 -; VI-NEXT: v_mov_b32_e32 v5, s22 -; VI-NEXT: v_mov_b32_e32 v6, s23 -; VI-NEXT: v_mov_b32_e32 v7, s16 -; VI-NEXT: v_mov_b32_e32 v8, s17 -; VI-NEXT: v_mov_b32_e32 v9, s18 -; VI-NEXT: v_mov_b32_e32 v10, s19 -; VI-NEXT: v_mov_b32_e32 v11, s12 -; VI-NEXT: v_mov_b32_e32 v12, s13 -; VI-NEXT: v_mov_b32_e32 v13, s14 
-; VI-NEXT: v_mov_b32_e32 v14, s15 -; VI-NEXT: v_mov_b32_e32 v15, s8 -; VI-NEXT: v_mov_b32_e32 v16, s9 -; VI-NEXT: v_mov_b32_e32 v17, s10 -; VI-NEXT: v_mov_b32_e32 v18, s11 ; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; VI-NEXT: v_mov_b32_e32 v19, s20 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 -; VI-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 -; VI-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 -; VI-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 -; VI-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 -; VI-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 -; VI-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 -; VI-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 -; VI-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; VI-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 -; VI-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; VI-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; VI-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; VI-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 -; VI-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 -; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v19 -; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 -; VI-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 -; VI-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 -; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 -; VI-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 -; VI-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 -; VI-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 -; VI-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 -; VI-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 -; VI-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 -; VI-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 -; VI-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; VI-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; VI-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6496,105 +6498,105 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GFX9-IDXMODE: ; %bb.0: ; %entry ; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: 
global_load_dword v3, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: global_load_dword v14, v1, s[0:1] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: ;;#ASMSTART ; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62 ; GFX9-IDXMODE-NEXT: ;;#ASMEND -; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v27, v10, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc -; GFX9-IDXMODE-NEXT: 
v_cmp_eq_u32_e32 vcc, 4, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, v16, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 -; GFX9-IDXMODE-NEXT: v_add_u32_e32 v20, 1, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v18, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v19, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v3, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v20 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s22 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s23 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s16 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 +; 
GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v18, 1, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v20 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v12, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v20 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 +; GFX9-IDXMODE-NEXT: 
v_cndmask_b32_e32 v12, 63, v16, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v11, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v27, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v24, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 +; 
GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v14 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, v15, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v18 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v15, vcc +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s20 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:48 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] offset:32 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[7:10], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2 @@ -6629,132 +6631,134 @@ bb2: define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { ; GENERIC-LABEL: insert_w_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: 
s_load_dword s4, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_mov_b32 s2, -1 -; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41880000 -; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41400000 +; GENERIC-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s24, s[4:5], 0xb +; GENERIC-NEXT: s_mov_b32 s31, 0xf000 +; GENERIC-NEXT: s_mov_b32 s30, -1 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000 ; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000 ; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000 ; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000 ; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_add_i32 s5, s4, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s5, 12 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 13 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 14 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 15 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 8 -; GENERIC-NEXT: s_cselect_b64 vcc, 
-1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 9 -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 10 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 5 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, 1.0, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 1 -; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, 2.0, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc -; GENERIC-NEXT: s_cmp_eq_u32 s5, 3 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v16, 4.0, v8, vcc -; GENERIC-NEXT: s_add_i32 s4, s4, 2 -; GENERIC-NEXT: s_cmp_lg_u32 s4, 3 -; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 +; GENERIC-NEXT: s_add_i32 s25, s24, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 12 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 13 
+; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 14 +; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 15 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 8 +; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 9 +; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 10 +; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 11 +; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 4 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 5 +; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 6 +; GENERIC-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 7 +; GENERIC-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s25, 0 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, v0, s[22:23] +; GENERIC-NEXT: s_cmp_eq_u32 s25, 1 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v14, 2.0, v0, s[22:23] +; GENERIC-NEXT: s_cmp_eq_u32 s25, 2 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[22:23] +; GENERIC-NEXT: s_cmp_eq_u32 s25, 3 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v16, 4.0, v0, s[22:23] +; GENERIC-NEXT: s_add_i32 s26, s24, 2 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 3 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 2 +; GENERIC-NEXT: s_cselect_b64 s[24:25], -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e32 v16, v8, v16, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 2 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 1 -; GENERIC-NEXT: 
s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 0 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 7 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 6 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 5 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 4 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 11 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 10 -; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 9 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 8 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 15 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 13 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s4, 12 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; 
GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GENERIC-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[22:23] +; GENERIC-NEXT: v_cndmask_b32_e64 v15, v0, v15, s[24:25] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 1 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v14, v0, v14, s[22:23] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 0 +; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, v0, v13, s[22:23] +; GENERIC-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[14:15] +; GENERIC-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[16:17] +; GENERIC-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[18:19] +; GENERIC-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[20:21] +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:16 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 7 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 6 +; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e64 v12, v0, v12, s[14:15] +; GENERIC-NEXT: v_cndmask_b32_e64 v11, v0, v11, s[16:17] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 5 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[14:15] +; GENERIC-NEXT: s_cmp_lg_u32 s26, 4 +; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v9, v0, v9, s[14:15] +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GENERIC-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] +; GENERIC-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[2:3] +; GENERIC-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] +; GENERIC-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[6:7] +; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:48 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] +; GENERIC-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[10:11] +; GENERIC-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[12:13] +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:32 +; GENERIC-NEXT: s_cmp_lg_u32 s26, 11 +; 
GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:80 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(1) +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 10 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:96 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s26, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:112 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 offset:64 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insert_w_offset_multiple_in_block: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 48a168b4bfbe7..d5b6c19399a1f 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1314,108 +1314,108 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[0:1], s[14:15] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 -; SI-NEXT: s_cmp_eq_u32 s5, 6 -; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 +; SI-NEXT: s_cmp_eq_u32 s7, 6 +; SI-NEXT: v_mov_b32_e32 v6, s6 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_cmp_eq_u32 s7, 7 +; SI-NEXT: s_mov_b64 s[14:15], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 4 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 5 +; SI-NEXT: s_cmp_eq_u32 s7, 4 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 2 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 2 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_cndmask_b32_e64 v8, v8, v6, 
s[2:3] +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s7, 0 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[0:1] +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 0 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v13, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 1 +; SI-NEXT: v_cndmask_b32_e64 v12, v13, v6, s[2:3] +; SI-NEXT: s_cmp_eq_u32 s7, 1 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 14 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v14, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 14 +; SI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5] +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: v_cndmask_b32_e32 v12, v14, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 15 +; SI-NEXT: s_cmp_eq_u32 s7, 15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 12 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v15, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 12 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: 
v_cndmask_b32_e32 v12, v15, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 13 +; SI-NEXT: s_cmp_eq_u32 s7, 13 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 10 -; SI-NEXT: v_or_b32_e32 v3, v3, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 10 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_cndmask_b32_e32 v12, v16, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 11 +; SI-NEXT: s_cmp_eq_u32 s7, 11 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v11 -; SI-NEXT: v_cndmask_b32_e32 v11, v17, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s7, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 9 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: s_cmp_eq_u32 s7, 9 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], 
s[0:3], 0 addr64 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[12:15], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -1429,81 +1429,81 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s5, 14 +; VI-NEXT: s_cmp_eq_u32 s7, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_mov_b32_e32 v12, s6 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 15 +; VI-NEXT: s_cmp_eq_u32 s7, 15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 12 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 13 +; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 13 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 10 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 11 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 11 ; 
VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 8 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 9 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 9 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 7 +; VI-NEXT: s_cmp_eq_u32 s7, 7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 4 -; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 4 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 2 -; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 2 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 3 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -1542,16 +1542,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 0 -; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 14 -; GFX900-NEXT: v_perm_b32 v2, v10, v2, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 15 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1559,30 +1557,32 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 12 -; 
GFX900-NEXT: v_perm_b32 v1, v10, v1, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc +; GFX900-NEXT: v_perm_b32 v1, v12, v1, s2 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 13 ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 10 -; GFX900-NEXT: v_perm_b32 v8, v10, v8, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc +; GFX900-NEXT: v_perm_b32 v8, v12, v8, s2 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 11 -; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 8 -; GFX900-NEXT: v_perm_b32 v7, v10, v7, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX900-NEXT: s_cmp_eq_u32 s5, 9 -; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_perm_b32 v2, v11, v2, s2 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX900-NEXT: v_perm_b32 v7, v12, v7, s2 ; GFX900-NEXT: v_perm_b32 v6, v10, v6, s2 ; GFX900-NEXT: v_perm_b32 v5, v9, v5, s2 ; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index d09af8fd2ac95..12b4b2b372ef8 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2794,16 +2794,14 @@ define 
amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 0 -; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 14 -; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2811,30 +2809,32 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 12 -; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc +; GFX9-NEXT: v_perm_b32 v1, v12, v1, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 10 -; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc +; GFX9-NEXT: v_perm_b32 v8, v12, v8, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 11 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 8 -; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 -; GFX9-NEXT: 
v_cndmask_b32_e32 v10, v16, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_perm_b32 v2, v11, v2, s2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2 ; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 ; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2 ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 @@ -2844,7 +2844,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2858,81 +2858,81 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s5, 14 +; VI-NEXT: s_cmp_eq_u32 s7, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_mov_b32_e32 v12, s6 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 15 +; VI-NEXT: s_cmp_eq_u32 s7, 15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 12 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 13 +; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 13 ; VI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 10 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 11 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 11 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 8 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 9 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 9 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 
s5, 7 +; VI-NEXT: s_cmp_eq_u32 s7, 7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 4 -; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 4 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 2 -; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 2 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 3 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_cmp_eq_u32 s7, 3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 0 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -2965,101 +2965,101 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, 
s[0:1] ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 10 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; CI-NEXT: v_or_b32_e32 v9, v9, v12 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: v_or_b32_e32 v8, v8, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 ; CI-NEXT: s_cmp_eq_u32 s5, 9 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 8 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 7 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc ; CI-NEXT: s_cselect_b64 
vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_or_b32_e32 v10, v10, v11 -; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_or_b32_e32 v10, v10, v11 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; CI-NEXT: v_or_b32_e32 v7, v7, v12 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v9, v9, v12 +; CI-NEXT: v_or_b32_e32 v3, v3, v12 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_or_b32_e32 v2, v2, v12 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 -; 
CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 1 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; CI-NEXT: v_or_b32_e32 v8, v8, v13 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_or_b32_e32 v3, v3, v13 +; CI-NEXT: v_or_b32_e32 v1, v1, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 26a4ea9d8a4b6..edf900a50cd4b 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5413,33 +5413,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] ; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v8, v13, vcc +; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 
0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 
1, v13 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] @@ -5518,33 +5518,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] ; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v8, v13, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] @@ -5615,33 +5615,33 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v8, v13, vcc +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; 
GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v4, v[14:15] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v6, v[16:17] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v13, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], 
s[4:5], v4, v12, v[2:3] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index dd478f94e1039..98552de05c857 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -45,6 +45,10 @@ body: | ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_15:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_F64_I32_e32_16:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) @@ -64,14 +68,10 @@ body: | ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec - ; GCN-NEXT: 
[[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_14]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_15:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_15]], implicit $exec - ; GCN-NEXT: [[V_CVT_F64_I32_e32_16:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_16]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_13]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_14]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_15]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_16]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_17:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY17]], implicit $mode, implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_17]], implicit $exec ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 1d0367db70143..4532571d5cf2a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -2059,207 +2059,207 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX7-LABEL: v_maximum_v16f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v0, v16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v6, v16 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v23 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v7, v16 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v24 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v18 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v16 -; GFX7-NEXT: v_max_f32_e32 v8, v8, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v25 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v17 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v16 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v26 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v17 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 ; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v28 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v16 -; GFX7-NEXT: v_max_f32_e32 v10, v10, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v17 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v17 ; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v21 -; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v28 -; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v29 -; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 ; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v18 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v18 +; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v29 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v17 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v20, v20 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v22 +; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18 +; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v13 ; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v19 +; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v18, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v17 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v23 +; GFX7-NEXT: v_max_f32_e32 v16, v18, v16 +; GFX7-NEXT: v_max_f32_e32 v18, v13, v0 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v15 +; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v30 +; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v17 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v24 +; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 ; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v13 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_mov_b32_e32 v19, 0x7fc00000 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v13, v19, v16, s[26:27] +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v17 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v25 +; GFX7-NEXT: v_max_f32_e32 v16, v14, v15 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v15 +; GFX7-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v19, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[8:9] +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v17 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v17 +; 
GFX7-NEXT: v_cvt_f16_f32_e32 v17, v26 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v19, v7, s[14:15] +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v19, v8, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v19, v12, s[24:25] +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v17 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v27 +; GFX7-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[20:21] +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v17 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v17 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v12, v20 -; GFX7-NEXT: v_max_f32_e32 v12, v12, v20 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc -; GFX7-NEXT: v_max_f32_e32 v20, v13, v19 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc -; GFX7-NEXT: v_max_f32_e32 v19, v14, v18 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v17, v19, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[22:23] ; GFX7-NEXT: s_waitcnt 
vmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GFX7-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v19, v18, s[28:29] +; GFX7-NEXT: v_max_f32_e32 v15, v20, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v20, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX8-NEXT: v_max_f16_e32 v18, v17, v16 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v17, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v18, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: v_max_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_max_f16_e32 v16, v18, v17 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX8-NEXT: v_max_f16_e32 v21, v20, v18 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v20, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX8-NEXT: v_max_f16_e32 v22, v21, v20 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v21, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v22, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX8-NEXT: v_max_f16_e32 v23, v22, v21 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v22, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v23, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX8-NEXT: v_max_f16_e32 v24, v23, v22 -; GFX8-NEXT: 
v_cmp_o_f16_e32 vcc, v23, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v19, v24, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v25, v24, v23 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v24, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v19, v25, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v26, v25, v24 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v25, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v19, v26, vcc -; GFX8-NEXT: v_max_f16_e32 v25, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc -; GFX8-NEXT: v_max_f16_e32 v15, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v15, vcc -; GFX8-NEXT: v_max_f16_e32 v14, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v19, v14, vcc -; GFX8-NEXT: v_max_f16_e32 v13, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v19, v13, vcc -; GFX8-NEXT: v_max_f16_e32 v12, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v12, vcc -; GFX8-NEXT: v_max_f16_e32 v11, v2, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_max_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX8-NEXT: v_max_f16_e32 v21, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX8-NEXT: v_max_f16_e32 v22, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX8-NEXT: v_max_f16_e32 v23, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX8-NEXT: v_max_f16_e32 v24, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX8-NEXT: v_max_f16_e32 v25, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 +; GFX8-NEXT: v_max_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 +; GFX8-NEXT: v_max_f16_e32 v6, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_max_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 +; GFX8-NEXT: v_max_f16_e32 v4, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 +; GFX8-NEXT: v_max_f16_e32 v11, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 +; GFX8-NEXT: v_max_f16_e32 v13, v7, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v11, vcc -; GFX8-NEXT: v_max_f16_e32 v10, v1, v9 +; GFX8-NEXT: v_max_f16_e32 v14, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v10, vcc -; GFX8-NEXT: v_max_f16_e32 v9, v0, v8 +; GFX8-NEXT: v_max_f16_e32 v7, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v9, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; 
GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index df7355c2c57bf..584dd2700c419 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -1730,20 +1730,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_writelane_b32 v31, s30, 0 +; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: 
v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v19, v0, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX7-NEXT: v_max_f32_e32 v16, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1752,39 +1752,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, 
v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX7-NEXT: v_readlane_b32 s31, v31, 1 ; GFX7-NEXT: v_readlane_b32 s30, v31, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -1797,20 +1797,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_o_f32_e64 
s[16:17], v0, v16 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX8-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX8-NEXT: v_max_f32_e32 v19, v0, v16 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX8-NEXT: v_max_f32_e32 v16, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1819,39 +1819,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 -; 
GFX8-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX8-NEXT: v_readlane_b32 s31, v31, 1 ; GFX8-NEXT: v_readlane_b32 s30, v31, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; 
GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -1864,20 +1864,20 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v31, s30, 0 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX900-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX900-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX900-NEXT: v_max_f32_e32 v19, v0, v16 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX900-NEXT: v_max_f32_e32 v16, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1886,39 +1886,39 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX900-NEXT: v_max_f32_e32 
v7, v7, v23 -; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX900-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX900-NEXT: 
v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX900-NEXT: v_readlane_b32 s31, v31, 1 ; GFX900-NEXT: v_readlane_b32 s30, v31, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_max_f32_e32 v16, v15, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index f8c2c54af2783..0b9cb9682ea5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -1598,87 +1598,87 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-LABEL: v_minimum_v16f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX8-NEXT: v_min_f16_e32 v18, v17, v16 -; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v17, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v18, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX8-NEXT: 
v_min_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_min_f16_e32 v16, v18, v17 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v18, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GFX8-NEXT: v_min_f16_e32 v21, v20, v18 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v20, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX8-NEXT: v_min_f16_e32 v22, v21, v20 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v21, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v22, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX8-NEXT: v_min_f16_e32 v23, v22, v21 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v22, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v23, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX8-NEXT: v_min_f16_e32 v24, v23, v22 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v23, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v19, v24, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v25, v24, v23 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v24, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v19, v25, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v26, v25, v24 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v25, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v19, v26, vcc -; GFX8-NEXT: v_min_f16_e32 v25, v7, v15 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc -; GFX8-NEXT: v_min_f16_e32 v15, v6, v14 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v19, v15, vcc -; GFX8-NEXT: v_min_f16_e32 v14, v5, v13 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v19, v14, vcc -; GFX8-NEXT: v_min_f16_e32 v13, v4, v12 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v4, 
v12 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v19, v13, vcc -; GFX8-NEXT: v_min_f16_e32 v12, v3, v11 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v12, vcc -; GFX8-NEXT: v_min_f16_e32 v11, v2, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX8-NEXT: v_min_f16_e32 v20, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[4:5], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX8-NEXT: v_min_f16_e32 v21, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[6:7], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX8-NEXT: v_min_f16_e32 v22, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[8:9], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX8-NEXT: v_min_f16_e32 v23, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[10:11], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX8-NEXT: v_min_f16_e32 v24, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[12:13], v18, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX8-NEXT: v_min_f16_e32 v25, v18, v17 +; GFX8-NEXT: v_cmp_o_f16_e64 s[14:15], v18, v17 +; GFX8-NEXT: v_min_f16_e32 v17, v6, v14 +; GFX8-NEXT: v_cmp_o_f16_e64 s[16:17], v6, v14 +; GFX8-NEXT: v_min_f16_e32 v6, v5, v13 +; GFX8-NEXT: v_cmp_o_f16_e64 s[18:19], v5, v13 +; GFX8-NEXT: v_min_f16_e32 v5, v4, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[20:21], v4, v12 +; GFX8-NEXT: v_min_f16_e32 v4, v3, v11 +; GFX8-NEXT: v_cmp_o_f16_e64 s[22:23], v3, v11 +; GFX8-NEXT: v_min_f16_e32 v11, v7, v15 +; GFX8-NEXT: v_cmp_o_f16_e64 s[24:25], v7, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7e00 +; GFX8-NEXT: v_min_f16_e32 v13, v7, v12 +; GFX8-NEXT: v_cmp_o_f16_e64 s[26:27], v7, v12 +; GFX8-NEXT: v_min_f16_e32 v3, v2, v10 
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v19, v13, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v11, vcc -; GFX8-NEXT: v_min_f16_e32 v10, v1, v9 +; GFX8-NEXT: v_min_f16_e32 v14, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v10, vcc -; GFX8-NEXT: v_min_f16_e32 v9, v0, v8 +; GFX8-NEXT: v_min_f16_e32 v7, v0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v18, v19, v22, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v22, v19, v25, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v14, vcc ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v9, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v16, v19, v21, 
s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v19, v24, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v19, v7, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v20, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v20, v19, v23, s[10:11] +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v19, v4, s[22:23] +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v5, s[20:21] +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v19, v6, s[18:19] +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v19, v17, s[16:17] +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; GFX8-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 956de6de3aad3..9962433134073 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -1730,20 +1730,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_writelane_b32 v31, s30, 0 +; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX7-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 ; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v19, v0, v16 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX7-NEXT: v_min_f32_e32 v16, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1752,39 +1752,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 
s[20:21], v10, v26 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX7-NEXT: 
v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX7-NEXT: v_readlane_b32 s31, v31, 1 ; GFX7-NEXT: v_readlane_b32 s30, v31, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] @@ -1797,20 +1797,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX8-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 ; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX8-NEXT: v_min_f32_e32 v19, v0, v16 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX8-NEXT: v_min_f32_e32 v16, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; 
GFX8-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1819,39 +1819,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 +; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX8-NEXT: v_readlane_b32 s31, v31, 1 ; GFX8-NEXT: v_readlane_b32 s30, v31, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -1864,20 +1864,20 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX900-NEXT: v_writelane_b32 v31, s30, 0 ; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 ; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 
v31, s30, 0 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 ; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX900-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX900-NEXT: v_writelane_b32 v31, s31, 1 ; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 ; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_mov_b32_e32 v18, 0x7fc00000 +; GFX900-NEXT: v_min_f32_e32 v19, v0, v16 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v0, v16 +; GFX900-NEXT: v_min_f32_e32 v16, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 ; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 ; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 @@ -1886,39 +1886,39 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 ; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v8, v24 ; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v9, v25 ; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v10, v26 ; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v11, v27 ; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v12, v28 ; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX900-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] 
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v13, v29 +; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v18, v19, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v18, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v18, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v18, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v18, v12, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v18, v13, s[26:27] ; GFX900-NEXT: v_readlane_b32 s31, v31, 1 ; GFX900-NEXT: v_readlane_b32 s30, v31, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_min_f32_e32 v16, v15, v17 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v15, 
v18, v16, vcc ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index c735854a45590..b378d69fb842f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -574,84 +574,85 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: s_brev_b32 s6, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7] +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v5, s11 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 -; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec +; CI-NEXT: v_mov_b32_e32 v2, s7 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_bfi_b32 v5, s6, v2, v5 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v10, s9 ; CI-NEXT: v_add_f64 v[0:1], s[14:15], 
-v[8:9] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] ; CI-NEXT: v_mov_b32_e32 v12, s15 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s6, v5, v12 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11] ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v14, s13 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v16, s19 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 
s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s6, v5, v16 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_mov_b32_e32 v18, s17 -; CI-NEXT: v_add_f64 v[10:11], s[22:23], -v[16:17] -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v10, s17 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 +; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17] ; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 ; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v20, s23 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_mov_b32_e32 v5, s2 +; CI-NEXT: v_mov_b32_e32 v18, s23 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v18 +; CI-NEXT: v_mov_b32_e32 v18, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v19, s21 ; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_mov_b32_e32 v18, s21 -; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_bfi_b32 v5, s6, v18, v19 ; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; 
CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 ; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index c1ab63b8160c6..223870950e4b7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -1772,42 +1772,42 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v12, v[0:1] +; GFX8-NEXT: flat_load_ushort v18, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v12, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v3, v12, 3, 1 -; GFX8-NEXT: v_bfe_i32 v2, v12, 2, 1 -; GFX8-NEXT: v_bfe_i32 v1, v12, 1, 1 -; GFX8-NEXT: v_bfe_i32 v0, v12, 0, 1 -; GFX8-NEXT: v_bfe_i32 v7, v12, 7, 1 -; GFX8-NEXT: v_bfe_i32 v6, v12, 6, 1 -; GFX8-NEXT: v_bfe_i32 v5, v12, 5, 1 -; GFX8-NEXT: v_bfe_i32 v4, v12, 4, 1 -; GFX8-NEXT: v_bfe_i32 v11, v12, 11, 1 -; GFX8-NEXT: v_bfe_i32 v10, v12, 10, 1 -; GFX8-NEXT: v_bfe_i32 v9, v12, 9, 1 -; GFX8-NEXT: v_bfe_i32 v8, v12, 8, 1 -; GFX8-NEXT: v_bfe_i32 v15, v12, 15, 1 -; GFX8-NEXT: 
v_bfe_i32 v14, v12, 14, 1 -; GFX8-NEXT: v_bfe_i32 v13, v12, 13, 1 -; GFX8-NEXT: v_bfe_i32 v12, v12, 12, 1 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX8-NEXT: v_bfe_i32 v7, v18, 15, 1 +; GFX8-NEXT: v_bfe_i32 v6, v18, 14, 1 +; GFX8-NEXT: v_bfe_i32 v5, v18, 13, 1 +; GFX8-NEXT: v_bfe_i32 v4, v18, 12, 1 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NEXT: v_bfe_i32 v11, v18, 11, 1 +; GFX8-NEXT: v_bfe_i32 v10, v18, 10, 1 +; GFX8-NEXT: v_bfe_i32 v9, v18, 9, 1 +; GFX8-NEXT: v_bfe_i32 v8, v18, 8, 1 +; GFX8-NEXT: v_bfe_i32 v3, v18, 3, 1 +; GFX8-NEXT: v_bfe_i32 v2, v18, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v18, 1, 1 +; GFX8-NEXT: v_bfe_i32 v0, v18, 0, 1 +; GFX8-NEXT: v_bfe_i32 v7, v18, 7, 1 +; GFX8-NEXT: v_bfe_i32 v6, v18, 6, 1 +; GFX8-NEXT: v_bfe_i32 v5, v18, 5, 1 +; GFX8-NEXT: v_bfe_i32 v4, v18, 4, 1 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i1_to_v16i32: @@ -2707,33 +2707,33 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000b ; GFX6-NEXT: s_bfe_u32 s9, s2, 0x10009 ; GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s11, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10019 -; GFX6-NEXT: s_lshr_b32 s18, s2, 31 -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x1001d -; GFX6-NEXT: s_bfe_u32 s20, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s21, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s24, s3, 0x1000b -; 
GFX6-NEXT: s_bfe_u32 s25, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s27, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s28, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s29, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s33, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10019 -; GFX6-NEXT: s_lshr_b32 s35, s3, 31 -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x1001d -; GFX6-NEXT: s_and_b32 s37, s2, 1 -; GFX6-NEXT: s_bfe_u32 s38, s2, 0x10002 +; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000d +; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10013 +; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10011 +; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10017 +; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10015 +; GFX6-NEXT: s_bfe_u32 s18, s2, 0x1001b +; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10019 +; GFX6-NEXT: s_lshr_b32 s20, s2, 31 +; GFX6-NEXT: s_bfe_u32 s21, s2, 0x1001d +; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10003 +; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10001 +; GFX6-NEXT: s_bfe_u32 s24, s3, 0x10007 +; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10005 +; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000b +; GFX6-NEXT: s_bfe_u32 s27, s3, 0x10009 +; GFX6-NEXT: s_bfe_u32 s28, s3, 0x1000f +; GFX6-NEXT: s_bfe_u32 s29, s3, 0x1000d +; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10013 +; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10011 +; GFX6-NEXT: s_bfe_u32 s33, s3, 0x10017 +; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10015 +; GFX6-NEXT: s_bfe_u32 s35, s3, 0x1001b +; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10019 +; GFX6-NEXT: s_lshr_b32 s37, s3, 31 +; GFX6-NEXT: s_bfe_u32 s38, s3, 0x1001d +; GFX6-NEXT: s_and_b32 s12, s2, 1 +; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10002 ; GFX6-NEXT: s_bfe_u32 s39, s2, 0x10006 ; GFX6-NEXT: s_bfe_u32 s40, s2, 0x10004 ; GFX6-NEXT: s_bfe_u32 s41, s2, 0x1000a @@ -2752,91 +2752,90 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002 ; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10006 ; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10004 -; GFX6-NEXT: s_bfe_u32 s57, s3, 0x1000a 
-; GFX6-NEXT: s_bfe_u32 s58, s3, 0x10008 -; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000e +; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008 +; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000e +; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c ; GFX6-NEXT: s_bfe_u32 s60, s3, 0x10012 ; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010 ; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10016 -; GFX6-NEXT: s_bfe_u32 s63, s3, 0x1001a -; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10018 -; GFX6-NEXT: s_bfe_u32 s65, s3, 0x1001e -; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001c -; GFX6-NEXT: s_bfe_u32 s67, s3, 0x10014 -; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000c +; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014 +; GFX6-NEXT: s_bfe_u32 s64, s3, 0x1001a +; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018 +; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001e +; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001c +; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000a ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v1, s36 -; GFX6-NEXT: v_mov_b32_e32 v2, s65 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s64 -; GFX6-NEXT: v_mov_b32_e32 v5, s34 -; GFX6-NEXT: v_mov_b32_e32 v6, s63 -; GFX6-NEXT: v_mov_b32_e32 v7, s33 -; GFX6-NEXT: v_mov_b32_e32 v8, s67 -; GFX6-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s67 +; GFX6-NEXT: v_mov_b32_e32 v1, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s66 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v4, s65 +; GFX6-NEXT: v_mov_b32_e32 v5, s36 +; GFX6-NEXT: v_mov_b32_e32 v6, s64 +; GFX6-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NEXT: v_mov_b32_e32 v8, s63 +; GFX6-NEXT: v_mov_b32_e32 v9, s34 ; GFX6-NEXT: v_mov_b32_e32 v10, s62 -; GFX6-NEXT: v_mov_b32_e32 v11, s30 +; GFX6-NEXT: v_mov_b32_e32 v11, s33 ; GFX6-NEXT: v_mov_b32_e32 v12, s61 -; GFX6-NEXT: v_mov_b32_e32 v13, s29 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 ; GFX6-NEXT: v_mov_b32_e32 v14, s60 +; GFX6-NEXT: v_mov_b32_e32 v15, s30 +; GFX6-NEXT: v_mov_b32_e32 v16, s59 +; GFX6-NEXT: v_mov_b32_e32 v17, s29 +; GFX6-NEXT: v_mov_b32_e32 
v18, s58 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s57 +; GFX6-NEXT: v_mov_b32_e32 v19, s28 +; GFX6-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s68 +; GFX6-NEXT: v_mov_b32_e32 v3, s26 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v15, s28 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NEXT: v_mov_b32_e32 v2, s59 -; GFX6-NEXT: v_mov_b32_e32 v3, s26 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s57 -; GFX6-NEXT: v_mov_b32_e32 v3, s24 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v1, s23 +; GFX6-NEXT: v_mov_b32_e32 v1, s25 ; GFX6-NEXT: v_mov_b32_e32 v2, s55 -; GFX6-NEXT: v_mov_b32_e32 v3, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s53 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v1, s23 ; GFX6-NEXT: v_mov_b32_e32 v2, s54 -; GFX6-NEXT: v_mov_b32_e32 v3, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s22 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v1, s19 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 ; GFX6-NEXT: v_mov_b32_e32 v2, s51 -; GFX6-NEXT: v_mov_b32_e32 v3, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NEXT: v_mov_b32_e32 v2, s49 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 ; GFX6-NEXT: v_mov_b32_e32 v2, s47 -; GFX6-NEXT: v_mov_b32_e32 v3, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s45 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: v_mov_b32_e32 v3, s14 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NEXT: v_mov_b32_e32 v2, s43 ; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -2853,9 +2852,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3446,59 +3445,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_i32 s46, s3, 0x1000a ; GFX6-NEXT: s_bfe_i32 s47, s3, 0x10009 ; GFX6-NEXT: 
s_bfe_i32 s48, s3, 0x10008 -; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000f -; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000e -; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000d -; GFX6-NEXT: s_bfe_i32 s52, s3, 0x1000c +; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000e +; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000d +; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000c +; GFX6-NEXT: s_bfe_i32 s52, s3, 0x10013 ; GFX6-NEXT: s_bfe_i32 s53, s3, 0x10012 ; GFX6-NEXT: s_bfe_i32 s54, s3, 0x10011 ; GFX6-NEXT: s_bfe_i32 s55, s3, 0x10010 ; GFX6-NEXT: s_bfe_i32 s56, s3, 0x10017 ; GFX6-NEXT: s_bfe_i32 s57, s3, 0x10016 ; GFX6-NEXT: s_bfe_i32 s58, s3, 0x10015 -; GFX6-NEXT: s_bfe_i32 s59, s3, 0x1001b -; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001a -; GFX6-NEXT: s_bfe_i32 s61, s3, 0x10019 -; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10018 -; GFX6-NEXT: s_ashr_i32 s63, s3, 31 -; GFX6-NEXT: s_bfe_i32 s64, s3, 0x1001e -; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001d -; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001c -; GFX6-NEXT: s_bfe_i32 s67, s3, 0x10014 -; GFX6-NEXT: s_bfe_i32 s68, s3, 0x10013 +; GFX6-NEXT: s_bfe_i32 s59, s3, 0x10014 +; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001b +; GFX6-NEXT: s_bfe_i32 s61, s3, 0x1001a +; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10019 +; GFX6-NEXT: s_bfe_i32 s63, s3, 0x10018 +; GFX6-NEXT: s_ashr_i32 s64, s3, 31 +; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001e +; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001d +; GFX6-NEXT: s_bfe_i32 s67, s3, 0x1001c +; GFX6-NEXT: s_bfe_i32 s68, s3, 0x1000f ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v1, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s64 -; GFX6-NEXT: v_mov_b32_e32 v3, s63 -; GFX6-NEXT: v_mov_b32_e32 v4, s62 -; GFX6-NEXT: v_mov_b32_e32 v5, s61 -; GFX6-NEXT: v_mov_b32_e32 v6, s60 -; GFX6-NEXT: v_mov_b32_e32 v7, s59 -; GFX6-NEXT: v_mov_b32_e32 v8, s67 +; GFX6-NEXT: v_mov_b32_e32 v0, s67 +; GFX6-NEXT: v_mov_b32_e32 v1, s66 +; GFX6-NEXT: v_mov_b32_e32 v2, s65 +; GFX6-NEXT: v_mov_b32_e32 v3, s64 +; GFX6-NEXT: v_mov_b32_e32 v4, s63 +; GFX6-NEXT: 
v_mov_b32_e32 v5, s62 +; GFX6-NEXT: v_mov_b32_e32 v6, s61 +; GFX6-NEXT: v_mov_b32_e32 v7, s60 +; GFX6-NEXT: v_mov_b32_e32 v8, s59 ; GFX6-NEXT: v_mov_b32_e32 v9, s58 ; GFX6-NEXT: v_mov_b32_e32 v10, s57 ; GFX6-NEXT: v_mov_b32_e32 v11, s56 ; GFX6-NEXT: v_mov_b32_e32 v12, s55 ; GFX6-NEXT: v_mov_b32_e32 v13, s54 ; GFX6-NEXT: v_mov_b32_e32 v14, s53 +; GFX6-NEXT: v_mov_b32_e32 v15, s52 +; GFX6-NEXT: v_mov_b32_e32 v16, s51 +; GFX6-NEXT: v_mov_b32_e32 v17, s50 +; GFX6-NEXT: v_mov_b32_e32 v18, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v15, s68 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s50 -; GFX6-NEXT: v_mov_b32_e32 v3, s49 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s48 +; GFX6-NEXT: v_mov_b32_e32 v19, s68 ; GFX6-NEXT: v_mov_b32_e32 v1, s47 ; GFX6-NEXT: v_mov_b32_e32 v2, s46 ; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s44 @@ -5099,40 +5097,40 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 
32 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 +; GFX8-NEXT: v_mov_b32_e32 v15, s4 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 -; GFX8-NEXT: v_mov_b32_e32 v13, v1 -; GFX8-NEXT: v_mov_b32_e32 v15, v1 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 +; GFX8-NEXT: v_mov_b32_e32 v12, v1 +; GFX8-NEXT: v_mov_b32_e32 v14, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0 ; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1 ; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1 -; GFX8-NEXT: v_bfe_u32 v10, v0, 3, 1 -; GFX8-NEXT: v_bfe_u32 v14, v0, 1, 1 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 -; GFX8-NEXT: v_bfe_u32 v8, v0, 2, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v24 -; GFX8-NEXT: v_bfe_u32 v0, v24, 6, 1 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v16, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, s1 +; GFX8-NEXT: v_mov_b32_e32 v17, s0 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1 +; GFX8-NEXT: v_mov_b32_e32 v15, s2 +; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v0 
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v6 +; GFX8-NEXT: v_bfe_u32 v0, v6, 6, 1 +; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[11:14] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i1_to_v8i64: @@ -5728,61 +5726,63 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 15, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 13, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 13, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v11, 10, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v12, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v13, 9, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 6, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 7, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 5, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 11, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v16, 9, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 5, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v16, 1, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 1, v1 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 1 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 1 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 1 -; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 1 -; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 1 -; GFX6-NEXT: v_bfe_i32 v14, v13, 0, 1 -; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 1 -; GFX6-NEXT: v_bfe_i32 v17, v5, 0, 1 -; GFX6-NEXT: v_bfe_i32 v15, v3, 0, 1 +; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 1 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX6-NEXT: 
buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:112 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_bfe_i32 v6, v10, 0, 1 +; GFX6-NEXT: v_bfe_i32 v4, v9, 0, 1 +; GFX6-NEXT: v_bfe_i32 v9, v8, 0, 1 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_bfe_i32 v9, v12, 0, 1 +; GFX6-NEXT: v_bfe_i32 v7, v11, 0, 1 +; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 1 +; GFX6-NEXT: v_bfe_i32 v11, v1, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 7, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_bfe_i32 v17, v1, 0, 1 +; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 1 ; GFX6-NEXT: v_bfe_i32 v21, v16, 0, 1 -; GFX6-NEXT: v_bfe_i32 v19, v1, 0, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 -; GFX6-NEXT: v_bfe_i32 v25, v1, 0, 1 -; GFX6-NEXT: v_bfe_i32 v23, v11, 0, 1 -; GFX6-NEXT: v_bfe_i32 v29, v9, 0, 1 -; GFX6-NEXT: v_bfe_i32 v27, v7, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX6-NEXT: v_bfe_i32 v19, v14, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX6-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GFX6-NEXT: v_ashrrev_i32_e32 
v24, 31, v23 -; GFX6-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GFX6-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GFX6-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 -; GFX6-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: @@ -5792,8 +5792,8 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v27, s1 -; GFX8-NEXT: v_mov_b32_e32 v26, s0 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 +; GFX8-NEXT: v_mov_b32_e32 v18, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_lshr_b32 s2, s3, 14 @@ -5831,70 +5831,70 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v23, s3 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_mov_b32_e32 v22, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; 
GFX8-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s7 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mov_b32_e32 v9, s9 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v10, s10 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NEXT: v_mov_b32_e32 v11, s11 ; GFX8-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NEXT: v_mov_b32_e32 v13, s13 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v14, s14 -; GFX8-NEXT: v_mov_b32_e32 v15, s15 -; GFX8-NEXT: v_mov_b32_e32 v16, s16 -; GFX8-NEXT: v_mov_b32_e32 v17, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[14:17] -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v18, s18 -; GFX8-NEXT: v_mov_b32_e32 v19, s19 -; GFX8-NEXT: v_mov_b32_e32 v20, s20 -; 
GFX8-NEXT: v_mov_b32_e32 v21, s21 +; GFX8-NEXT: v_mov_b32_e32 v7, s19 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: v_mov_b32_e32 v9, s21 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v22, s22 -; GFX8-NEXT: v_mov_b32_e32 v23, s23 -; GFX8-NEXT: v_mov_b32_e32 v24, s24 -; GFX8-NEXT: v_mov_b32_e32 v25, s25 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_mov_b32_e32 v10, s22 +; GFX8-NEXT: v_mov_b32_e32 v11, s23 +; GFX8-NEXT: v_mov_b32_e32 v12, s24 +; GFX8-NEXT: v_mov_b32_e32 v13, s25 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s26 -; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: v_mov_b32_e32 v6, s28 -; GFX8-NEXT: v_mov_b32_e32 v7, s29 -; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v14, s26 +; GFX8-NEXT: v_mov_b32_e32 v15, s27 +; GFX8-NEXT: v_mov_b32_e32 v16, s28 +; GFX8-NEXT: v_mov_b32_e32 v17, s29 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[14:17] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i1_to_v16i64: @@ -6607,164 +6607,164 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX6-NEXT: 
s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s52, s8, 30 -; GFX6-NEXT: s_lshr_b32 s46, s8, 31 -; GFX6-NEXT: s_lshr_b32 s48, s8, 28 -; GFX6-NEXT: s_lshr_b32 s36, s8, 29 -; GFX6-NEXT: s_lshr_b32 s38, s8, 26 -; GFX6-NEXT: s_lshr_b32 s26, s8, 27 -; GFX6-NEXT: s_lshr_b32 s28, s8, 24 -; GFX6-NEXT: s_lshr_b32 s4, s8, 25 -; GFX6-NEXT: s_lshr_b32 s6, s8, 22 -; GFX6-NEXT: s_lshr_b32 s10, s8, 23 -; GFX6-NEXT: s_lshr_b32 s12, s8, 20 -; GFX6-NEXT: s_lshr_b32 s14, s8, 21 -; GFX6-NEXT: s_lshr_b32 s16, s8, 18 -; GFX6-NEXT: s_lshr_b32 s18, s8, 19 -; GFX6-NEXT: s_lshr_b32 s20, s8, 16 -; GFX6-NEXT: s_lshr_b32 s22, s8, 17 -; GFX6-NEXT: s_lshr_b32 s24, s8, 14 -; GFX6-NEXT: s_lshr_b32 s30, s8, 15 -; GFX6-NEXT: s_lshr_b32 s34, s8, 12 -; GFX6-NEXT: s_lshr_b32 s40, s8, 13 -; GFX6-NEXT: s_lshr_b32 s42, s8, 10 -; GFX6-NEXT: s_lshr_b32 s44, s8, 11 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s8, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NEXT: s_lshr_b32 s52, s8, 9 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s46 -; GFX6-NEXT: v_mov_b32_e32 v5, s47 -; GFX6-NEXT: s_lshr_b32 s46, s8, 6 -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s8, 7 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_lshr_b32 s38, s4, 30 +; GFX6-NEXT: s_lshr_b32 s40, s4, 31 +; GFX6-NEXT: s_lshr_b32 s34, s4, 28 +; GFX6-NEXT: s_lshr_b32 s36, s4, 29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 26 +; GFX6-NEXT: s_lshr_b32 s30, s4, 27 +; GFX6-NEXT: s_lshr_b32 s24, s4, 24 +; GFX6-NEXT: s_lshr_b32 s26, s4, 25 +; GFX6-NEXT: s_lshr_b32 s20, s4, 22 +; GFX6-NEXT: s_lshr_b32 s22, s4, 23 
+; GFX6-NEXT: s_lshr_b32 s18, s4, 20 +; GFX6-NEXT: s_lshr_b32 s6, s4, 21 +; GFX6-NEXT: s_lshr_b32 s8, s4, 18 +; GFX6-NEXT: s_lshr_b32 s10, s4, 19 +; GFX6-NEXT: s_lshr_b32 s12, s4, 16 +; GFX6-NEXT: s_lshr_b32 s14, s4, 17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 14 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 +; GFX6-NEXT: s_lshr_b32 s42, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 12 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 10 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s34 +; GFX6-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 11 ; GFX6-NEXT: v_mov_b32_e32 v8, s36 ; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s8, 4 -; GFX6-NEXT: v_mov_b32_e32 v10, s38 -; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s8, 5 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s26 -; GFX6-NEXT: v_mov_b32_e32 v13, s27 -; GFX6-NEXT: s_lshr_b32 s26, s8, 2 -; GFX6-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NEXT: s_lshr_b32 s28, s8, 3 -; GFX6-NEXT: s_lshr_b32 s8, s8, 1 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_lshr_b32 s36, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], 
s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v10, s28 +; GFX6-NEXT: v_mov_b32_e32 v11, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 6 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s24 +; GFX6-NEXT: v_mov_b32_e32 v15, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v16, s26 +; GFX6-NEXT: v_mov_b32_e32 v17, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 4 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v4, s22 +; GFX6-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 2 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, s18 +; GFX6-NEXT: v_mov_b32_e32 v7, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 3 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 
0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v16, s4 -; GFX6-NEXT: v_mov_b32_e32 v17, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_mov_b32_e32 v9, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NEXT: v_mov_b32_e32 v4, s14 ; GFX6-NEXT: v_mov_b32_e32 v5, s15 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NEXT: v_mov_b32_e32 v5, s19 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s24 -; GFX6-NEXT: v_mov_b32_e32 v3, s25 -; GFX6-NEXT: v_mov_b32_e32 v4, s30 -; GFX6-NEXT: v_mov_b32_e32 v5, s31 +; GFX6-NEXT: v_mov_b32_e32 v4, s42 +; GFX6-NEXT: v_mov_b32_e32 v5, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v4, s38 +; GFX6-NEXT: v_mov_b32_e32 v5, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: v_mov_b32_e32 v4, s44 -; GFX6-NEXT: v_mov_b32_e32 v5, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v4, s34 +; GFX6-NEXT: v_mov_b32_e32 v5, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s50 -; GFX6-NEXT: v_mov_b32_e32 v3, s51 -; GFX6-NEXT: v_mov_b32_e32 v4, s52 -; GFX6-NEXT: v_mov_b32_e32 v5, s53 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; 
GFX6-NEXT: v_mov_b32_e32 v3, s47 -; GFX6-NEXT: v_mov_b32_e32 v4, s48 -; GFX6-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NEXT: v_mov_b32_e32 v5, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NEXT: v_mov_b32_e32 v5, s39 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: v_mov_b32_e32 v4, s20 +; GFX6-NEXT: v_mov_b32_e32 v5, s21 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -7332,21 +7332,21 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b ; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d ; GFX6-NEXT: s_lshr_b32 s34, s2, 31 -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s45, s3, 
0x10015 -; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s47, s3, 0x10019 -; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s49, s3, 0x1001d -; GFX6-NEXT: s_lshr_b32 s50, s3, 31 +; GFX6-NEXT: s_bfe_u32 s35, s3, 0x10003 +; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10005 +; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10007 +; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10009 +; GFX6-NEXT: s_bfe_u32 s39, s3, 0x1000b +; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000d +; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000f +; GFX6-NEXT: s_bfe_u32 s42, s3, 0x10011 +; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10013 +; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10015 +; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10017 +; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10019 +; GFX6-NEXT: s_bfe_u32 s47, s3, 0x1001b +; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001d +; GFX6-NEXT: s_lshr_b32 s49, s3, 31 ; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001 ; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001 ; GFX6-NEXT: s_and_b32 s7, s2, 1 @@ -7362,7 +7362,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012 ; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014 ; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10018 +; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 ; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a ; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c ; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e @@ -7386,63 +7386,63 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v2, s50 +; GFX6-NEXT: v_mov_b32_e32 v2, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: 
v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v2, s47 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s46 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 +; GFX6-NEXT: v_mov_b32_e32 v2, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NEXT: v_mov_b32_e32 v2, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v2, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NEXT: v_mov_b32_e32 v2, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s55 -; GFX6-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v2, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s53 @@ -7457,7 +7457,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v2, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s35 +; GFX6-NEXT: v_mov_b32_e32 v0, s50 ; GFX6-NEXT: v_mov_b32_e32 v2, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -8347,478 +8347,477 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s48, s5, 30 -; GFX6-NEXT: s_lshr_b32 s46, s5, 28 -; GFX6-NEXT: s_lshr_b32 s44, s5, 29 -; GFX6-NEXT: s_lshr_b32 s40, s5, 26 -; GFX6-NEXT: s_lshr_b32 s42, s5, 27 -; GFX6-NEXT: s_lshr_b32 s36, s5, 24 -; GFX6-NEXT: s_lshr_b32 s38, s5, 25 -; GFX6-NEXT: s_lshr_b32 s30, s5, 22 -; GFX6-NEXT: s_lshr_b32 s34, s5, 23 -; GFX6-NEXT: s_lshr_b32 s26, s5, 20 -; GFX6-NEXT: s_lshr_b32 s28, s5, 21 -; GFX6-NEXT: s_lshr_b32 s22, s5, 18 -; GFX6-NEXT: s_lshr_b32 s24, s5, 19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 16 -; GFX6-NEXT: s_lshr_b32 s20, s5, 17 -; GFX6-NEXT: s_lshr_b32 s14, s5, 14 -; GFX6-NEXT: s_lshr_b32 s16, s5, 15 -; GFX6-NEXT: s_lshr_b32 
s10, s5, 12 -; GFX6-NEXT: s_lshr_b32 s12, s5, 13 -; GFX6-NEXT: s_lshr_b32 s6, s5, 10 -; GFX6-NEXT: s_lshr_b32 s8, s5, 11 -; GFX6-NEXT: s_mov_b32 s50, s5 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[4:5], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s5, 8 -; GFX6-NEXT: v_mov_b32_e32 v4, s52 -; GFX6-NEXT: v_mov_b32_e32 v5, s53 -; GFX6-NEXT: s_lshr_b32 s52, s5, 9 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[46:47], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s46, s5, 6 -; GFX6-NEXT: v_mov_b32_e32 v10, s54 -; GFX6-NEXT: v_mov_b32_e32 v11, s55 -; GFX6-NEXT: s_lshr_b32 s48, s5, 7 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_lshr_b32 s42, s5, 30 +; GFX6-NEXT: s_lshr_b32 s36, s5, 28 +; GFX6-NEXT: s_lshr_b32 s38, s5, 29 +; GFX6-NEXT: s_lshr_b32 s30, s5, 26 +; GFX6-NEXT: s_lshr_b32 s34, s5, 27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 24 +; GFX6-NEXT: s_lshr_b32 s28, s5, 25 +; GFX6-NEXT: s_lshr_b32 s22, s5, 22 +; GFX6-NEXT: s_lshr_b32 s24, s5, 23 +; GFX6-NEXT: s_lshr_b32 s18, s5, 20 +; GFX6-NEXT: s_lshr_b32 s20, s5, 21 +; GFX6-NEXT: s_lshr_b32 s14, s5, 18 +; GFX6-NEXT: s_lshr_b32 s16, s5, 19 +; GFX6-NEXT: s_lshr_b32 s10, s5, 16 +; GFX6-NEXT: s_lshr_b32 s12, s5, 17 +; GFX6-NEXT: s_lshr_b32 s6, s5, 14 +; GFX6-NEXT: s_lshr_b32 s8, s5, 15 +; GFX6-NEXT: s_mov_b32 s40, s5 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: v_mov_b32_e32 v12, s44 -; GFX6-NEXT: v_mov_b32_e32 v13, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s40 -; GFX6-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NEXT: s_lshr_b32 s42, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s54 -; GFX6-NEXT: v_mov_b32_e32 v17, s55 -; GFX6-NEXT: s_lshr_b32 s40, s5, 2 -; GFX6-NEXT: 
v_mov_b32_e32 v8, s7 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s7 +; GFX6-NEXT: s_lshr_b32 s40, s5, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s44 +; GFX6-NEXT: v_mov_b32_e32 v7, s45 +; GFX6-NEXT: s_lshr_b32 s44, s5, 13 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: s_lshr_b32 s42, s5, 10 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v9, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:496 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s36 -; GFX6-NEXT: v_mov_b32_e32 v7, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 3 -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s5, 11 +; GFX6-NEXT: v_mov_b32_e32 v10, s38 +; GFX6-NEXT: v_mov_b32_e32 v11, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 8 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:480 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s30 -; GFX6-NEXT: v_mov_b32_e32 v11, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 30 -; GFX6-NEXT: v_mov_b32_e32 v12, s34 -; GFX6-NEXT: v_mov_b32_e32 v13, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 31 +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s5, 9 +; GFX6-NEXT: v_mov_b32_e32 v14, s34 +; GFX6-NEXT: v_mov_b32_e32 v15, s35 +; GFX6-NEXT: s_lshr_b32 s34, s5, 6 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 
v[14:17], off, s[0:3], 0 offset:464 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s26 -; GFX6-NEXT: v_mov_b32_e32 v15, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 28 -; GFX6-NEXT: v_mov_b32_e32 v16, s28 -; GFX6-NEXT: v_mov_b32_e32 v17, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 29 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 7 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: s_lshr_b32 s28, s5, 4 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:448 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s22 -; GFX6-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 26 -; GFX6-NEXT: v_mov_b32_e32 v8, s24 -; GFX6-NEXT: v_mov_b32_e32 v9, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[20:21], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s22 +; GFX6-NEXT: v_mov_b32_e32 v9, s23 +; GFX6-NEXT: s_lshr_b32 s22, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v10, s24 +; GFX6-NEXT: v_mov_b32_e32 v11, s25 +; GFX6-NEXT: s_lshr_b32 s24, s5, 2 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:432 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NEXT: s_lshr_b32 s20, s4, 24 -; GFX6-NEXT: v_mov_b32_e32 v12, s54 -; GFX6-NEXT: v_mov_b32_e32 v13, s55 -; GFX6-NEXT: s_lshr_b32 s18, s4, 25 +; GFX6-NEXT: v_mov_b32_e32 v12, s18 +; GFX6-NEXT: v_mov_b32_e32 v13, s19 +; GFX6-NEXT: s_lshr_b32 s18, s5, 3 +; GFX6-NEXT: 
v_mov_b32_e32 v14, s20 +; GFX6-NEXT: v_mov_b32_e32 v15, s21 +; GFX6-NEXT: s_lshr_b32 s20, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:416 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 22 -; GFX6-NEXT: v_mov_b32_e32 v16, s16 -; GFX6-NEXT: v_mov_b32_e32 v17, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 23 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 30 +; GFX6-NEXT: v_mov_b32_e32 v4, s16 +; GFX6-NEXT: v_mov_b32_e32 v5, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 31 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:400 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_mov_b32_e32 v7, s11 -; GFX6-NEXT: s_lshr_b32 s10, s4, 20 -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s4, 21 +; GFX6-NEXT: v_mov_b32_e32 v8, s10 +; GFX6-NEXT: v_mov_b32_e32 v9, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 28 +; GFX6-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 29 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:384 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NEXT: s_lshr_b32 s46, s4, 26 +; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NEXT: s_lshr_b32 
s8, s4, 27 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: s_lshr_b32 s44, s4, 25 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 22 ; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: s_lshr_b32 s6, s4, 18 -; GFX6-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 19 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:368 +; GFX6-NEXT: s_lshr_b32 s42, s4, 23 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 20 +; GFX6-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NEXT: s_lshr_b32 s6, s4, 21 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v16, s34 +; GFX6-NEXT: v_mov_b32_e32 v17, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 18 +; GFX6-NEXT: v_mov_b32_e32 v18, s26 +; GFX6-NEXT: v_mov_b32_e32 v19, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 19 +; 
GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, s28 +; GFX6-NEXT: v_mov_b32_e32 v9, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 17 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s50 -; GFX6-NEXT: v_mov_b32_e32 v15, s51 -; GFX6-NEXT: s_lshr_b32 s50, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, s52 -; GFX6-NEXT: v_mov_b32_e32 v17, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 17 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:352 +; GFX6-NEXT: v_mov_b32_e32 v12, s24 +; GFX6-NEXT: v_mov_b32_e32 v13, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 14 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s18 +; GFX6-NEXT: v_mov_b32_e32 v15, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 12 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v18, s16 +; GFX6-NEXT: v_mov_b32_e32 v19, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 10 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, 
s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s46 -; GFX6-NEXT: v_mov_b32_e32 v7, s47 -; GFX6-NEXT: s_lshr_b32 s46, s4, 14 -; GFX6-NEXT: v_mov_b32_e32 v8, s48 -; GFX6-NEXT: v_mov_b32_e32 v9, s49 -; GFX6-NEXT: s_lshr_b32 s48, s4, 15 -; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[44:45], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:336 +; GFX6-NEXT: v_mov_b32_e32 v8, s10 +; GFX6-NEXT: v_mov_b32_e32 v9, s11 +; GFX6-NEXT: s_lshr_b32 s10, s4, 11 +; GFX6-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: v_mov_b32_e32 v12, s54 -; GFX6-NEXT: v_mov_b32_e32 v13, s55 -; GFX6-NEXT: s_lshr_b32 s44, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v12, s38 +; GFX6-NEXT: v_mov_b32_e32 v13, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: v_mov_b32_e32 v15, s9 +; GFX6-NEXT: s_lshr_b32 s8, s4, 6 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:320 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s40 -; GFX6-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: s_lshr_b32 s44, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], 
s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v16, s36 ; GFX6-NEXT: v_mov_b32_e32 v17, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_lshr_b32 s36, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v18, s42 +; GFX6-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NEXT: s_lshr_b32 s42, s4, 2 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:304 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NEXT: v_mov_b32_e32 v7, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v8, s34 -; GFX6-NEXT: v_mov_b32_e32 v9, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:288 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v12, s28 -; GFX6-NEXT: v_mov_b32_e32 v13, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:272 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s24 -; GFX6-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:256 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v8, s30 +; GFX6-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 3 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:224 -; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v11, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(2) +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: 
s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NEXT: v_mov_b32_e32 v2, s48 -; GFX6-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NEXT: v_mov_b32_e32 v0, s24 +; GFX6-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s38 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s28 -; 
GFX6-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NEXT: v_mov_b32_e32 v9, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s90, -1 -; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; GFX8-NEXT: s_add_u32 s88, s88, s11 -; GFX8-NEXT: s_addc_u32 s89, s89, 0 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s0, s3, 8 +; GFX8-NEXT: s_lshr_b32 s48, s3, 15 ; GFX8-NEXT: v_writelane_b32 v62, s0, 0 -; GFX8-NEXT: v_writelane_b32 v62, s1, 1 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: s_lshr_b32 s36, s3, 21 -; GFX8-NEXT: s_lshr_b32 s30, s3, 19 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 ; GFX8-NEXT: s_lshr_b32 s74, 
s3, 30 -; GFX8-NEXT: s_lshr_b32 s50, s3, 31 +; GFX8-NEXT: s_lshr_b32 s30, s3, 31 ; GFX8-NEXT: s_lshr_b32 s72, s3, 28 -; GFX8-NEXT: s_lshr_b32 s48, s3, 29 +; GFX8-NEXT: s_lshr_b32 s34, s3, 29 ; GFX8-NEXT: s_lshr_b32 s70, s3, 26 -; GFX8-NEXT: s_lshr_b32 s46, s3, 27 +; GFX8-NEXT: s_lshr_b32 s36, s3, 27 ; GFX8-NEXT: s_lshr_b32 s68, s3, 24 -; GFX8-NEXT: s_lshr_b32 s42, s3, 25 -; GFX8-NEXT: s_lshr_b32 s66, s3, 22 +; GFX8-NEXT: s_lshr_b32 s38, s3, 25 +; GFX8-NEXT: s_lshr_b32 s64, s3, 22 ; GFX8-NEXT: s_lshr_b32 s40, s3, 23 -; GFX8-NEXT: s_lshr_b32 s64, s3, 20 -; GFX8-NEXT: s_lshr_b32 s62, s3, 18 +; GFX8-NEXT: s_lshr_b32 s60, s3, 20 +; GFX8-NEXT: s_lshr_b32 s42, s3, 21 +; GFX8-NEXT: s_lshr_b32 s66, s3, 18 +; GFX8-NEXT: s_lshr_b32 s44, s3, 19 ; GFX8-NEXT: s_lshr_b32 s56, s3, 16 -; GFX8-NEXT: s_lshr_b32 s18, s3, 17 +; GFX8-NEXT: s_lshr_b32 s46, s3, 17 ; GFX8-NEXT: s_lshr_b32 s58, s3, 14 -; GFX8-NEXT: s_lshr_b32 s38, s3, 15 -; GFX8-NEXT: s_lshr_b32 s60, s3, 12 -; GFX8-NEXT: s_lshr_b32 s44, s3, 13 +; GFX8-NEXT: s_lshr_b32 s62, s3, 12 ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX8-NEXT: v_writelane_b32 v62, s0, 2 +; GFX8-NEXT: v_writelane_b32 v62, s1, 1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 9 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s3, 11 -; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX8-NEXT: v_writelane_b32 v62, s0, 2 +; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; 
GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v18, s36 -; GFX8-NEXT: v_mov_b32_e32 v19, s37 -; GFX8-NEXT: v_mov_b32_e32 v26, s30 -; GFX8-NEXT: v_mov_b32_e32 v27, s31 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[44:45], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v34, s48 +; GFX8-NEXT: s_lshr_b32 s48, s2, 1 +; GFX8-NEXT: s_lshr_b32 s50, s3, 13 ; GFX8-NEXT: v_writelane_b32 v62, s1, 3 -; GFX8-NEXT: s_lshr_b32 s6, s3, 9 -; GFX8-NEXT: s_lshr_b32 s8, s3, 6 +; GFX8-NEXT: s_lshr_b32 s6, s3, 6 ; GFX8-NEXT: s_lshr_b32 s10, s3, 7 ; GFX8-NEXT: s_lshr_b32 s12, s3, 4 ; GFX8-NEXT: s_lshr_b32 s14, s3, 5 ; GFX8-NEXT: s_lshr_b32 s16, s3, 2 -; GFX8-NEXT: s_lshr_b32 s20, s3, 3 -; GFX8-NEXT: s_lshr_b32 s22, s3, 1 -; GFX8-NEXT: s_mov_b32 s24, s3 -; GFX8-NEXT: s_lshr_b32 s26, s2, 30 -; GFX8-NEXT: s_lshr_b32 s28, s2, 31 -; GFX8-NEXT: s_lshr_b32 s34, s2, 28 +; GFX8-NEXT: s_lshr_b32 s18, s3, 3 +; GFX8-NEXT: s_lshr_b32 s20, s3, 1 +; GFX8-NEXT: s_mov_b32 s22, s3 +; GFX8-NEXT: s_lshr_b32 s24, s2, 30 +; GFX8-NEXT: s_lshr_b32 s26, s2, 31 +; GFX8-NEXT: s_lshr_b32 s28, s2, 28 ; GFX8-NEXT: v_mov_b32_e32 v4, s74 -; GFX8-NEXT: v_mov_b32_e32 v8, s72 +; GFX8-NEXT: v_mov_b32_e32 
v12, s72 ; GFX8-NEXT: v_mov_b32_e32 v0, s70 -; GFX8-NEXT: v_mov_b32_e32 v54, s68 -; GFX8-NEXT: v_mov_b32_e32 v20, s66 +; GFX8-NEXT: v_mov_b32_e32 v8, s68 ; GFX8-NEXT: v_mov_b32_e32 v16, s64 -; GFX8-NEXT: v_mov_b32_e32 v24, s62 +; GFX8-NEXT: v_mov_b32_e32 v20, s60 +; GFX8-NEXT: v_mov_b32_e32 v24, s66 ; GFX8-NEXT: v_mov_b32_e32 v28, s56 ; GFX8-NEXT: v_mov_b32_e32 v32, s58 -; GFX8-NEXT: v_mov_b32_e32 v36, s60 +; GFX8-NEXT: v_mov_b32_e32 v36, s62 ; GFX8-NEXT: s_lshr_b32 s86, s2, 29 ; GFX8-NEXT: v_mov_b32_e32 v40, s54 ; GFX8-NEXT: s_lshr_b32 s84, s2, 26 ; GFX8-NEXT: s_lshr_b32 s82, s2, 27 +; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 ; GFX8-NEXT: s_lshr_b32 s80, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v6, s50 +; GFX8-NEXT: v_mov_b32_e32 v6, s30 +; GFX8-NEXT: v_mov_b32_e32 v7, s31 ; GFX8-NEXT: s_lshr_b32 s78, s2, 25 ; GFX8-NEXT: s_lshr_b32 s76, s2, 22 -; GFX8-NEXT: v_mov_b32_e32 v10, s48 +; GFX8-NEXT: v_mov_b32_e32 v14, s34 ; GFX8-NEXT: s_lshr_b32 s74, s2, 23 ; GFX8-NEXT: s_lshr_b32 s72, s2, 20 -; GFX8-NEXT: v_mov_b32_e32 v2, s46 +; GFX8-NEXT: v_mov_b32_e32 v2, s36 ; GFX8-NEXT: s_lshr_b32 s70, s2, 21 ; GFX8-NEXT: s_lshr_b32 s68, s2, 18 -; GFX8-NEXT: v_mov_b32_e32 v56, s42 +; GFX8-NEXT: v_mov_b32_e32 v10, s38 ; GFX8-NEXT: s_lshr_b32 s66, s2, 19 ; GFX8-NEXT: s_lshr_b32 s64, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v22, s40 +; GFX8-NEXT: v_mov_b32_e32 v18, s40 ; GFX8-NEXT: s_lshr_b32 s62, s2, 17 ; GFX8-NEXT: s_lshr_b32 s60, s2, 14 +; GFX8-NEXT: v_mov_b32_e32 v22, s42 ; GFX8-NEXT: s_lshr_b32 s58, s2, 15 ; GFX8-NEXT: s_lshr_b32 s56, s2, 12 +; GFX8-NEXT: v_mov_b32_e32 v26, s44 ; GFX8-NEXT: s_lshr_b32 s54, s2, 13 -; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s2, 10 -; GFX8-NEXT: v_mov_b32_e32 v30, s18 -; GFX8-NEXT: v_mov_b32_e32 v31, s19 -; GFX8-NEXT: s_lshr_b32 s50, s2, 11 -; GFX8-NEXT: s_lshr_b32 s48, s2, 8 -; GFX8-NEXT: v_mov_b32_e32 v34, s36 +; GFX8-NEXT: v_mov_b32_e32 v30, s46 +; GFX8-NEXT: s_lshr_b32 s4, s2, 11 +; GFX8-NEXT: s_lshr_b32 s0, s2, 8 ; 
GFX8-NEXT: s_lshr_b32 s46, s2, 9 ; GFX8-NEXT: s_lshr_b32 s44, s2, 6 -; GFX8-NEXT: v_mov_b32_e32 v38, s30 ; GFX8-NEXT: s_lshr_b32 s42, s2, 7 ; GFX8-NEXT: s_lshr_b32 s40, s2, 4 ; GFX8-NEXT: s_lshr_b32 s38, s2, 5 ; GFX8-NEXT: s_lshr_b32 s36, s2, 2 -; GFX8-NEXT: s_lshr_b32 s30, s2, 3 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000 +; GFX8-NEXT: s_lshr_b32 s34, s2, 3 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000 +; GFX8-NEXT: v_writelane_b32 v62, s2, 4 +; GFX8-NEXT: v_writelane_b32 v62, s3, 5 +; GFX8-NEXT: v_readlane_b32 s2, v62, 2 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX8-NEXT: v_readlane_b32 s3, v62, 3 +; GFX8-NEXT: v_mov_b32_e32 v38, s50 +; GFX8-NEXT: v_mov_b32_e32 v39, s51 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 -; GFX8-NEXT: v_mov_b32_e32 v7, s51 -; GFX8-NEXT: v_mov_b32_e32 v9, s73 -; GFX8-NEXT: v_mov_b32_e32 v11, s49 +; GFX8-NEXT: v_mov_b32_e32 v13, s73 +; GFX8-NEXT: v_mov_b32_e32 v15, s35 ; GFX8-NEXT: v_mov_b32_e32 v1, s71 -; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: v_mov_b32_e32 v55, s69 -; GFX8-NEXT: v_mov_b32_e32 v57, s43 -; GFX8-NEXT: v_mov_b32_e32 v21, s67 -; GFX8-NEXT: v_mov_b32_e32 v23, s41 +; GFX8-NEXT: v_mov_b32_e32 v3, s37 +; GFX8-NEXT: v_mov_b32_e32 v9, s69 +; GFX8-NEXT: v_mov_b32_e32 v11, s39 ; GFX8-NEXT: v_mov_b32_e32 v17, s65 -; GFX8-NEXT: v_mov_b32_e32 v25, s63 +; GFX8-NEXT: v_mov_b32_e32 v19, s41 +; GFX8-NEXT: v_mov_b32_e32 v21, s61 +; GFX8-NEXT: v_mov_b32_e32 v23, s43 +; GFX8-NEXT: v_mov_b32_e32 v25, s67 +; GFX8-NEXT: v_mov_b32_e32 v27, s45 ; GFX8-NEXT: v_mov_b32_e32 v29, s57 +; GFX8-NEXT: v_mov_b32_e32 v31, s47 ; GFX8-NEXT: v_mov_b32_e32 v33, s59 -; GFX8-NEXT: v_mov_b32_e32 v35, s37 -; GFX8-NEXT: v_mov_b32_e32 v37, s61 -; GFX8-NEXT: v_mov_b32_e32 v39, 
s31 +; GFX8-NEXT: v_mov_b32_e32 v35, s49 +; GFX8-NEXT: v_mov_b32_e32 v37, s63 ; GFX8-NEXT: v_mov_b32_e32 v41, s55 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 @@ -8837,269 +8836,262 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1f0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x10000 +; GFX8-NEXT: s_add_u32 s2, 
s8, 0x1f0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: v_mov_b32_e32 v42, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1e0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1e0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1d0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1d0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: v_mov_b32_e32 v46, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1c0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: v_mov_b32_e32 v48, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1b0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1b0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: v_mov_b32_e32 v50, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x1a0 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: v_mov_b32_e32 v52, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x190 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x180 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[54:57] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[20:23] -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[16:19] -; GFX8-NEXT: flat_store_dwordx4 v[14:15], 
v[24:27] -; GFX8-NEXT: buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: s_add_u32 s2, s4, 0x170 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x190 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v55, s3 +; GFX8-NEXT: v_mov_b32_e32 v54, s2 +; GFX8-NEXT: s_add_u32 s2, s8, 0x180 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v57, s3 +; GFX8-NEXT: v_mov_b32_e32 v56, s2 +; GFX8-NEXT: s_add_u32 s2, s8, 0x170 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: v_mov_b32_e32 v58, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x160 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x160 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: v_mov_b32_e32 v60, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x150 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v45, s3 -; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s4, 0x140 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x130 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_add_u32 s2, s8, 0x150 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v13, s3 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 +; GFX8-NEXT: s_add_u32 s2, s8, 0x140 +; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: 
s_add_u32 s0, s4, 0x120 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s1 -; GFX8-NEXT: v_mov_b32_e32 v14, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x110 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x120 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 +; GFX8-NEXT: v_mov_b32_e32 v18, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: v_mov_b32_e32 v11, s15 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[28:31] +; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] ; GFX8-NEXT: flat_store_dwordx4 v[58:59], v[32:35] ; GFX8-NEXT: flat_store_dwordx4 v[60:61], v[36:39] -; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[40:43] -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[40:43] +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x100 +; GFX8-NEXT: 
s_add_u32 s0, s8, 0x100 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xf0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 -; GFX8-NEXT: v_mov_b32_e32 v2, s22 -; GFX8-NEXT: v_mov_b32_e32 v3, s23 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xe0 -; GFX8-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NEXT: v_mov_b32_e32 v1, s27 -; GFX8-NEXT: v_mov_b32_e32 v2, s28 -; GFX8-NEXT: v_mov_b32_e32 v3, s29 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v0, s34 -; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: s_add_u32 s0, s8, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s86 ; GFX8-NEXT: v_mov_b32_e32 v3, s87 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xc0 
+; GFX8-NEXT: s_add_u32 s0, s8, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s84 ; GFX8-NEXT: v_mov_b32_e32 v1, s85 ; GFX8-NEXT: v_mov_b32_e32 v2, s82 ; GFX8-NEXT: v_mov_b32_e32 v3, s83 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xb0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s80 ; GFX8-NEXT: v_mov_b32_e32 v1, s81 ; GFX8-NEXT: v_mov_b32_e32 v2, s78 ; GFX8-NEXT: v_mov_b32_e32 v3, s79 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0xa0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s76 ; GFX8-NEXT: v_mov_b32_e32 v1, s77 ; GFX8-NEXT: v_mov_b32_e32 v2, s74 ; GFX8-NEXT: v_mov_b32_e32 v3, s75 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x90 +; GFX8-NEXT: s_add_u32 s0, s8, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s72 ; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v2, s70 ; GFX8-NEXT: v_mov_b32_e32 v3, s71 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x80 +; GFX8-NEXT: s_add_u32 s0, s8, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NEXT: v_mov_b32_e32 v2, s66 ; GFX8-NEXT: v_mov_b32_e32 v3, s67 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x70 +; 
GFX8-NEXT: s_add_u32 s0, s8, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x60 +; GFX8-NEXT: s_add_u32 s0, s8, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0x50 +; GFX8-NEXT: s_add_u32 s0, s8, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 64 +; GFX8-NEXT: s_add_u32 s0, s8, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 48 +; GFX8-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 32 +; GFX8-NEXT: 
s_add_u32 s0, s8, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 16 +; GFX8-NEXT: s_add_u32 s0, s8, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s41 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NEXT: v_mov_b32_e32 v2, s30 -; GFX8-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_readlane_b32 s0, v62, 2 +; GFX8-NEXT: v_readlane_b32 s0, v62, 4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_readlane_b32 s1, v62, 3 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_readlane_b32 s1, v62, 5 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NEXT: v_mov_b32_e32 v1, s31 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index bb98af4e7a5c7..255a1acbe0086 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -637,8 +637,8 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: flat_load_ushort v19, v[6:7] ; 
GCN-NOHSA-VI-NEXT: flat_load_ushort v20, v[8:9] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[10:11] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[12:13] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v23, v[14:15] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[12:13] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[14:15] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -664,18 +664,18 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0 -; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v24, v[2:3] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v14, v[0:1] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v15, v[2:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v5, v[6:7] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v8, v[8:9] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v9, v[10:11] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[12:13] -; GCN-NOHSA-VI-NEXT: flat_load_ushort v11, v[14:15] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1] +; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[2:3] ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v17, v1 @@ -688,25 +688,25 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10) ; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v21, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(9) -; 
GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v23, v6 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v13, v6 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v24, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v15, v6 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v5, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v5, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v11, v0 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v10, v0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -2502,29 +2502,27 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s3, 
16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff @@ -2534,56 +2532,60 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; 
GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: @@ -2622,32 +2624,32 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s0, s15, 0xffff -; GCN-HSA-NEXT: s_and_b32 s1, s14, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; 
GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 @@ -2981,88 +2983,90 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s0 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s2, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s4, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s9, 16 +; 
GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s10, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s12, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, 
s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: @@ -3073,8 +3077,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 ; GCN-HSA-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-HSA-NEXT: s_ashr_i32 s23, s2, 16 ; GCN-HSA-NEXT: s_ashr_i32 s24, s5, 16 @@ -3087,34 +3089,36 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i32 s31, s10, 16 ; GCN-HSA-NEXT: s_ashr_i32 s33, s13, 16 ; GCN-HSA-NEXT: s_ashr_i32 s34, s12, 16 -; GCN-HSA-NEXT: s_ashr_i32 s0, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s1, s14, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GCN-HSA-NEXT: s_ashr_i32 s35, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s36, s14, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 @@ -3524,18 +3528,18 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s21, 
s21, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -3555,22 +3559,21 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -3652,10 +3655,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s44, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s47, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff @@ -3664,13 +3667,13 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s42, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s44, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s46, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s41, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s50, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s51, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s51, s12, 0xffff +; 
GCN-HSA-NEXT: s_and_b32 s50, s15, 0xffff ; GCN-HSA-NEXT: s_and_b32 s52, s14, 0xffff ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3708,111 +3711,111 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s54 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s53 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3854,57 +3857,34 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], 
s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s17, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s16, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s21, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s20, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s23, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s22, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s25, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s24, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s27, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s26, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s29, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s28, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s31, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s30, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff -; 
GCN-NOHSA-VI-NEXT: s_and_b32 s67, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s18, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s21, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s20, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s23, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s22, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s25, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s24, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s27, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s26, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s29, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s28, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s9, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff @@ -3919,151 +3899,170 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s31, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s30, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: 
s_add_u32 s10, s36, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s14, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xe0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xd0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xc0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xb0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xa0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x90 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x80 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 
s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x70 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x60 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x50 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s36, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s36, 32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s36, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4437,16 +4436,17 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s7, 16 -; 
GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s6, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s6, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s62, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s8, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s9 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s8, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s11, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s13, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 @@ -4455,8 +4455,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s7, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -4474,24 +4473,23 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 @@ -4586,10 +4584,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_sext_i32_i16 s43, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s44, s10 ; GCN-HSA-NEXT: s_ashr_i32 s45, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s46, s12, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s47, s13 -; GCN-HSA-NEXT: s_sext_i32_i16 s48, s12 -; GCN-HSA-NEXT: s_ashr_i32 s49, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s47, s12, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s46, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s49, s12 +; GCN-HSA-NEXT: s_ashr_i32 s48, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s50, s14, 16 ; GCN-HSA-NEXT: s_sext_i32_i16 s51, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s52, s14 @@ -4597,8 +4595,8 @@ define amdgpu_kernel void 
@constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-HSA-NEXT: s_ashr_i32 s55, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s53, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s54, s2, 16 ; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 ; GCN-HSA-NEXT: s_ashr_i32 s58, s4, 16 ; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 16 @@ -4611,114 +4609,114 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i32 s66, s12, 16 ; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s68, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s54, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_sext_i32_i16 s53, s1 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s56, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 +; GCN-HSA-NEXT: s_sext_i32_i16 s55, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 +; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], 
v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], 
v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s49 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 
v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 @@ -7033,104 +7031,102 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s16, s15 -; GCN-HSA-NEXT: s_mov_b32 s18, s13 -; GCN-HSA-NEXT: s_mov_b32 s20, s11 -; GCN-HSA-NEXT: s_mov_b32 s22, s9 -; GCN-HSA-NEXT: s_lshr_b32 s24, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 +; GCN-HSA-NEXT: s_mov_b32 s6, s19 +; GCN-HSA-NEXT: s_mov_b32 s10, s17 +; GCN-HSA-NEXT: s_mov_b32 s20, s15 +; GCN-HSA-NEXT: s_mov_b32 s22, s13 +; GCN-HSA-NEXT: s_lshr_b32 s24, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16 +; 
GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[18:19], s[18:19], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[16:17], s[16:17], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[28:29], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s28, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s29, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -7403,106 +7399,108 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 
s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 
s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, 
s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v0, s0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: @@ -7513,141 +7511,142 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 ; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff -; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 
s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff -; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s2, s15, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; 
GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: 
s_add_u32 s4, s16, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -8091,144 +8090,140 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s8, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[70:71], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], 
s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[60:61], s[0:1], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[48:49], s[0:1], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[58:59], s[2:3], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s71 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[54:55], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s50 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s74 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s75 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v25, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v26, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8237,13 +8232,13 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s42, s15 +; GCN-HSA-NEXT: s_mov_b32 s40, s15 ; GCN-HSA-NEXT: s_mov_b32 s48, s13 ; GCN-HSA-NEXT: s_mov_b32 s50, s11 ; GCN-HSA-NEXT: s_mov_b32 s52, s9 ; GCN-HSA-NEXT: s_mov_b32 s54, s7 ; GCN-HSA-NEXT: s_mov_b32 s56, s5 -; GCN-HSA-NEXT: s_mov_b32 s46, s3 +; GCN-HSA-NEXT: s_mov_b32 s44, s3 ; GCN-HSA-NEXT: s_mov_b32 s58, s1 ; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 ; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 @@ -8258,15 +8253,15 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[40:41], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[44:45], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 
s[78:79], s[10:11], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 @@ -8282,8 +8277,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 @@ -8299,84 +8294,82 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 ; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s42 +; GCN-HSA-NEXT: s_add_u32 s42, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s43 +; GCN-HSA-NEXT: 
s_add_u32 s42, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s43, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 +; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s37 
-; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s38 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v27, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 @@ -8441,208 +8434,211 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s1, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[20:21], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[26:27], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 -; GCN-NOHSA-VI-NEXT: 
s_bfe_i64 s[34:35], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s13 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s0, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s2, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s60, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s12, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s68, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s74, s14, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s76, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s78, s15, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[6:7], 0x100000 -; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s13, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s15, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xe0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xd0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[72:73], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[62:63], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[68:69], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[70:71], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[74:75], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[76:77], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[78:79], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 +; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 -; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xc0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 +; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xb0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 
+; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x90 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x80 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 +; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v2, s6 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 +; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v1, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v5, s17 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 6eeaec12c3d14..341332e60b5c0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -2713,37 +2713,39 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s3 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s33 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s30 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:80 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:64 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s26 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:48 
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[16:19], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[16:19], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s22 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s20 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: @@ -2752,97 +2754,91 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s20, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s21, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s22, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s23, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s24, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s25, s6, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s26, s9, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s8, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s28, s11, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s10, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s13, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s12, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s15, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s14, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s20, s1, 31 +; GFX7-HSA-NEXT: 
s_ashr_i32 s21, s0, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s22, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s26, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s6, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s28, s9, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s8, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s30, s11, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s31, s10, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s13, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s12, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s35, s15, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: s_add_u32 s14, s16, 0x70 +; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; 
GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[9:12] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -3500,137 +3496,135 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s38, -1 -; GFX6-NOHSA-NEXT: 
s_mov_b32 s36, s16 -; GFX6-NOHSA-NEXT: s_mov_b32 s37, s17 -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[18:19], 0x10 +; GFX6-NOHSA-NEXT: s_mov_b32 s36, s0 +; GFX6-NOHSA-NEXT: s_mov_b32 s37, s1 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s0, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s2, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s4, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s6, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s45, s17, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s46, s16, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s47, s19, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s48, s18, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s21, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s50, s20, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s51, s23, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s30, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s31, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s28, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s29, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s26, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s27, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s52 -; GFX6-NOHSA-NEXT: s_ashr_i32 s52, s22, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s25, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s17, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s16, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s19, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s18, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s21, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s20, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s30, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s31, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s28, 31 +; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s44 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s29, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s23, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s44 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s22, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s16 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s25, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s27, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s26, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s24, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s18 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s17 -; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s24, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s9, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s8, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s11, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, 
s10, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s13, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s12, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s15, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s14, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s53 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s0, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s2, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s4, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s6, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s9, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s8, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s10, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s13, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s12, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s22 +; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s15, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s16 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s14, 31 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s51 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 
v1, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:128 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s33 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s22 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s30 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 
offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s26 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s24 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NOHSA-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[36:39], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -3646,45 +3640,45 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s26, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s6, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s28, s9, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s8, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s11, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s10, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s13, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s12, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s35, s15, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GFX7-HSA-NEXT: s_ashr_i32 s28, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s6, 31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s9, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s37, s8, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s38, s11, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s39, s10, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s40, s13, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s41, s12, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s42, s15, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s14, 31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v30, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s1 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s38, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s39, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s40, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s42, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s43, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s26, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s30, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s31, s4, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s44, s6, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s45, s9, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s46, s8, 31 @@ -3694,105 +3688,101 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_ashr_i32 s50, s12, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s51, s15, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s52, s14, 31 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s19 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v33, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xd0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s28 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0xa0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x90 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x80 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[28:31] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: s_add_u32 
s14, s16, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 +; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 +; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 +; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s25 +; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 +; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 +; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47 +; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -3801,8 +3791,8 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -3811,15 +3801,15 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -4193,43 +4183,37 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-HSA-NEXT: s_ashr_i32 s65, s31, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s66, s30, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s63, s29, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s64, s28, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s30, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s31, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s28, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s29, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s26, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v8, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s27, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s24, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v11, s58 +; GFX9-HSA-NEXT: s_ashr_i32 s58, s25, 31 
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s66 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s65 -; GFX9-HSA-NEXT: s_ashr_i32 s61, s27, 31 -; GFX9-HSA-NEXT: s_ashr_i32 s62, s26, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 -; GFX9-HSA-NEXT: s_ashr_i32 s59, s25, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s28 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s64 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s63 -; GFX9-HSA-NEXT: s_ashr_i32 s60, s24, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:224 ; GFX9-HSA-NEXT: s_ashr_i32 s57, s23, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s62 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s61 +; GFX9-HSA-NEXT: v_mov_b32_e32 v13, s58 ; GFX9-HSA-NEXT: s_ashr_i32 s58, s22, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:208 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 +; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s29 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX9-HSA-NEXT: s_ashr_i32 s55, s21, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s60 -; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s59 ; GFX9-HSA-NEXT: s_ashr_i32 s56, s20, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:192 -; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[4:7], s[36:37] offset:224 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s57 +; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 ; GFX9-HSA-NEXT: s_ashr_i32 s54, s18, 31 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:176 ; GFX9-HSA-NEXT: s_ashr_i32 s51, s17, 31 @@ -4294,14 +4278,18 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr 
addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s39 ; GFX9-HSA-NEXT: s_ashr_i32 s34, s0, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s26 +; GFX9-HSA-NEXT: v_mov_b32_e32 v9, s27 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:32 -; GFX9-HSA-NEXT: s_nop 0 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[7:10], s[36:37] offset:208 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s35 +; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-HSA-NEXT: v_mov_b32_e32 v12, s25 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:16 -; GFX9-HSA-NEXT: s_nop 0 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[10:13], s[36:37] offset:192 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s1 @@ -4496,64 +4484,64 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; 
GFX7-HSA-NEXT: s_add_u32 s34, s36, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: s_add_u32 s28, s36, 0xe0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GFX7-HSA-NEXT: s_addc_u32 s29, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: s_add_u32 s26, s36, 0xd0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GFX7-HSA-NEXT: s_addc_u32 s27, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0xc0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s36, 0xb0 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: s_addc_u32 s23, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s36, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: s_addc_u32 s19, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: s_add_u32 s16, s36, 0x80 @@ -4562,7 +4550,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: s_add_u32 s14, s36, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 @@ -5111,53 +5099,52 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s34, s36, 0x70 -; 
GFX7-HSA-NEXT: s_addc_u32 s35, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s35 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s36, 
0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: s_add_u32 s16, s36, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: s_addc_u32 s17, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s36, 48 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s37, 0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 @@ -5165,9 +5152,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s36, 32 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s37, 0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, 
s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 @@ -5175,20 +5162,20 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s36, 16 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: s_addc_u32 s5, s37, 0 +; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 102c33ec31b09..b3e75e767ae64 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -638,53 +638,52 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 -; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GFX7-NEXT: s_add_u32 s34, s36, 0x70 -; GFX7-NEXT: s_addc_u32 s35, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s34 -; GFX7-NEXT: v_mov_b32_e32 v6, s35 +; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s28 -; 
GFX7-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-NEXT: s_add_u32 s24, s36, 0x60 -; GFX7-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX7-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-NEXT: s_addc_u32 s25, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-NEXT: v_mov_b32_e32 v6, s26 -; GFX7-NEXT: v_mov_b32_e32 v7, s27 -; GFX7-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-NEXT: v_mov_b32_e32 v8, s4 +; GFX7-NEXT: v_mov_b32_e32 v9, s5 +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-NEXT: v_mov_b32_e32 v13, s1 +; GFX7-NEXT: v_mov_b32_e32 v14, s2 +; GFX7-NEXT: v_mov_b32_e32 v15, s3 +; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX7-NEXT: s_add_u32 s18, s16, 0x70 +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: s_add_u32 s18, s16, 0x50 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: s_add_u32 s20, s36, 0x50 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: s_addc_u32 s21, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-NEXT: s_nop 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: s_add_u32 s16, s36, 64 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: s_addc_u32 
s17, s37, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: s_add_u32 s18, s16, 64 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-NEXT: s_add_u32 s12, s36, 48 +; GFX7-NEXT: s_add_u32 s12, s16, 48 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: s_addc_u32 s13, s37, 0 +; GFX7-NEXT: s_addc_u32 s13, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 @@ -692,9 +691,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: s_add_u32 s8, s36, 32 +; GFX7-NEXT: s_add_u32 s8, s16, 32 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_addc_u32 s9, s37, 0 +; GFX7-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 @@ -702,20 +701,20 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_add_u32 s4, s36, 16 +; GFX7-NEXT: s_add_u32 s4, s16, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_addc_u32 s5, s37, 0 +; GFX7-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GFX7-NEXT: v_mov_b32_e32 v4, s36 +; GFX7-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index ff55ab8859c83..efc31fbd5ed9e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2391,48 +2391,48 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s23, s9, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s24, s10, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s25, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s2, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s3, s11, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 s26, s4, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s26, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s27, s11, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s28, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff +; GFX7-HSA-NEXT: s_and_b32 s29, s5, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff +; GFX7-HSA-NEXT: s_and_b32 s30, s6, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff +; GFX7-HSA-NEXT: s_and_b32 s31, s7, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s30, s8, 0xff +; GFX7-HSA-NEXT: s_and_b32 s33, s8, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s31, s9, 0xff +; GFX7-HSA-NEXT: s_and_b32 s34, s9, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff +; GFX7-HSA-NEXT: s_and_b32 s35, s10, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s34, s11, 0xff +; GFX7-HSA-NEXT: s_and_b32 s36, s11, 0xff ; 
GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 @@ -2441,7 +2441,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 
v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 @@ -2450,7 +2450,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 @@ -2459,21 +2459,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 @@ -2880,33 +2880,33 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s30, s10, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s31, s10, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s33, s10, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s2, s11, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s3, s11, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s34, s11, 
0x80008 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s11, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s35, s11, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s36, s11, 0x80008 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 @@ -3281,32 +3281,32 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; 
GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008 ; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s4, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s28, s4, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s30, s5, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s31, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s33, s6, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s35, s7, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s8, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s37, s8, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s9, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s39, s9, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s10, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s41, s10, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s11, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s43, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s12, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s45, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s13, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s47, s13, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s14, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s49, s14, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s15, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s51, s15, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s52, s0, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s3, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s6, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s37, s8, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s38, s8, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s39, s9, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s40, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s41, s10, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s42, s10, 0x80008 +; 
GFX6-NOHSA-NEXT: s_lshr_b32 s43, s11, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s45, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s46, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s47, s13, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s48, s13, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s49, s14, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s50, s14, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s51, s15, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s52, s15, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s26, s0, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s0, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s53, s1, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s54, s1, 0x80010 @@ -3327,92 +3327,91 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_and_b32 s64, s9, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s65, s10, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s66, s11, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s67, s12, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s68, s13, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s69, s14, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s70, s15, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 
v5, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s50 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s49 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s47 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s46 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s11 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s44 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s42 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 
v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s59 
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -3429,7 +3428,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 @@ -3455,25 +3454,25 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s36, s6, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s37, s6, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s42, s8, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s43, s9, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s44, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s45, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s46, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s47, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s48, s11, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s49, s12, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s50, s12, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s51, s13, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s52, s13, 0x80008 -; 
GFX7-HSA-NEXT: s_lshr_b32 s53, s14, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s54, s14, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s55, s15, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s56, s15, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s43, s8, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s9, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s46, s9, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s47, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s48, s10, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s49, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s50, s11, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s51, s12, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s52, s12, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s53, s13, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s54, s13, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s55, s14, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s56, s14, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s57, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s58, s15, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s24, s0, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s27, s1, 0xff @@ -3482,18 +3481,18 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s37, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s36, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s57, s6, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s58, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s59, s7, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s60, s7, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s61, s8, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s62, s9, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s42, s6, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s45, s7, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s59, s8, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s60, s8, 
0x80010 +; GFX7-HSA-NEXT: s_and_b32 s61, s9, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s62, s9, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s63, s10, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s64, s11, 0xff @@ -3506,97 +3505,97 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s53 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s53 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s55 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 @@ -3610,7 +3609,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o 
; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 @@ -4235,16 +4234,17 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i32 s54, s10, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s55, s10, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX6-NOHSA-NEXT: s_ashr_i32 s56, s11, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s58, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX6-NOHSA-NEXT: s_bfe_i32 s59, s12, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s61, s12 +; GFX6-NOHSA-NEXT: s_bfe_i32 s56, s11, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s58, s11 +; GFX6-NOHSA-NEXT: s_ashr_i32 s59, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s61, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s12, s12 ; GFX6-NOHSA-NEXT: s_ashr_i32 s62, s13, 24 ; GFX6-NOHSA-NEXT: s_bfe_i32 s63, s13, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s64, s13, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX6-NOHSA-NEXT: s_ashr_i32 s65, s14, 24 ; GFX6-NOHSA-NEXT: s_bfe_i32 s66, s14, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s67, s14, 0x80008 @@ -4253,8 +4253,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i32 s69, s15, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s70, s15, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s11, 24 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 
@@ -4271,25 +4270,24 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s64 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s63 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s56 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s12 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s56 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s54 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9 @@ 
-4380,7 +4378,8 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s40, s7, 24 +; GFX7-HSA-NEXT: s_sext_i32_i8 s40, s6 +; GFX7-HSA-NEXT: s_ashr_i32 s6, s7, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24 @@ -4411,104 +4410,103 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 +; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s9 -; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s9 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v11, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] +; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 -; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s49 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 @@ -6819,80 +6817,82 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA: ; %bb.0: ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s24, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[40:41], s[6:7], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s11, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s11, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s4, s11 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s10, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s10, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s10, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s9, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s9, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s9 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s8, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s8, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s8, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[8:9], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000 +; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15 ; GFX6-NOHSA-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: @@ -6901,26 +6901,30 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s14, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s26, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s12, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s24, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[34:35], s[4:5], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[6:7], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[4:5], s[6:7], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; 
GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -6929,31 +6933,27 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s11 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6961,15 +6961,15 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 @@ -6977,10 +6977,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -7390,143 +7390,144 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s15, s7, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s19, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s20, s11, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s21, s10, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s22, s9, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s23, s8, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s24, s7, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s25, s6, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s26, s5, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 s3, s4, 0xff -; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff -; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff -; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff -; GFX7-HSA-NEXT: s_and_b32 s30, s8, 0xff -; GFX7-HSA-NEXT: s_and_b32 s31, s9, 0xff -; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff -; GFX7-HSA-NEXT: s_and_b32 s34, s11, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s35, s4, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s36, s5, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s12, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s21, s13, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s14, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s23, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s24, s15, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s25, s14, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s26, s13, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s27, s12, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s28, s11, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s29, s10, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s4, s9, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s2, s8, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s3, s8, 0xff +; 
GFX7-HSA-NEXT: s_and_b32 s5, s9, 0xff +; GFX7-HSA-NEXT: s_and_b32 s30, s10, 0xff +; GFX7-HSA-NEXT: s_and_b32 s31, s11, 0xff +; GFX7-HSA-NEXT: s_and_b32 s33, s12, 0xff +; GFX7-HSA-NEXT: s_and_b32 s34, s13, 0xff +; GFX7-HSA-NEXT: s_and_b32 s35, s14, 0xff +; GFX7-HSA-NEXT: s_and_b32 s36, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s4, s11, 0x80010 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s12, s12, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v0, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 32 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -7976,74 +7977,85 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s44, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s40, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s62, s1 -; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s64, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s66, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[50:51], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s38, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s6, 8 +; GFX6-NOHSA-NEXT: 
s_lshr_b32 s12, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s42, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s34, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 16 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s52, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[38:39], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s71 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s71 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s66 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s67 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s65 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s63 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[62:63], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s6 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; 
GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[52:53], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 @@ -8052,81 +8064,65 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s54 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v15, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s51 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v8, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -8136,33 +8132,33 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 
0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s50, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s52, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s5, 8 ; GFX7-HSA-NEXT: s_mov_b32 s62, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s4, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 ; GFX7-HSA-NEXT: s_mov_b32 s34, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s1, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 ; GFX7-HSA-NEXT: s_mov_b32 s16, s1 ; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[20:21], s[2:3], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[18:19], s[2:3], 56 +; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i64 s[44:45], s[4:5], 56 ; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 @@ -8177,118 +8173,118 @@ define amdgpu_kernel void 
@constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s48 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s49 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 -; GFX7-HSA-NEXT: s_addc_u32 s47, 
s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s42 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s43 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s50 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s51 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s51 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s44 +; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s43 -; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GFX7-HSA-NEXT: s_add_u32 s44, s8, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 -; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: s_addc_u32 s45, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43 +; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s54 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v9, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s42 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s29 +; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s29 +; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 +; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s45 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s18 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x70 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s19 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s31 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 @@ -8308,8 +8304,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 64f1f45bf734c..4217384cdd5ce 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -8733,4 +8733,4 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ret void ; } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll 
b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 8f6a1f8c01ec3..5ce8a2b5f862e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -4645,4 +4645,4 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index 018da7f81e3d4..9f264de531950 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -139,16 +139,16 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = 
nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] @@ -248,14 +248,14 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -356,15 +356,15 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -464,27 +464,27 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, 
implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -600,29 +600,29 @@ body: | ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode + ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_25]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -722,6 +722,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = 
nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0 @@ -742,8 +743,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -759,8 +758,8 @@ body: | ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) @@ -773,7 +772,8 @@ body: | ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: S_BRANCH %bb.1 ; 
GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: @@ -1114,14 +1114,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -1194,12 +1186,19 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 
= S_MOV_B32 71 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 @@ -1216,13 +1215,14 @@ body: | ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit 
$exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc @@ -1643,10 +1643,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -1719,6 +1715,10 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 @@ -2049,10 +2049,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 
1 ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 @@ -2125,9 +2121,13 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 @@ -2801,6 +2801,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept 
V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0 @@ -2822,7 +2823,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -2988,7 +2988,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} @@ -3004,9 +3003,10 @@ body: | ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: 
S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) @@ -4974,20 +4974,20 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: undef 
[[V_CVT_I32_F64_e32_16:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -4998,9 +4998,9 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], 
implicit [[V_CVT_I32_F64_e32_11]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_15]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]] - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_19]], implicit [[V_CVT_I32_F64_e32_20]] + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]] ; GFX908-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 @@ -5192,13 +5192,13 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -5297,7 +5297,6 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, 
implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -5305,6 +5304,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -5726,17 +5726,17 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; 
GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: DBG_VALUE [[V_CVT_I32_F64_e32_23]], 0, 0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; GFX908-NEXT: DBG_VALUE %23, 0, 0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] @@ -5836,17 +5836,17 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def %22, 327689 /* reguse:SReg_1_with_sub0 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] + ; 
GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 851c9bb02a345..127656f7aa626 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -589,6 +589,6 @@ declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2 -attributes #0 = { minsize } -attributes #1 = { optsize } +attributes #0 = { minsize "amdgpu-flat-work-group-size"="1024,1024" } +attributes #1 = { optsize "amdgpu-flat-work-group-size"="1024,1024" } attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir index f7e295a91c828..4b0226a0f6586 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir @@ -263,10 +263,10 @@ body: | # GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, implicit $exec # GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, implicit $exec # GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 
0, implicit $exec -# GCN-NEXT: KILL %0{{$}} # GCN-NEXT: dead %9:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 128, 0, implicit $exec # GCN-NEXT: dead %10:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 144, 0, implicit $exec # GCN-NEXT: KILL %1{{$}} +# GCN-NEXT: KILL %0{{$}} --- name: reg_pressure diff --git a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll index 239fa80ade98a..04f2e3235d44a 100644 --- a/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll +++ b/llvm/test/CodeGen/AMDGPU/min-waves-per-eu-not-respected.ll @@ -12,5 +12,5 @@ define amdgpu_kernel void @impossible_occupancy() #1 { ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="9" } +attributes #0 = { "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-waves-per-eu"="9" } attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="11" } diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index bb7a591c91465..01eb1b1a353d1 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2994,71 +2994,70 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v11, 0 +; VI-NEXT: v_mov_b32_e32 v10, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_lo_u32 v10, v4, v3 -; VI-NEXT: v_mad_u64_u32 
v[12:13], s[0:1], v4, v2, 0 -; VI-NEXT: v_mul_lo_u32 v14, v5, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 -; VI-NEXT: v_mov_b32_e32 v10, v3 -; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v14 -; VI-NEXT: v_mov_b32_e32 v10, v4 -; VI-NEXT: v_mov_b32_e32 v4, v11 -; VI-NEXT: v_mul_lo_u32 v7, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13] -; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4] -; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v13 -; VI-NEXT: v_mov_b32_e32 v0, v4 -; VI-NEXT: v_mul_lo_u32 v11, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v0 -; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] -; VI-NEXT: v_add_u32_e32 v5, vcc, v11, v13 -; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc -; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; VI-NEXT: v_mul_lo_u32 v3, v4, v3 +; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; VI-NEXT: v_mul_lo_u32 v2, v5, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 +; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v5, v[2:3] +; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v6, v0, v[14:15] +; VI-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; VI-NEXT: v_addc_u32_e64 v4, s[0:1], 0, 0, vcc +; VI-NEXT: v_mul_lo_u32 v0, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v5, v[3:4] +; VI-NEXT: v_mul_lo_u32 v1, v6, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v10 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, v3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, v4, v0, vcc +; VI-NEXT: v_mov_b32_e32 v9, v2 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: 
; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v16, v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12] -; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 -; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] -; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v13, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_add3_u32 v9, v9, v13, v10 +; GFX9-NEXT: v_mul_lo_u32 v13, v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v0, v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], 
s[0:1], v0, v5, v[3:4] +; GFX9-NEXT: v_mul_lo_u32 v0, v7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[10:11] +; GFX9-NEXT: v_add3_u32 v0, v0, v9, v13 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc +; GFX9-NEXT: global_store_dwordx4 v12, v[2:5], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 1e9994dd8e6ef..299bbdac60091 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -73,22 +73,22 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: .LBB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_add_u32_e32 v18, v9, v0 ; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v19, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v20, v3, v16 -; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13 +; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v19, v3, v16 +; GFX9-NEXT: v_add_u32_e32 v3, v9, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18 +; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4] ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] +; GFX9-NEXT: 
v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] ; GFX9-NEXT: global_load_dword v3, v[18:19], off diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 37bf8516403bf..312dfa3717c77 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -1616,24 +1616,24 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 ; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 ; GFX9-NEXT: v_or_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13| ; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v14, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc ; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 -; GFX9-NEXT: v_add_u32_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa 
v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v1, off ; GFX9-NEXT: global_store_dword v[7:8], v0, off @@ -1952,71 +1952,71 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x2070306 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v14 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10 +; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 +; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 +; GFX9-NEXT: v_trunc_f32_e32 v18, v18 +; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| +; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v3 +; GFX9-NEXT: v_mul_f32_e32 v14, v16, 
v19 +; GFX9-NEXT: v_trunc_f32_e32 v14, v14 +; GFX9-NEXT: v_mad_f32 v19, -v14, v10, v16 +; GFX9-NEXT: v_mul_f32_e32 v13, v10, v13 +; GFX9-NEXT: v_trunc_f32_e32 v13, v13 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, |v10| +; GFX9-NEXT: v_mad_f32 v10, -v13, v3, v10 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v16 -; GFX9-NEXT: v_mul_f32_e32 v20, v10, v20 -; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX9-NEXT: v_mul_f32_e32 v21, v13, v21 -; GFX9-NEXT: v_trunc_f32_e32 v20, v20 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v10|, |v3| +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v16 ; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX9-NEXT: v_mul_f32_e32 v22, v16, v22 -; GFX9-NEXT: v_mul_f32_e32 v23, v19, v23 -; GFX9-NEXT: v_trunc_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v24, -v20, v3, v10 +; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX9-NEXT: v_mul_f32_e32 v3, v19, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_trunc_f32_e32 v22, v22 -; GFX9-NEXT: v_trunc_f32_e32 v23, v23 -; GFX9-NEXT: v_mad_f32 v13, -v21, v14, v13 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v3| -; GFX9-NEXT: v_xor_b32_sdwa v18, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v13, v13 +; GFX9-NEXT: v_cvt_i32_f32_e32 v18, 
v18 +; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14 +; GFX9-NEXT: v_mad_f32 v19, -v3, v16, v19 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15 ; GFX9-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_cvt_i32_f32_e32 v20, v20 -; GFX9-NEXT: v_cvt_i32_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v25, -v22, v10, v16 -; GFX9-NEXT: v_cvt_i32_f32_e32 v22, v22 -; GFX9-NEXT: v_mad_f32 v19, -v23, v16, v19 -; GFX9-NEXT: v_cvt_i32_f32_e32 v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| -; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10 ; GFX9-NEXT: v_or_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v12, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v10| -; GFX9-NEXT: v_or_b32_e32 v18, 1, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v15, vcc +; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16| -; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v18, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v20, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v21, v3 -; GFX9-NEXT: v_add_u32_e32 v10, v22, v10 -; GFX9-NEXT: v_add_u32_e32 v12, v23, v12 -; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v13, v2 +; GFX9-NEXT: v_add_u32_e32 v12, v18, v12 +; GFX9-NEXT: v_add_u32_e32 v13, v14, v15 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v10 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v11 -; GFX9-NEXT: v_mul_lo_u32 v4, v10, v0 -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v17 +; GFX9-NEXT: v_mul_lo_u32 v4, v12, v11 +; GFX9-NEXT: 
v_mul_lo_u32 v10, v13, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v17 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_e32 v3, v17, v4 -; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_e32 v4, v17, v10 +; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: global_store_dword v[7:8], v1, off @@ -2503,39 +2503,39 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16 ; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 -; GFX9-NEXT: v_mad_f32 v20, -v16, v3, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2 +; GFX9-NEXT: v_mad_f32 v2, -v16, v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2 ; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 -; GFX9-NEXT: v_mad_f32 v21, -v17, v11, v3 +; GFX9-NEXT: v_mad_f32 v19, -v17, v11, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v15, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 ; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 ; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v16, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v21|, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v17, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v16, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v17, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v18, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v18, vcc ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v10 -; GFX9-NEXT: v_mul_lo_u32 v0, v11, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v13, v12 -; GFX9-NEXT: v_sub_u32_e32 v2, v10, v2 -; GFX9-NEXT: v_sub_u32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u32 v4, v15, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v10 +; GFX9-NEXT: v_mul_lo_u32 v0, v3, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v11, v12 +; GFX9-NEXT: v_sub_u32_e32 v4, v10, v4 +; GFX9-NEXT: v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0 -; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: global_store_dword v[7:8], v1, off diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index f496a4b06bb23..81925de8910f8 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -5,8 +5,8 @@ # is killed by that store. # GCN-LABEL: name: global_sextload_v32i32_to_v32i64 -# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) -# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 +# GCN: renamable $vgpr34_vgpr35_vgpr36_vgpr37 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) +# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr26_vgpr27_vgpr28_vgpr29, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 # GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64 # GCN-GCNTRACKER-NOT: SI_SPILL diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a2a0107a6f7d8..a1197aeace86f 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -361,96 +361,96 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 -; GFX8-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 +; GFX8-NEXT: v_and_b32_e32 v12, 
0xfe000000, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x5000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v4 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v4 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v4 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v4 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v4 -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v4 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] -; GFX8-NEXT: flat_load_dwordx2 v[17:18], 
v[17:18] -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v4 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v4 -; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22] -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v4 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24] -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v4 -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v5, vcc -; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26] -; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[4:5] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffd800, v2 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe000, v2 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[19:20] ; GFX8-NEXT: s_addk_i32 s1, 0x2000 ; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(10) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GFX8-NEXT: 
s_waitcnt vmcnt(9) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v23, vcc, v13, v10 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, v14, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xfffff000, v2 +; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[21:22] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v15, v23 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v16, v24, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xfffff800, v2 +; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v17, v21 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v18, v22, vcc +; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v21 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v22, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v16, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v18, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; GFX8-NEXT: s_waitcnt 
vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v19, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v20, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v19, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v20, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v21, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v22, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v23, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v24, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v25, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v26, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v27, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v28, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v17, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v18, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -462,9 +462,9 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -496,91 +496,92 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX900-NEXT: s_movk_i32 s0, 0x5000 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: s_movk_i32 s2, 0x7f -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: s_movk_i32 s0, 0xd000 -; GFX900-NEXT: s_movk_i32 s1, 0xe000 -; GFX900-NEXT: s_movk_i32 s3, 0xf000 +; GFX900-NEXT: s_movk_i32 s4, 0x7f +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: s_movk_i32 s2, 0xd000 +; GFX900-NEXT: s_movk_i32 s3, 0xe000 +; GFX900-NEXT: s_movk_i32 s5, 0xf000 ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: s_mov_b32 s6, 0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[9:10], v[4:5], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[11:12], v[4:5], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v4 +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[9:10], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[11:12], v[2:3], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v2 ; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v5, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s0, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v2 
+; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, s1, v4 -; GFX900-NEXT: global_load_dwordx2 v[13:14], v[13:14], off -; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[23:24], v[19:20], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[25:26], v[19:20], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[27:28], v[19:20], off -; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, s3, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[19:20], v[21:22], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[29:30], v[4:5], off -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX900-NEXT: s_addk_i32 s4, 0x2000 -; GFX900-NEXT: s_cmp_gt_u32 s4, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(8) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(7) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v17, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v18, v3, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc +; GFX900-NEXT: s_addk_i32 s6, 0x2000 +; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21 +; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v5, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s5, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; 
GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 +; GFX900-NEXT: global_load_dwordx2 v[13:14], v[2:3], off +; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc +; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v14, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v15, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v16, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19 +; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v23, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v24, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v25, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v26, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc ; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v27, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v28, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v19, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v20, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v10, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v11, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v8, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 +; GFX900-NEXT: 
v_addc_co_u32_e32 v5, vcc, v10, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v11, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v29, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v30, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v5, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_add_i32 s4, s2, -1 -; GFX900-NEXT: s_cmp_eq_u32 s2, 0 +; GFX900-NEXT: s_add_i32 s0, s4, -1 +; GFX900-NEXT: s_cmp_eq_u32 s4, 0 ; GFX900-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_mov_b32 s2, s4 +; GFX900-NEXT: s_mov_b32 s4, s0 ; GFX900-NEXT: s_branch .LBB1_1 ; GFX900-NEXT: .LBB1_5: ; %while.end ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v6 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX900-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX900-NEXT: s_endpgm ; ; GFX10-LABEL: clmem_read: diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 6583d5e8aa5a0..704947523f677 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -70,22 +70,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_or_b32_e32 v13, v7, v9 +; GFX9-NEXT: v_or_b32_e32 v12, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX9-NEXT: 
v_xor_b32_e32 v11, 0x7f, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6 -; GFX9-NEXT: v_or_b32_e32 v12, v10, v8 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v8 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5] ; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] @@ -107,47 +107,47 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v13, v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 +; GFX9-NEXT: 
v_sub_u32_e32 v12, 64, v24 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v12, v8, v12 ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v13, v9, v13 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v10, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v12, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v22, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v18, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, -1, v5, vcc ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -155,20 +155,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v11 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] ; GFX9-NEXT: v_or_b32_e32 v14, v14, v33 -; GFX9-NEXT: v_or3_b32 v6, v6, v8, 
v10 +; GFX9-NEXT: v_or3_b32 v6, v6, v8, v12 ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14 ; GFX9-NEXT: v_or_b32_e32 v16, v16, v32 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc -; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_or_b32_e32 v12, v18, v12 +; GFX9-NEXT: v_or_b32_e32 v10, v18, v10 ; GFX9-NEXT: v_and_b32_e32 v18, v8, v23 -; GFX9-NEXT: v_or_b32_e32 v13, v19, v13 +; GFX9-NEXT: v_or_b32_e32 v11, v19, v11 ; GFX9-NEXT: v_and_b32_e32 v19, v8, v22 ; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18 ; GFX9-NEXT: v_and_b32_e32 v32, v8, v4 @@ -185,7 +185,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX9-NEXT: v_mov_b32_e32 v19, v9 -; GFX9-NEXT: v_or3_b32 v7, v7, 0, v11 +; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v18, v8 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -194,12 +194,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v13 -; GFX9-NEXT: v_or3_b32 v11, v7, 0, v11 -; GFX9-NEXT: v_or3_b32 v12, v6, v12, v10 -; GFX9-NEXT: v_or_b32_e32 v10, v9, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 31, v11 +; GFX9-NEXT: v_or3_b32 v10, v7, 0, v13 +; GFX9-NEXT: v_or3_b32 v12, v6, v11, v12 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v15 ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] @@ -209,19 +209,19 @@ define i128 @v_srem_i128_vv(i128 %lhs, 
i128 %rhs) { ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_mul_lo_u32 v9, v10, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, v11, v23 +; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, v10, v23 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 ; GFX9-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v10, v[13:14] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v23, v11, v[13:14] ; GFX9-NEXT: v_add3_u32 v8, v8, v16, v9 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] ; GFX9-NEXT: v_mov_b32_e32 v8, v14 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mul_lo_u32 v12, v12, v22 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v10, v[8:9] -; GFX9-NEXT: v_add3_u32 v4, v11, v7, v12 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v22, v11, v[8:9] +; GFX9-NEXT: v_add3_u32 v4, v10, v7, v12 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v7, v13 @@ -1628,38 +1628,38 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[10:11] ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] -; GFX9-NEXT: v_or_b32_e32 v10, v20, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v20, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v17 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v11, v21, v31 +; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v21 ; GFX9-NEXT: v_or_b32_e32 v18, v18, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v16, 
v16, v20 ; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v16 ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v17, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v18, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v19, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v20 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v4 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v20 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v20, vcc -; GFX9-NEXT: v_and_b32_e32 v20, v30, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v20, vcc +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 +; GFX9-NEXT: v_and_b32_e32 v12, v30, v6 ; GFX9-NEXT: v_and_b32_e32 v20, v30, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v12, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc -; GFX9-NEXT: v_or_b32_e32 v11, v21, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] -; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 ; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll index a433509511584..dc5e442c2b262 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -8,7 +8,7 @@ ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0 -define amdgpu_kernel void @test_remat_sgpr(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) { +define amdgpu_kernel void @test_remat_sgpr(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { bb: %i = tail call i32 
@llvm.amdgcn.workitem.id.x() br label %bb3 @@ -43,3 +43,5 @@ bb3: ; preds = %bb3, %bb declare double @llvm.fma.f64(double, double, double) declare i32 @llvm.amdgcn.workitem.id.x() + +attributes #0 = { "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll index 8bbae59f468f1..cbd1714a5e375 100644 --- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll @@ -127,7 +127,7 @@ define void @test_func() !dbg !6 { ; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0 ; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False -; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8 +; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 10 ; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0 @@ -146,7 +146,7 @@ define void @empty_func() !dbg !8 { ; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr ; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0 ; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True -; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0)) +; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0)) ; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 
0 ; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0 @@ -164,7 +164,7 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 { ; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr ; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144 ; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True -; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0)) +; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0)) ; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0 ; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 8f4a4b5afcdc1..554e3640221b9 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -1675,7 +1675,7 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1716,23 +1716,22 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: 
v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20 -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 -; SI-GISEL-NEXT: s_nop 0 ; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1978,7 +1977,7 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; 
SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0xbff00000 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -2019,23 +2018,22 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20 -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], -1.0, v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, 
v18 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 -; SI-GISEL-NEXT: s_nop 0 ; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2245,8 +2243,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -2254,60 +2252,60 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 -; SI-GISEL-NEXT: 
v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], 
v[12:13] -; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] -; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: 
v_mul_f64 v[12:13], v[16:17], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v17 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2520,8 +2518,8 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5 ; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 -; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 +; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5 @@ -2529,61 +2527,61 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8 ; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc -; SI-GISEL-NEXT: 
v_rsq_f64_e32 v[6:7], v[2:3] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] +; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: 
v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] -; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] 
-; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] -; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0x3ff00000 -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v8 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v6 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 6d79837feb128..6796391aba675 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -42,7 +42,7 @@ body: | ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), 
addrspace 3) ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %15, 851978 /* regdef:VGPR_16 */, def %16 ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec @@ -50,8 +50,8 @@ body: | ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %21, 851978 /* regdef:VGPR_16 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index 268322bd074bf..648f4fc64f9d0 100644 --- 
a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -44,9 +44,9 @@ entry: ; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64: ; TONGA: NumSgprs: 96 ; TONGA-GCNTRACKERS: NumSgprs: 96 -; TONGA: NumVgprs: 33 -; TONGA-GCNTRACKERS: NumVgprs: 25 -; TONGA: Occupancy: 7 +; TONGA: NumVgprs: 21 +; TONGA-GCNTRACKERS: NumVgprs: 23 +; TONGA: Occupancy: 8 ; TONGA-GCNTRACKERS: Occupancy: 8 @@ -59,11 +59,11 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: ; GENERIC: NumSgprs: 71 -; GENERIC-GCNTRACKERS: NumSgprs: 54 -; GENERIC: NumVgprs: 16 -; GENERIC-GCNTRACKERS: NumVgprs: 16 +; GENERIC-GCNTRACKERS: NumSgprs: 45 +; GENERIC: NumVgprs: 20 +; GENERIC-GCNTRACKERS: NumVgprs: 20 ; GENERIC: Occupancy: 7 -; GENERIC-GCNTRACKERS: Occupancy: 8 +; GENERIC-GCNTRACKERS: Occupancy: 10 define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { %load = load <64 x i16>, ptr addrspace(4) %in diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index 9429d1565962e..e67036f0bbbea 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -16,20 +16,20 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub3:vreg_128 = COPY $vgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub2:vreg_128 = COPY $vgpr8 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_128 = COPY $vgpr7 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 - ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 - ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 - ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 + ; CHECK-NEXT: 
[[COPY3:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 + ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3 ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub0:vreg_128 = COPY $vgpr2 ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY6]].sub2, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub3, implicit $exec ; CHECK-NEXT: S_BARRIER - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub2, implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub3, implicit $exec ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY3]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY2]].sub1, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY1]].sub2, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY]].sub3, implicit $exec @@ -37,7 +37,7 @@ body: | ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit 
$exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY6]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 undef %43.sub3:vreg_128 = COPY $vgpr9 undef %42.sub2:vreg_128 = COPY $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll index bd1258cb1cf98..1e5d6755fbc85 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll @@ -42,4 +42,4 @@ bb2: declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-num-vgpr"="9" } +attributes #1 = { "amdgpu-num-vgpr"="9" "amdgpu-flat-work-group-size"="1024,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll index 71f8d91874f04..5a30d5d5e42ec 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll @@ -7,16 +7,16 @@ ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target ; CHECK-LABEL: {{^}}load_fma_store: -; OCC: NumVgprs: 32 -; OCC-GCNTRACKER: NumVgprs: 24 +; OCC: NumVgprs: 24 +; OCC-GCNTRACKER: NumVgprs: 26 ; RELAX: NumVgprs: 64 ; RELAX-GCNTRACKER: NumVgprs: 60 -; OCC: NumVGPRsForWavesPerEU: 32 -; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24 +; OCC: NumVGPRsForWavesPerEU: 24 +; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 26 ; RELAX: NumVGPRsForWavesPerEU: 64 ; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60 -; OCC: 
Occupancy: 8 -; OCC-GCNTRACKER: Occupancy: 8 +; OCC: Occupancy: 10 +; OCC-GCNTRACKER: Occupancy: 9 ; RELAX: Occupancy: 4 ; RELAX-GCNTRACKER: Occupancy: 4 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 6225ff73e28d0..57c54c4de7102 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -792,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: s_mov_b32 s6, s10 -; GCN-NEXT: s_mov_b32 s7, s11 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s2 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NEXT: s_mov_b32 s8, s0 -; GCN-NEXT: s_mov_b32 s9, s1 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 +; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 +; GCN-NEXT: v_max_i32_e32 v5, v5, v12 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v5 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 ; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 -; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 -; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 +; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_max_i32_e32 v4, v4, v10 -; GCN-NEXT: v_max_i32_e32 v5, v5, v13 -; 
GCN-NEXT: v_max_i32_e32 v6, v6, v16 -; GCN-NEXT: v_max_i32_e32 v1, v1, v12 -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12 +; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v5 -; GCN-NEXT: v_cvt_f32_u32_e32 v16, v6 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GCN-NEXT: v_max_i32_e32 v1, v1, v13 +; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 -; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 +; GCN-NEXT: v_max_i32_e32 v6, v6, v15 +; GCN-NEXT: v_mul_hi_u32 v16, v10, v16 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 -; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v7 +; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v12 +; GCN-NEXT: v_mul_hi_u32 v10, v1, v10 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GCN-NEXT: v_mul_hi_u32 v13, v12, v16 ; GCN-NEXT: v_max_i32_e32 v0, v0, v9 -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v11 -; GCN-NEXT: v_max_i32_e32 v2, v2, v15 -; GCN-NEXT: v_max_i32_e32 v11, v7, v17 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 -; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 -; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 -; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 -; GCN-NEXT: v_cvt_f32_u32_e32 v18, v11 -; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 -; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 -; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 -; GCN-NEXT: 
v_rcp_iflag_f32_e32 v18, v18 +; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GCN-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, v16, v17 +; GCN-NEXT: v_mul_lo_u32 v13, v10, v5 ; GCN-NEXT: v_mul_hi_u32 v12, v0, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v1, v13 -; GCN-NEXT: v_mul_hi_u32 v14, v2, v14 -; GCN-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 -; GCN-NEXT: v_mul_lo_u32 v15, v12, v4 -; GCN-NEXT: v_mul_lo_u32 v17, v13, v5 -; GCN-NEXT: v_mul_lo_u32 v21, v14, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v12 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 1, v13 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v14 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v11 -; GCN-NEXT: v_sub_i32_e32 v17, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v16, vcc, v1, v5 -; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3] -; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] -; GCN-NEXT: v_mul_lo_u32 v19, v19, v18 -; GCN-NEXT: v_sub_i32_e32 v20, vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1] -; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3] -; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 1, v14 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v13 +; GCN-NEXT: v_add_i32_e32 
v13, vcc, 1, v10 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v13, vcc, v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; GCN-NEXT: v_mul_lo_u32 v1, v12, v4 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 v5, v5, v9 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] +; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7 +; GCN-NEXT: v_mul_hi_u32 v4, v9, v5 +; GCN-NEXT: v_max_i32_e32 v5, v7, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 +; GCN-NEXT: v_max_i32_e32 v2, v2, v9 +; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3] ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 -; GCN-NEXT: v_mul_hi_u32 v4, v18, v19 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5] ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GCN-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 -; GCN-NEXT: v_max_i32_e32 v5, v3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v5, v4 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v10 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GCN-NEXT: v_mul_lo_u32 v6, v4, v11 +; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GCN-NEXT: v_mul_hi_u32 v4, v9, v10 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; GCN-NEXT: v_max_i32_e32 v6, v3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 +; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v14 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; GCN-NEXT: v_mul_lo_u32 v8, v4, v5 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, v5, v11 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: 
s_endpgm ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s11, 0xf000 -; TONGA-NEXT: s_mov_b32 s10, -1 -; TONGA-NEXT: s_mov_b32 s6, s10 -; TONGA-NEXT: s_mov_b32 s7, s11 +; TONGA-NEXT: s_mov_b32 s7, 0xf000 +; TONGA-NEXT: s_mov_b32 s6, -1 +; TONGA-NEXT: s_mov_b32 s10, s6 +; TONGA-NEXT: s_mov_b32 s11, s7 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s4, s2 -; TONGA-NEXT: s_mov_b32 s5, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; TONGA-NEXT: s_mov_b32 s8, s0 -; TONGA-NEXT: s_mov_b32 s9, s1 +; TONGA-NEXT: s_mov_b32 s8, s2 +; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; TONGA-NEXT: s_mov_b32 s4, s0 +; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 ; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 +; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 +; TONGA-NEXT: v_max_i32_e32 v5, v5, v12 +; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v5 ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v6 ; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 -; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 -; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 -; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 -; TONGA-NEXT: v_max_i32_e32 v6, v6, v16 -; TONGA-NEXT: v_max_i32_e32 v1, v1, v12 -; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5 +; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12 +; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 ; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 -; TONGA-NEXT: v_cvt_f32_u32_e32 v14, v5 -; TONGA-NEXT: 
v_cvt_f32_u32_e32 v16, v6 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; TONGA-NEXT: v_max_i32_e32 v1, v1, v13 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v16, v16 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 +; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 +; TONGA-NEXT: v_max_i32_e32 v6, v6, v15 +; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16 ; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; TONGA-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 -; TONGA-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 -; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v14 -; TONGA-NEXT: v_cvt_u32_f32_e32 v16, v16 -; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v7 +; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 +; TONGA-NEXT: v_mul_lo_u32 v16, v16, v12 +; TONGA-NEXT: v_mul_hi_u32 v10, v1, v10 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; TONGA-NEXT: v_mul_hi_u32 v13, v12, v16 ; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 -; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v11 -; TONGA-NEXT: v_max_i32_e32 v2, v2, v15 -; TONGA-NEXT: v_max_i32_e32 v11, v7, v17 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v4 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v5 -; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v6 -; TONGA-NEXT: v_mul_lo_u32 v13, v13, v12 -; TONGA-NEXT: v_mul_lo_u32 v15, v15, v14 -; TONGA-NEXT: v_mul_lo_u32 v17, v17, v16 -; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v11 -; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13 -; TONGA-NEXT: v_mul_hi_u32 v15, v14, v15 -; TONGA-NEXT: v_mul_hi_u32 v17, v16, v17 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v18, v18 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v14, v15 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v16, v17 +; 
TONGA-NEXT: v_mul_lo_u32 v13, v10, v5 ; TONGA-NEXT: v_mul_hi_u32 v12, v0, v12 -; TONGA-NEXT: v_mul_hi_u32 v13, v1, v13 -; TONGA-NEXT: v_mul_hi_u32 v14, v2, v14 -; TONGA-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 -; TONGA-NEXT: v_mul_lo_u32 v15, v12, v4 -; TONGA-NEXT: v_mul_lo_u32 v17, v13, v5 -; TONGA-NEXT: v_mul_lo_u32 v21, v14, v6 -; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v18 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21 -; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v12 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, 1, v13 -; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v14 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v11 -; TONGA-NEXT: v_sub_u32_e32 v17, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, v1, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v13, v13, v20, s[2:3] -; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] -; TONGA-NEXT: v_mul_lo_u32 v19, v19, v18 -; TONGA-NEXT: v_sub_u32_e32 v20, vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v17, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v12 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v16, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; TONGA-NEXT: v_add_u32_e32 v17, vcc, 1, v14 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v13 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v1, v5 +; 
TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 +; TONGA-NEXT: v_mul_lo_u32 v1, v12, v4 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v5, v5, v9 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7 +; TONGA-NEXT: v_mul_hi_u32 v4, v9, v5 +; TONGA-NEXT: v_max_i32_e32 v5, v7, v0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 +; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 +; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 +; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3] ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 -; TONGA-NEXT: v_mul_hi_u32 v4, v18, v19 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5] ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 +; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8 +; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11 +; TONGA-NEXT: 
v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v14, v17, vcc -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v3 -; TONGA-NEXT: v_max_i32_e32 v5, v3, v5 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v5, v4 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v10 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 -; TONGA-NEXT: v_mul_lo_u32 v6, v4, v11 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3 +; TONGA-NEXT: v_max_i32_e32 v6, v3, v6 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4 +; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14 +; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v5, v6 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 -; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v5, v11 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 ; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], 
off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 04a824a073a7e..459ef648fd806 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -873,20 +873,20 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; NOSDWA-NEXT: v_mul_f16_e32 v1, v5, v1 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; NOSDWA-NEXT: v_mul_f16_e32 v0, v4, v0 -; NOSDWA-NEXT: v_mul_f16_e32 v4, v11, v10 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_mul_f16_e32 v10, v11, v10 ; NOSDWA-NEXT: v_mul_f16_e32 v7, v12, v7 ; NOSDWA-NEXT: v_mul_f16_e32 v6, v13, v6 -; NOSDWA-NEXT: v_mul_f16_e32 v5, v14, v5 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_mul_f16_e32 v4, v4, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v5 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v7 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v6 -; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v5 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4 ; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; NOSDWA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 572026da79646..26a4a6743cffa 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1508,52 +1508,52 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 ; SI-NEXT: v_cndmask_b32_e32 v4, 
v12, v4, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 ; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v8f16: @@ -1652,81 +1652,81 @@ define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v11, v15, v11 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v26, v15 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, 
v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_or_b32_e32 v9, v24, v25 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v5, v20, v21 -; SI-NEXT: v_or_b32_e32 v3, v18, v3 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v15 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v15, v22, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc ; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc -; SI-NEXT: v_cndmask_b32_e32 v16, v7, v14, vcc +; SI-NEXT: v_cndmask_b32_e32 v16, v3, v14, vcc ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 @@ -1772,136 +1772,132 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; SI-LABEL: v_vselect_v16f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 +; SI-NEXT: v_cndmask_b32_e32 v0, v37, v0, vcc +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 +; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[10:11] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[12:13] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 -; SI-NEXT: 
v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[14:15] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e64 v5, v16, v5, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31 -; SI-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[16:17] -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; 
SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cndmask_b32_e32 v4, v18, v4, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 +; SI-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cndmask_b32_e32 v6, v21, v6, vcc +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; SI-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_cndmask_b32_e32 v7, v22, v7, vcc +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cndmask_b32_e32 v8, v23, v8, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v23, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cndmask_b32_e32 v9, v24, v9, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 +; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v28 +; SI-NEXT: v_cndmask_b32_e32 v10, v23, v10, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v29 +; SI-NEXT: v_cndmask_b32_e32 v11, v24, v11, vcc +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 -; SI-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 +; SI-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v14, v20, v14, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cndmask_b32_e32 v13, v20, v13, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 +; SI-NEXT: v_cndmask_b32_e32 v15, v18, v15, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_vselect_v16f16: @@ -1912,25 +1908,22 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v31, s30, 0 ; VI-NEXT: v_writelane_b32 v31, s31, 1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17 ; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v24 +; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 
; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27 ; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31] ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19 +; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25 ; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29] ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11] -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25 -; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v20 +; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v20 ; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23 ; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 @@ -1939,46 +1932,49 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25] ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v22 +; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v22 ; VI-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[22:23] ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 ; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21] -; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] +; VI-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[8:9] +; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v9, 
16, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v24 +; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[10:11] +; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v26 +; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] +; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28 +; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15] +; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17] +; VI-NEXT: v_readlane_b32 s31, v31, 1 +; VI-NEXT: v_readlane_b32 s30, v31, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 ; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v26 -; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19] -; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[8:9] ; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 -; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v28 -; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v30 -; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[14:15] -; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[16:17] ; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_readlane_b32 s31, v31, 1 -; VI-NEXT: v_readlane_b32 s30, v31, 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; VI-NEXT: v_cndmask_b32_e32 v8, v13, v11, vcc ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index fc6ad39db5b89..a423b6f831a9d 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -286,18 +286,18 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_or_b32_e32 v19, v19, v17 +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 +; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5] ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: 
v_cndmask_b32_e64 v2, v16, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12 @@ -335,18 +335,18 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_or_b32_e32 v19, v19, v17 +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 +; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 @@ -384,18 +384,18 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_or_b32_e32 v19, v19, v17 +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, 
v8, v10 +; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 -; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 6b4bca11d80c7..7e7f4f5d19914 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -911,20 +911,20 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 +; 
SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 9d550ec27a63b..8150328dd24f0 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -605,20 +605,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6 +; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13 -; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11 -; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13 +; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i64: @@ -631,20 +631,20 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3] +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] +; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7] -; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5] -; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1] -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10] +; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8] +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index ce15bbcc9e189..6423267be4b34 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -6117,108 +6117,108 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v11 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v8 ; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v11, v8, vcc -; TONGA-NEXT: v_xor_b32_e32 v22, v9, v8 -; TONGA-NEXT: v_xor_b32_e32 v11, v11, v8 -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v22 -; TONGA-NEXT: 
v_cvt_f32_u32_e32 v9, v11 -; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v22 -; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v11, vcc -; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 -; TONGA-NEXT: v_rcp_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; TONGA-NEXT: v_trunc_f32_e32 v9, v9 -; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 -; TONGA-NEXT: v_cvt_u32_f32_e32 v20, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v21, v8 -; TONGA-NEXT: v_mul_lo_u32 v18, v23, v20 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v23, v21, 0 -; TONGA-NEXT: v_mul_lo_u32 v19, v24, v21 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v18 -; TONGA-NEXT: v_add_u32_e32 v25, vcc, v9, v19 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v21, v25, 0 -; TONGA-NEXT: v_mul_hi_u32 v9, v21, v8 -; TONGA-NEXT: v_add_u32_e32 v26, vcc, v9, v18 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v20, v8, 0 -; TONGA-NEXT: v_addc_u32_e32 v27, vcc, 0, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v25, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v26, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v27, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v18 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v25, vcc, v21, v8 -; TONGA-NEXT: v_addc_u32_e32 v26, vcc, v20, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v23, v25, 0 -; TONGA-NEXT: v_mul_lo_u32 v20, v23, v26 -; TONGA-NEXT: v_mul_lo_u32 v21, v24, v25 -; TONGA-NEXT: v_mul_hi_u32 v23, v25, v8 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v26, v8, 0 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v20, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v21 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v9, 0 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v26, v9, 0 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v23, v20 -; TONGA-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; TONGA-NEXT: v_xor_b32_e32 v9, v9, v8 +; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8 +; 
TONGA-NEXT: v_cvt_f32_u32_e32 v11, v9 +; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v8 +; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v9 +; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v8, vcc +; TONGA-NEXT: v_madmk_f32 v11, v18, 0x4f800000, v11 +; TONGA-NEXT: v_rcp_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; TONGA-NEXT: v_mul_f32_e32 v18, 0x2f800000, v11 +; TONGA-NEXT: v_trunc_f32_e32 v18, v18 +; TONGA-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11 +; TONGA-NEXT: v_cvt_u32_f32_e32 v22, v18 +; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_lo_u32 v20, v23, v22 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v21, v24, v11 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v20 +; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21 +; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0 +; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18 +; TONGA-NEXT: v_add_u32_e32 v25, vcc, v25, v19 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0 +; TONGA-NEXT: v_addc_u32_e32 v26, vcc, 0, v20, vcc +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v18 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v26, v19, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v20 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v22, v19, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v22, v23, v25 +; TONGA-NEXT: v_mul_lo_u32 v23, v24, v11 +; TONGA-NEXT: v_mul_hi_u32 v24, v11, v18 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v22, v19 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v23 +; TONGA-NEXT: v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v19, 0 +; TONGA-NEXT: v_add_u32_e32 v22, vcc, v24, v22 +; TONGA-NEXT: v_addc_u32_e32 v23, 
vcc, 0, v23, vcc +; TONGA-NEXT: v_add_u32_e32 v20, vcc, v22, v20 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v23, v21, vcc +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; TONGA-NEXT: v_add_u32_e32 v18, vcc, v20, v18 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v21, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v18, v8 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v8 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, v26, v9, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v20, 31, v15 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v14, v20 -; TONGA-NEXT: v_xor_b32_e32 v21, v8, v20 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v21, v19, 0 -; TONGA-NEXT: v_mul_hi_u32 v23, v21, v18 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v20, vcc -; TONGA-NEXT: v_xor_b32_e32 v15, v15, v20 -; TONGA-NEXT: v_add_u32_e32 v23, vcc, v23, v8 -; TONGA-NEXT: v_addc_u32_e32 v24, vcc, 0, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v18, 0 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v19, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v23, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v24, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v8, v18 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; TONGA-NEXT: v_mul_lo_u32 v19, v22, v8 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v22, v18, 0 -; TONGA-NEXT: v_mul_lo_u32 v18, v11, v18 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v19, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v18, v9 -; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v15, v9 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v21, v8 -; TONGA-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, vcc -; TONGA-NEXT: v_sub_u32_e64 v19, s[0:1], v8, v22 -; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v18, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v11 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v25, 
v19, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v22 +; TONGA-NEXT: v_xor_b32_e32 v23, v18, v22 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0 +; TONGA-NEXT: v_mul_hi_u32 v21, v23, v11 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v22, vcc +; TONGA-NEXT: v_xor_b32_e32 v15, v15, v22 +; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v18 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v19, vcc +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v24, v18 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v19, vcc +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v20 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; TONGA-NEXT: v_mul_lo_u32 v20, v9, v18 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v11, v8, v11 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19 +; TONGA-NEXT: v_sub_u32_e32 v19, vcc, v15, v11 +; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v23, v18 +; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, vcc +; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v18, v9 +; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v23, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v22 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v9 +; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v15, v11, vcc ; TONGA-NEXT: v_cndmask_b32_e64 v24, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v11 -; TONGA-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, s[0:1] +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v8 +; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v19, v22 -; TONGA-NEXT: 
v_subb_u32_e32 v9, vcc, v15, v9, vcc -; TONGA-NEXT: v_subbrev_u32_e64 v18, s[0:1], 0, v18, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v9, v11 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 +; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v20, v9 ; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v22 -; TONGA-NEXT: v_cndmask_b32_e64 v18, v21, v18, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v21, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v9, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v11, v15, v21, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v19, v19, v24, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v19, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v9, v9, v18, vcc -; TONGA-NEXT: v_xor_b32_e32 v8, v8, v20 -; TONGA-NEXT: v_xor_b32_e32 v9, v9, v20 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v8, v20 -; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v9, v20, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v18, v9 +; TONGA-NEXT: v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 +; TONGA-NEXT: v_cndmask_b32_e32 v8, v15, v9, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v24, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e32 v9, v18, v20, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v8, v11, v19, vcc +; TONGA-NEXT: v_xor_b32_e32 v9, v9, v22 +; TONGA-NEXT: v_xor_b32_e32 v11, v8, v22 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v22 +; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v22, vcc ; TONGA-NEXT: s_cbranch_execnz .LBB12_3 ; TONGA-NEXT: .LBB12_2: ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v10 @@ -8991,33 +8991,33 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: s_waitcnt vmcnt(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 ; TONGA-NEXT: v_lshrrev_b32_e32 v12, 30, v12 -; TONGA-NEXT: 
v_ashrrev_i32_e32 v13, 31, v3 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, v0, v12 -; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13 ; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 +; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 +; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13 +; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v2, v13 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v5 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v2, v13 -; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc ; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v4, v14 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 +; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14 ; TONGA-NEXT: v_lshrrev_b32_e32 v15, 30, v15 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v5, vcc -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v6, v15 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc +; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; TONGA-NEXT: v_add_u32_e64 v12, s[0:1], v4, v14 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v6, v15 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc ; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 +; TONGA-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v5, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 ; TONGA-NEXT: v_and_b32_e32 v13, -4, v13 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v13 -; TONGA-NEXT: v_and_b32_e32 v14, -4, v14 -; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v17, vcc -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 -; TONGA-NEXT: v_and_b32_e32 v15, -4, v15 -; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v18, vcc -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v15 -; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v19, 
vcc +; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc ; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; TONGA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 29488579c1553..a9b1f7e888567 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -266,20 +266,20 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 +; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11 -; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i64: diff --git 
a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 6ed19bd6d764b..30a0a26ca173e 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -776,14 +776,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20 +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 @@ -895,14 +895,14 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20 +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX8-NEXT: buffer_load_dword 
v16, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 7c310477dd838..530226baa775e 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -862,43 +862,43 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 -; GCN-NEXT: v_mul_lo_u32 v20, v13, v3 +; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 ; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 ; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 ; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 -; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v20 +; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v19 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 -; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12 -; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13 +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 -; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0 +; GCN-NEXT: v_sub_u32_e32 v18, vcc, v6, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] ; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_sub_u32_e32 v16, vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] -; GCN-NEXT: v_sub_u32_e32 v17, vcc, v9, v3 -; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v8, v2 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v9, v3 +; GCN-NEXT: 
v_cndmask_b32_e64 v13, v13, v16, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v18, s[0:1] +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5] -; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7] -; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[4:5] +; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12 +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm ; From 28600c025ed5a5894e84ad23d505c3192d30d9da Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Tue, 21 Jan 2025 16:12:04 +0100 Subject: [PATCH 2/6] Fix formatting issues --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 4 ++-- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index dd6ab64925e50..031d8f0560ff2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1272,8 +1272,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( - STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, - 
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); + STM.computeOccupancy(F, ProgInfo.LDSSize).second, + ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 22a550450dc2e..fe9da7b7b505f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -411,7 +411,7 @@ GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F); unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs); unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs); - + // Maximum occupancy may be further limited by high SGPR/VGPR usage. MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc)); return {std::min(MinOcc, MaxOcc), MaxOcc}; From a191fec2b680a180d638415f967940cd7aa943f9 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Tue, 21 Jan 2025 17:53:32 +0100 Subject: [PATCH 3/6] Fix failing MIR tests --- ...ine-function-info-long-branch-reg-debug.ll | 2 +- .../machine-function-info-long-branch-reg.ll | 2 +- .../AMDGPU/machine-function-info-no-ir.mir | 20 +++++++++---------- .../MIR/AMDGPU/machine-function-info.ll | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index d1d8240a1007a..883657547519b 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: BitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: 
'' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index ad6e92a25b861..278bf086d6088 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: BitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 3eff89239d541..89d831b51f694 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -153,7 +153,7 @@ body: | # FULL-NEXT: fp64-fp16-input-denormals: true # FULL-NEXT: fp64-fp16-output-denormals: true # FULL-NEXT: highBitsOf32BitAddress: 0 -# FULL-NEXT: occupancy: 8 +# FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' # FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' @@ -175,7 +175,7 @@ body: | # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } # SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } -# SIMPLE-NEXT: occupancy: 8 +# SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: no_mfi @@ -229,7 +229,7 @@ body: | # FULL-NEXT: fp64-fp16-input-denormals: true # FULL-NEXT: fp64-fp16-output-denormals: true # FULL-NEXT: highBitsOf32BitAddress: 0 -# FULL-NEXT: occupancy: 8 +# FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' # FULL-NEXT: 
sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' @@ -251,7 +251,7 @@ body: | # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } # SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } -# SIMPLE-NEXT: occupancy: 8 +# SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: empty_mfi @@ -306,7 +306,7 @@ body: | # FULL-NEXT: fp64-fp16-input-denormals: true # FULL-NEXT: fp64-fp16-output-denormals: true # FULL-NEXT: highBitsOf32BitAddress: 0 -# FULL-NEXT: occupancy: 8 +# FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' # FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' @@ -329,7 +329,7 @@ body: | # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } # SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } -# SIMPLE-NEXT: occupancy: 8 +# SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: empty_mfi_entry_func @@ -457,11 +457,11 @@ body: | ... 
--- -# ALL-LABEL: name: occupancy_0 -# ALL: occupancy: 8 -name: occupancy_0 +# ALL-LABEL: name: occupancy_10 +# ALL: occupancy: 10 +name: occupancy_10 machineFunctionInfo: - occupancy: 0 + occupancy: 10 body: | bb.0: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index eca3f99b64955..ec56de11b250a 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -167,7 +167,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 { ; CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' @@ -220,7 +220,7 @@ define void @function() { ; CHECK-NEXT: fp64-fp16-input-denormals: true ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 -; CHECK-NEXT: occupancy: 8 +; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' From 7d3f9449c362f3c48ca1c083c5fb1b9fc0b6c8cb Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Wed, 22 Jan 2025 16:02:38 +0100 Subject: [PATCH 4/6] Address feedback --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 14 +--- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 24 +++--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 11 ++- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + .../AMDGPU/agpr-copy-no-free-registers.ll | 73 +++++++++++-------- 7 files changed, 74 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 031d8f0560ff2..972994117ee23 
100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1175,22 +1175,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = Mode.DX10Clamp; - unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { - // LDS is allocated in 320 dword blocks. - LDSAlignShift = 11; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize65536)) { - // LDS is allocated in 128 dword blocks. - LDSAlignShift = 9; - } else { - // LDS is allocated in 64 dword blocks. - LDSAlignShift = 8; - } - ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); + unsigned LDSAlignShift = Log2_32_Ceil(STM.getLDSAllocGranularity()); ProgInfo.LDSSize = MFI->getLDSSize(); ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 907f82ed7fc52..26c65f1e64965 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } unsigned MaxOccupancy = - ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second; + ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F, TM).second; // Restrict local memory usage so that we don't drastically reduce occupancy, // unless it is already significantly reduced. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index da729d4dc7e08..b427174edf552 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -55,13 +55,15 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() / WorkGroupsPerCU; } -std::pair -AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, - const Function &F) const { - // FIXME: Is there an allocation granularity for the LDS? If so we would need - // to make sure the amount of bytes is aligned on that granularity. - +std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( + uint32_t LDSBytes, const Function &F, const TargetMachine &TM) const { // Compute occupancy restriction based on LDS usage. + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // For GCN subtargets, LDS size must be aligned on allocation granularity. + const GCNSubtarget &ST = TM.getSubtarget(F); + LDSBytes = alignTo(LDSBytes, ST.getLDSAllocGranularity()); + } + const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u); // Queried LDS size may be larger than available on a CU, in which case we @@ -72,9 +74,8 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, return {1, 1}; const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU(); - const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU(); - auto PropsFromWGSize = [&](unsigned WGSize) + auto PropsFromWGSize = [=](unsigned WGSize) -> std::tuple { unsigned WavesPerWG = divideCeil(WGSize, WaveSize); unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS); @@ -91,10 +92,12 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, // It is possible that we end up with flipped minimum and maximum number of // waves per CU when the number of minimum/maximum concurrent groups on the CU - // is limited by LDS usage or barrier ressources. 
+ // is limited by LDS usage or barrier resources. if (MinWavesPerCU >= MaxWavesPerCU) { std::swap(MinWavesPerCU, MaxWavesPerCU); } else { + const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU(); + // Look for a potential smaller group size than the maximum which decreases // the concurrent number of waves on the CU for the same number of // concurrent workgroups on the CU. @@ -140,7 +143,8 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( const MachineFunction &MF) const { const auto *MFI = MF.getInfo(); - return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction()); + return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction(), + MF.getTarget()); } std::pair diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 5944b69ce6416..78d2d1041744f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -133,7 +133,8 @@ class AMDGPUSubtarget { /// This notably depends on the range of allowed flat group sizes for the /// function and hardware characteristics. std::pair - getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const; + getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F, + const TargetMachine &TM) const; /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can /// be achieved when the only function running on a CU is \p MF. 
This notably diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index fe9da7b7b505f..737034e59686e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -408,7 +408,8 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { std::pair GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, unsigned NumSGPRs, unsigned NumVGPRs) const { - auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F); + auto [MinOcc, MaxOcc] = + getOccupancyWithWorkGroupSizes(LDSSize, F, TLInfo.getTargetMachine()); unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs); unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs); @@ -417,6 +418,14 @@ GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, return {std::min(MinOcc, MaxOcc), MaxOcc}; } +unsigned GCNSubtarget::getLDSAllocGranularity() const { + if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize163840)) + return 1280; // LDS is allocated in 320 dword blocks. + if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize65536)) + return 512; // LDS is allocated in 128 dword blocks. + return 256; // LDS is allocated in 64 dword blocks. +} + unsigned GCNSubtarget::getBaseMaxNumSGPRs( const Function &F, std::pair WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a22e413508021..542aba027ae08 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1381,6 +1381,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + /// Returns the LDS's allocation granularity in bytes. 
+ unsigned getLDSAllocGranularity() const; + /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 58bb4ef5789ec..4ce46bbaf45ac 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -365,7 +365,10 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ret void } -define void @v32_asm_def_use(float %v0, float %v1) #0 { +; FIXME: This case is broken. The asm value passed in v32 is live +; through the range where the reserved def for the copy is introduced, +; clobbering the user value. +define void @v32_asm_def_use(float %v0, float %v1) #4 { ; GFX908-LABEL: v32_asm_def_use: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -374,48 +377,57 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v[0:31] a[0:15] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v32, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def v32 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_write_b32 a31, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a13 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a13 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a12 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a11 -; GFX908-NEXT: v_accvgpr_write_b32 a29, v32 -; GFX908-NEXT: v_accvgpr_read_b32 
v32, a12 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a10 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a9 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a8 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a10 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a24, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a6 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a5 -; GFX908-NEXT: v_accvgpr_write_b32 a26, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a9 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v35 -; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 -; GFX908-NEXT: v_accvgpr_write_b32 a25, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a7 -; GFX908-NEXT: v_accvgpr_write_b32 a18, v35 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a4 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a22, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a4 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a3 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a3 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a2 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a19, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a1 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a17, v32 -; GFX908-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX908-NEXT: v_accvgpr_write_b32 
a17, v35 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a0 ; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v32 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def v32 -; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_write_b32 a16, v35 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND @@ -1133,3 +1145,4 @@ attributes #0 = { "amdgpu-waves-per-eu"="6,6" } attributes #1 = { convergent nounwind readnone willreturn } attributes #2 = { nounwind readnone willreturn } attributes #3 = { "amdgpu-waves-per-eu"="7,7" } +attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" } From 260463f33362377b21d12c0725658038f9eed8a6 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Thu, 23 Jan 2025 14:45:05 +0100 Subject: [PATCH 5/6] Revert changes related to LDS alloc. granularity --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 13 ++++++++++++- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 16 +++++----------- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +-- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 11 +---------- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 --- 6 files changed, 20 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 972994117ee23..906dd3142ff5b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1178,7 +1178,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); - unsigned LDSAlignShift = Log2_32_Ceil(STM.getLDSAllocGranularity()); + unsigned LDSAlignShift; + if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { + // LDS is allocated in 320 dword blocks. 
+ LDSAlignShift = 11; + } else if (STM.getFeatureBits().test( + FeatureAddressableLocalMemorySize65536)) { + // LDS is allocated in 128 dword blocks. + LDSAlignShift = 9; + } else { + // LDS is allocated in 64 dword blocks. + LDSAlignShift = 8; + } ProgInfo.LDSSize = MFI->getLDSSize(); ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 26c65f1e64965..907f82ed7fc52 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } unsigned MaxOccupancy = - ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F, TM).second; + ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second; // Restrict local memory usage so that we don't drastically reduce occupancy, // unless it is already significantly reduced. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index b427174edf552..d98a0ffcaf7e3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -55,15 +55,10 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() / WorkGroupsPerCU; } -std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( - uint32_t LDSBytes, const Function &F, const TargetMachine &TM) const { - // Compute occupancy restriction based on LDS usage. - if (TM.getTargetTriple().getArch() == Triple::amdgcn) { - // For GCN subtargets, LDS size must be aligned on allocation granularity. 
- const GCNSubtarget &ST = TM.getSubtarget(F); - LDSBytes = alignTo(LDSBytes, ST.getLDSAllocGranularity()); - } - +std::pair +AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, + const Function &F) const { + // FIXME: We should take into account the LDS allocation granularity. const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u); // Queried LDS size may be larger than available on a CU, in which case we @@ -143,8 +138,7 @@ std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( std::pair AMDGPUSubtarget::getOccupancyWithWorkGroupSizes( const MachineFunction &MF) const { const auto *MFI = MF.getInfo(); - return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction(), - MF.getTarget()); + return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction()); } std::pair diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 78d2d1041744f..5944b69ce6416 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -133,8 +133,7 @@ class AMDGPUSubtarget { /// This notably depends on the range of allowed flat group sizes for the /// function and hardware characteristics. std::pair - getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F, - const TargetMachine &TM) const; + getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const; /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can /// be achieved when the only function running on a CU is \p MF. 
This notably diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 737034e59686e..fe9da7b7b505f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -408,8 +408,7 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { std::pair GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, unsigned NumSGPRs, unsigned NumVGPRs) const { - auto [MinOcc, MaxOcc] = - getOccupancyWithWorkGroupSizes(LDSSize, F, TLInfo.getTargetMachine()); + auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F); unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs); unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs); @@ -418,14 +417,6 @@ GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, return {std::min(MinOcc, MaxOcc), MaxOcc}; } -unsigned GCNSubtarget::getLDSAllocGranularity() const { - if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize163840)) - return 1280; // LDS is allocated in 320 dword blocks. - if (getFeatureBits().test(AMDGPU::FeatureAddressableLocalMemorySize65536)) - return 512; // LDS is allocated in 128 dword blocks. - return 256; // LDS is allocated in 64 dword blocks. -} - unsigned GCNSubtarget::getBaseMaxNumSGPRs( const Function &F, std::pair WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 542aba027ae08..a22e413508021 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1381,9 +1381,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; - /// Returns the LDS's allocation granularity in bytes. 
- unsigned getLDSAllocGranularity() const; - /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { From 18ce96a4fb4fe6de6e6431b9c1dd94b79576ae8c Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Thu, 23 Jan 2025 16:05:18 +0100 Subject: [PATCH 6/6] Revert spurious modification --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 906dd3142ff5b..031d8f0560ff2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1175,9 +1175,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = Mode.DX10Clamp; - ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); - ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); - unsigned LDSAlignShift; if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { // LDS is allocated in 320 dword blocks. @@ -1190,6 +1187,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } + + ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); + ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); + ProgInfo.LDSSize = MFI->getLDSSize(); ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;