diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 9acfd872e574b..8e03928ed3b3c 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -433,6 +433,10 @@ LLVM_ABI void combineAAMetadata(Instruction *K, const Instruction *J);
 /// replacement for the source instruction).
 LLVM_ABI void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source);
 
+/// Copy the metadata from the source instruction to the destination (the
+/// replacement for the source instruction).
+LLVM_ABI void copyMetadataForStore(StoreInst &Dest, const StoreInst &Source);
+
 /// Patch the replacement so that it is not more restrictive than the value
 /// being replaced. It assumes that the replacement does not get moved from
 /// its original position.
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 46f29030ddb05..b35806af2dfba 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3153,6 +3153,50 @@ void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
   }
 }
 
+/// Transfer metadata from \p Source onto \p Dest, the store that replaces it.
+/// Kinds not listed in the switch below are not propagated; pointer-only kinds
+/// are propagated only when the value stored by \p Dest is a pointer.
+void llvm::copyMetadataForStore(StoreInst &Dest, const StoreInst &Source) {
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+  Source.getAllMetadata(MD);
+  // A store instruction itself has void type, so type-dependent decisions must
+  // look at the value being stored rather than at Dest.getType().
+  Type *NewType = Dest.getValueOperand()->getType();
+  for (const auto &MDPair : MD) {
+    unsigned ID = MDPair.first;
+    MDNode *N = MDPair.second;
+    switch (ID) {
+    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_tbaa:
+    case LLVMContext::MD_prof:
+    case LLVMContext::MD_tbaa_struct:
+    case LLVMContext::MD_alias_scope:
+    case LLVMContext::MD_noalias:
+    case LLVMContext::MD_nontemporal:
+    case LLVMContext::MD_access_group:
+    case LLVMContext::MD_noundef:
+    case LLVMContext::MD_noalias_addrspace:
+    case LLVMContext::MD_mem_parallel_loop_access:
+      // Copied unchanged, matching how copyMetadataForLoad treats these kinds
+      // (including the !tbaa access tag).
+      Dest.setMetadata(ID, N);
+      break;
+
+    case LLVMContext::MD_nonnull:
+      // Not propagated to the new store.
+      break;
+
+    case LLVMContext::MD_align:
+    case LLVMContext::MD_dereferenceable:
+    case LLVMContext::MD_dereferenceable_or_null:
+      // These only directly apply if the new type is also a pointer.
+      if (NewType->isPointerTy())
+        Dest.setMetadata(ID, N);
+      break;
+    }
+  }
+}
+
 void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
   auto *ReplInst = dyn_cast<Instruction>(Repl);
   if (!ReplInst)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7b5137b0185ab..40c4c6baec445 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -112,6 +112,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -268,11 +269,6 @@ class Vectorizer {
   /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
   bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
 
-  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
-  /// in the same BB with the same value for getUnderlyingObject() etc.
-  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
-                             ArrayRef<Instruction *> EqClass);
-
   /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
   /// where all instructions access a known, constant offset from the first
   /// instruction.
@@ -338,12 +334,22 @@ class Vectorizer {
   EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                 BasicBlock::iterator End);
 
+  /// Inserts a cast instruction to convert Val to DstTy.
+  Value *insertCast(Value *Val, Type *DstTy);
+
   /// Partitions Instrs into "chains" where every instruction has a known
   /// constant offset from the first instr in the chain.
   ///
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector gatherChains(ArrayRef Instrs); + + // Helpers for chain merging. + std::optional computeLeaderDelta(Instruction *I1, Instruction *I2); + bool chainsOverlapAfterRebase(const Chain &A, const Chain &B, + const APInt &Delta) const; + static void rebaseChain(Chain &C, const APInt &Delta); + void normalizeChainToType(Chain &C, Type *CastTy); }; class LoadStoreVectorizerLegacyPass : public FunctionPass { @@ -425,6 +431,20 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, return Changed ? PA : PreservedAnalyses::all(); } +static const Value *getUnderlyingObject(const Value *Ptr) { + const Value *ObjPtr = llvm::getUnderlyingObject(Ptr); + if (const auto *Sel = dyn_cast(ObjPtr)) { + // The select's themselves are distinct instructions even if they share + // the same condition and evaluate to consecutive pointers for true and + // false values of the condition. Therefore using the select's themselves + // for grouping instructions would put consecutive accesses into different + // lists and they won't be even checked for being consecutive, and won't + // be vectorized. + return Sel->getCondition(); + } + return ObjPtr; +} + bool Vectorizer::run() { bool Changed = false; // Break up the BB if there are any instrs which aren't guaranteed to transfer @@ -468,6 +488,88 @@ bool Vectorizer::run() { return Changed; } +Value *Vectorizer::insertCast(Value *Val, Type *DstTy) { + if (DL.getTypeSizeInBits(Val->getType()) == DL.getTypeSizeInBits(DstTy)) { + return Builder.CreateBitOrPointerCast(Val, DstTy, Val->getName() + ".bc"); + } + + // If the types are of different sizes and both are integers, we can use + // zext or sext to cast. 
+ if (Val->getType()->isIntegerTy() && DstTy->isIntegerTy()) { + if (DL.getTypeSizeInBits(Val->getType()) < DL.getTypeSizeInBits(DstTy)) { + return Builder.CreateZExt(Val, DstTy, Val->getName() + ".bc"); + } + return Builder.CreateTrunc(Val, DstTy, Val->getName() + ".bc"); + } + + return nullptr; +} + +std::optional Vectorizer::computeLeaderDelta(Instruction *I1, + Instruction *I2) { + assert(((isa(I1) && isa(I2)) || + (isa(I1) && isa(I2))) && + "computeLeaderDelta must be called with two load or two store " + "instructions"); + Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1; + const Value *Ptr1 = getLoadStorePointerOperand(I1); + const Value *Ptr2 = getLoadStorePointerOperand(I2); + return getConstantOffset(const_cast(Ptr1), const_cast(Ptr2), + CtxInst); +} + +bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B, + const APInt &Delta) const { + ConstantRange ARange( + A.front().OffsetFromLeader, + A.back().OffsetFromLeader + + DL.getTypeStoreSize(getLoadStoreType(A.back().Inst))); + ConstantRange BRange( + B.front().OffsetFromLeader + Delta, + B.back().OffsetFromLeader + Delta + + DL.getTypeStoreSize(getLoadStoreType(B.back().Inst))); + return !ARange.intersectWith(BRange).isEmptySet(); +} + +void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) { + for (ChainElem &E : C) + E.OffsetFromLeader += Delta; +} + +void Vectorizer::normalizeChainToType(Chain &C, Type *CastTy) { + for (ChainElem &Elem : C) { + Instruction *Inst = Elem.Inst; + Type *OrigValTy = getLoadStoreType(Inst); + if (OrigValTy == CastTy) + continue; + + if (auto *LI = dyn_cast(Inst)) { + Builder.SetInsertPoint(LI); + LoadInst *NewLoad = Builder.CreateLoad(CastTy, LI->getPointerOperand(), + LI->getName() + ".mut"); + copyMetadataForLoad(*NewLoad, *LI); + Value *CastBack = insertCast(NewLoad, OrigValTy); + if (!CastBack) + llvm_unreachable("Failed to insert cast"); + LI->replaceAllUsesWith(CastBack); + ToErase.emplace_back(LI); + Elem.Inst = NewLoad; + } else if (auto 
*SI = dyn_cast(Inst)) { + Builder.SetInsertPoint(SI); + Value *CastVal = insertCast(SI->getValueOperand(), CastTy); + if (!CastVal) + llvm_unreachable("Failed to insert cast"); + StoreInst *NewStore = + Builder.CreateStore(CastVal, SI->getPointerOperand()); + NewStore->setAlignment(SI->getAlign()); + NewStore->setVolatile(SI->isVolatile()); + copyMetadataForStore(*NewStore, *SI); + ToErase.emplace_back(SI); + Elem.Inst = NewStore; + } + } +} + bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End) { LLVM_DEBUG({ @@ -480,49 +582,120 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, }); bool Changed = false; + SmallVector ContiguousSubChains; + for (const auto &[EqClassKey, EqClass] : - collectEquivalenceClasses(Begin, End)) - Changed |= runOnEquivalenceClass(EqClassKey, EqClass); + collectEquivalenceClasses(Begin, End)) { - return Changed; -} + LLVM_DEBUG({ + dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() + << " keyed on " << EqClassKey << ":\n"; + for (Instruction *I : EqClass) + dbgs() << " " << *I << "\n"; + }); -bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey, - ArrayRef EqClass) { - bool Changed = false; + for (Chain &C : gatherChains(EqClass)) { - LLVM_DEBUG({ - dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() - << " keyed on " << EqClassKey << ":\n"; - for (Instruction *I : EqClass) - dbgs() << " " << *I << "\n"; - }); + // Split up the chain into increasingly smaller chains, until we can + // finally vectorize the chains. + // + // (Don't be scared by the depth of the loop nest here. These operations + // are all at worst O(n lg n) in the number of instructions, and splitting + // chains doesn't change the number of instrs. So the whole loop nest is + // O(n lg n).) 
+ for (auto &C : splitChainByMayAliasInstrs(C)) { + for (auto &C : splitChainByContiguity(C)) { + ContiguousSubChains.emplace_back(C); + } + } + } + } - std::vector Chains = gatherChains(EqClass); - LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size() - << " nontrivial chains.\n";); - for (Chain &C : Chains) - Changed |= runOnChain(C); - return Changed; -} + // Merge chains in reverse order, so that the first chain is the largest. + for (int I = ContiguousSubChains.size() - 1; I > 0; I--) { + Chain &C1 = ContiguousSubChains[I - 1]; + Chain &C2 = ContiguousSubChains[I]; -bool Vectorizer::runOnChain(Chain &C) { - LLVM_DEBUG({ - dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n"; - dumpChain(C); - }); + // If the scalar types of the chains are the same, we can merge them + // without inserting any casts. + if (getLoadStoreType(C1[0].Inst)->getScalarType() == + getLoadStoreType(C2[0].Inst)->getScalarType()) + continue; + + const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst); + const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst); + unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace(); + unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace(); + bool C1IsLoad = isa(C1[0].Inst); + bool C2IsLoad = isa(C2[0].Inst); + + // If the chains are mapped to different types, have distinct underlying + // pointer objects, or include both loads and stores, skip. + if (C1IsLoad != C2IsLoad || AS1 != AS2 || + ::getUnderlyingObject(C1Ptr) != ::getUnderlyingObject(C2Ptr)) + continue; + + // Compute constant offset between chain leaders; if unknown, skip. + std::optional DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst); + if (!DeltaOpt) + continue; + + // Check that rebasing C2 into C1's coordinate space will not overlap C1. + if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt)) + continue; + + // Determine the common integer cast type for normalization and ensure total + // bitwidth matches across all elements of both chains. 
+ Type *C1ElemTy = getLoadStoreType(C1[0].Inst); + unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy); + auto AllElemsMatchTotalBits = [&](const Chain &C) { + return llvm::all_of(C, [&](const ChainElem &E) { + return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits; + }); + }; + if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2)) + continue; + + // Power-of-two span ensures we can form a legal, single vector access + // without padding or splitting. Many targets and cost models assume POT + // widths, and it guarantees an integral element count for the chosen + // VecElemTy. + APInt Sz = C2.front().OffsetFromLeader + + DL.getTypeStoreSize(getLoadStoreType(C2.front().Inst)) - + C1.back().OffsetFromLeader + *DeltaOpt; + if (!Sz.isPowerOf2()) + continue; + + // Rebase C2's offsets into C1's coordinate space prior to merging and + // merge C2 into C1 by appending all elements of C2 to C1, then erase C2 + // from ContiguousSubChains. + rebaseChain(C2, *DeltaOpt); + C1.insert(C1.end(), C2.begin(), C2.end()); + ContiguousSubChains.erase(ContiguousSubChains.begin() + I); + + // Normalize the value operand/result type of each instruction in C1 to + // C1CastTy. + Type *C1CastTy = + Type::getIntNTy(C1ElemTy->getContext(), DL.getTypeSizeInBits(C1ElemTy)); + normalizeChainToType(C1, C1CastTy); + } + + for (auto &C : ContiguousSubChains) { + if (C.size() <= 1) + continue; + for (auto &AlignedSubChain : splitChainByAlignment(C)) + Changed |= vectorizeChain(AlignedSubChain); + } + + // Erase all instructions scheduled for deletion in this pseudo-BB. + for (Instruction *I : ToErase) { + auto *PtrOperand = getLoadStorePointerOperand(I); + if (I->use_empty()) + I->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); + } + ToErase.clear(); - // Split up the chain into increasingly smaller chains, until we can finally - // vectorize the chains. - // - // (Don't be scared by the depth of the loop nest here. 
These operations are - // all at worst O(n lg n) in the number of instructions, and splitting chains - // doesn't change the number of instrs. So the whole loop nest is O(n lg n).) - bool Changed = false; - for (auto &C : splitChainByMayAliasInstrs(C)) - for (auto &C : splitChainByContiguity(C)) - for (auto &C : splitChainByAlignment(C)) - Changed |= vectorizeChain(C); return Changed; } @@ -583,7 +756,7 @@ std::vector Vectorizer::splitChainByMayAliasInstrs(Chain &C) { LLVM_DEBUG( dbgs() << "LSV: Found intervening may-alias instrs; cannot merge " << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n"); - if (NewChain.size() > 1) { + if (!NewChain.empty()) { LLVM_DEBUG({ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; dumpChain(NewChain); @@ -595,7 +768,7 @@ std::vector Vectorizer::splitChainByMayAliasInstrs(Chain &C) { NewChain = SmallVector({*ChainIt}); } } - if (NewChain.size() > 1) { + if (!NewChain.empty()) { LLVM_DEBUG({ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; dumpChain(NewChain); @@ -648,8 +821,6 @@ std::vector Vectorizer::splitChainByContiguity(Chain &C) { Ret.push_back({*It}); } - // Filter out length-1 chains, these are uninteresting. 
- llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; }); return Ret; } @@ -669,7 +840,7 @@ Type *Vectorizer::getChainElemTy(const Chain &C) { if (any_of(C, [](const ChainElem &E) { return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy(); })) { - return Type::getIntNTy( + return IntegerType::getIntNTy( F.getContext(), DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType())); } @@ -1434,20 +1605,6 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, BasicBlock::iterator End) { EquivalenceClassMap Ret; - auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * { - const Value *ObjPtr = llvm::getUnderlyingObject(Ptr); - if (const auto *Sel = dyn_cast(ObjPtr)) { - // The select's themselves are distinct instructions even if they share - // the same condition and evaluate to consecutive pointers for true and - // false values of the condition. Therefore using the select's themselves - // for grouping instructions would put consecutive accesses into different - // lists and they won't be even checked for being consecutive, and won't - // be vectorized. - return Sel->getCondition(); - } - return ObjPtr; - }; - for (Instruction &I : make_range(Begin, End)) { auto *LI = dyn_cast(&I); auto *SI = dyn_cast(&I); @@ -1495,7 +1652,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; - Ret[{GetUnderlyingObject(Ptr), AS, + Ret[{::getUnderlyingObject(Ptr), AS, DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()), /*IsLoad=*/LI != nullptr}] .emplace_back(&I); @@ -1590,8 +1747,7 @@ std::vector Vectorizer::gatherChains(ArrayRef Instrs) { Ret.reserve(Chains.size()); // Iterate over MRU rather than Chains so the order is deterministic. 
for (auto &E : MRU) - if (E.second.size() > 1) - Ret.emplace_back(std::move(E.second)); + Ret.emplace_back(std::move(E.second)); return Ret; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll index fc236147f1238..7dd907e3c143f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll @@ -20,3 +20,5 @@ define void @addi32(i32 %arg1, i32 %arg2) { store i32 %res, ptr addrspace(1) poison ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll index ca605986da941..7c13a0bd97850 100644 --- a/llvm/test/CodeGen/AMDGPU/add_i1.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; GFX9-LABEL: add_var_var_i1: @@ -18,39 +18,6 @@ define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: add_var_var_i1: -; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: add_var_var_i1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_u8 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in0 %b = load volatile i1, ptr addrspace(1) %in1 %add = add i1 %a, %b @@ -72,35 +39,6 @@ define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: add_var_imm_i1: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: s_xor_b32 s2, vcc_lo, -1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: 
add_var_imm_i1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_xor_b32 s2, vcc_lo, -1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in %add = add i1 %a, 1 store i1 %add, ptr addrspace(1) %out @@ -145,88 +83,6 @@ define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: add_i1_cf: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX10-NEXT: ; implicit-def: $sgpr4 -; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX10-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execz .LBB2_2 -; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 -; GFX10-NEXT: .LBB2_2: ; %Flow -; GFX10-NEXT: s_andn2_saveexec_b32 s5, s5 -; GFX10-NEXT: s_cbranch_execz .LBB2_4 -; GFX10-NEXT: ; %bb.3: ; %if -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_andn2_b32 s2, s4, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX10-NEXT: s_or_b32 s4, s2, s3 -; GFX10-NEXT: .LBB2_4: ; %endif -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s2, s4, -1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: add_i1_cf: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s7, exec_lo -; GFX11-NEXT: ; implicit-def: $sgpr6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX11-NEXT: s_xor_b32 s7, exec_lo, s7 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 -; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v0, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 1, v0 -; GFX11-NEXT: .LBB2_2: ; %Flow -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_not1_saveexec_b32 s4, s7 -; GFX11-NEXT: s_cbranch_execz .LBB2_4 -; GFX11-NEXT: ; %bb.3: ; %if -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_and_not1_b32 s2, s6, exec_lo -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX11-NEXT: s_or_b32 s6, s2, s3 -; GFX11-NEXT: .LBB2_4: ; %endif -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_xor_b32 s2, s6, -1 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_cndmask_b32_e64 
v1, 0, 1, s2 -; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -248,3 +104,6 @@ endif: } declare i32 @llvm.amdgcn.workitem.id.x() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX10: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index ebbeab94066d6..978d72ec83fc7 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -510,53 +510,53 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 -; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18 -; GFX908-NEXT: s_mov_b32 s12, 0 -; GFX908-NEXT: s_mov_b32 s9, s12 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 +; GFX908-NEXT: s_mov_b32 s4, 0 +; GFX908-NEXT: s_mov_b32 s9, s4 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX908-NEXT: s_sub_i32 s1, 0, s7 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 +; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX908-NEXT: s_sub_i32 s3, 0, s1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2 ; GFX908-NEXT: v_mov_b32_e32 v17, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX908-NEXT: v_readfirstlane_b32 s2, v0 -; GFX908-NEXT: s_mul_i32 s1, s1, s2 -; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1 -; GFX908-NEXT: s_add_i32 s2, s2, s1 -; GFX908-NEXT: s_mul_hi_u32 s1, s6, s2 -; GFX908-NEXT: s_mul_i32 s2, s1, s7 
-; GFX908-NEXT: s_sub_i32 s2, s6, s2 -; GFX908-NEXT: s_add_i32 s3, s1, 1 -; GFX908-NEXT: s_sub_i32 s6, s2, s7 -; GFX908-NEXT: s_cmp_ge_u32 s2, s7 -; GFX908-NEXT: s_cselect_b32 s1, s3, s1 -; GFX908-NEXT: s_cselect_b32 s2, s6, s2 -; GFX908-NEXT: s_add_i32 s3, s1, 1 -; GFX908-NEXT: s_cmp_ge_u32 s2, s7 -; GFX908-NEXT: s_cselect_b32 s8, s3, s1 -; GFX908-NEXT: s_lshr_b32 s2, s0, 16 +; GFX908-NEXT: v_readfirstlane_b32 s5, v0 +; GFX908-NEXT: s_mul_i32 s3, s3, s5 +; GFX908-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX908-NEXT: s_add_i32 s5, s5, s3 +; GFX908-NEXT: s_mul_hi_u32 s3, s0, s5 +; GFX908-NEXT: s_mul_i32 s5, s3, s1 +; GFX908-NEXT: s_sub_i32 s0, s0, s5 +; GFX908-NEXT: s_add_i32 s8, s3, 1 +; GFX908-NEXT: s_sub_i32 s5, s0, s1 +; GFX908-NEXT: s_cmp_ge_u32 s0, s1 +; GFX908-NEXT: s_cselect_b32 s3, s8, s3 +; GFX908-NEXT: s_cselect_b32 s0, s5, s0 +; GFX908-NEXT: s_add_i32 s5, s3, 1 +; GFX908-NEXT: s_cmp_ge_u32 s0, s1 +; GFX908-NEXT: s_cselect_b32 s8, s5, s3 +; GFX908-NEXT: s_lshr_b32 s2, s2, 16 ; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2 -; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[16:17], 5 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_or_b32 s14, s14, 28 -; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 +; GFX908-NEXT: s_or_b32 s12, s12, 28 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[8:9], 5 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s2, v16 ; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX908-NEXT: s_mul_i32 s3, s5, s2 -; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX908-NEXT: s_mul_i32 s2, s4, s2 +; GFX908-NEXT: s_mul_i32 s3, s17, s2 +; GFX908-NEXT: s_mul_hi_u32 s5, s16, s2 +; GFX908-NEXT: s_mul_i32 s2, s16, s2 ; GFX908-NEXT: s_add_i32 s3, s5, s3 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 +; GFX908-NEXT: s_lshl_b64 s[16:17], s[2:3], 5 ; GFX908-NEXT: 
s_branch .LBB3_2 -; GFX908-NEXT: .LBB3_1: ; %Flow20 +; GFX908-NEXT: .LBB3_1: ; %Flow21 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 @@ -569,47 +569,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 -; GFX908-NEXT: s_mov_b32 s13, s12 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1 +; GFX908-NEXT: s_mov_b32 s5, s4 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v4, s12 +; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s12 -; GFX908-NEXT: v_mov_b32_e32 v8, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s13 -; GFX908-NEXT: v_mov_b32_e32 v7, s13 -; GFX908-NEXT: v_mov_b32_e32 v9, s13 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 +; GFX908-NEXT: v_mov_b32_e32 v7, s5 +; GFX908-NEXT: v_mov_b32_e32 v9, s5 +; GFX908-NEXT: v_mov_b32_e32 v5, s5 +; GFX908-NEXT: v_mov_b32_e32 v6, s4 +; GFX908-NEXT: v_mov_b32_e32 v8, s4 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX908-NEXT: s_mov_b64 s[20:21], s[12:13] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s9, v2 -; GFX908-NEXT: v_readfirstlane_b32 s13, v3 -; GFX908-NEXT: s_add_u32 s9, s9, 1 -; GFX908-NEXT: s_addc_u32 s13, s13, 0 -; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX908-NEXT: s_mul_i32 s13, s6, s13 -; GFX908-NEXT: s_mul_i32 s23, s7, s9 -; GFX908-NEXT: s_add_i32 s13, s22, s13 -; GFX908-NEXT: s_mul_i32 s9, s6, s9 -; GFX908-NEXT: s_add_i32 s13, s13, s23 +; GFX908-NEXT: v_readfirstlane_b32 s5, v2 +; GFX908-NEXT: v_readfirstlane_b32 s9, v3 +; GFX908-NEXT: s_add_u32 s5, s5, 1 
+; GFX908-NEXT: s_addc_u32 s9, s9, 0 +; GFX908-NEXT: s_mul_hi_u32 s22, s10, s5 +; GFX908-NEXT: s_mul_i32 s9, s10, s9 +; GFX908-NEXT: s_mul_i32 s23, s11, s5 +; GFX908-NEXT: s_add_i32 s9, s22, s9 +; GFX908-NEXT: s_mul_i32 s5, s10, s5 +; GFX908-NEXT: s_add_i32 s9, s9, s23 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s4 +; GFX908-NEXT: s_add_u32 s20, s20, s16 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s5 +; GFX908-NEXT: s_addc_u32 s21, s21, s17 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s22, s20, s9 -; GFX908-NEXT: s_addc_u32 s23, s21, s13 +; GFX908-NEXT: s_add_u32 s22, s20, s5 +; GFX908-NEXT: s_addc_u32 s23, s21, s9 ; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc @@ -657,17 +657,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX908-NEXT: .LBB3_10: ; %Flow19 +; GFX908-NEXT: .LBB3_10: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_mov_b64 s[2:3], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s10, s10, s8 -; GFX908-NEXT: s_addc_u32 s11, s11, 0 -; GFX908-NEXT: 
s_add_u32 s14, s14, s16 -; GFX908-NEXT: s_addc_u32 s15, s15, s17 +; GFX908-NEXT: s_add_u32 s6, s6, s8 +; GFX908-NEXT: s_addc_u32 s7, s7, 0 +; GFX908-NEXT: s_add_u32 s12, s12, s14 +; GFX908-NEXT: s_addc_u32 s13, s13, s15 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock @@ -676,52 +676,52 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 -; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18 -; GFX90A-NEXT: s_mov_b32 s12, 0 -; GFX90A-NEXT: s_mov_b32 s9, s12 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[8:9], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 +; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX90A-NEXT: s_sub_i32 s1, 0, s7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX90A-NEXT: s_sub_i32 s3, 0, s1 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 -; GFX90A-NEXT: s_mul_i32 s1, s1, s2 -; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1 -; GFX90A-NEXT: s_add_i32 s2, s2, s1 -; GFX90A-NEXT: s_mul_hi_u32 s1, s6, s2 -; GFX90A-NEXT: s_mul_i32 s2, s1, s7 -; GFX90A-NEXT: s_sub_i32 s2, s6, s2 -; GFX90A-NEXT: s_add_i32 s3, s1, 1 -; GFX90A-NEXT: s_sub_i32 s6, s2, s7 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 -; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 -; GFX90A-NEXT: s_cselect_b32 s2, s6, s2 -; GFX90A-NEXT: s_add_i32 s3, s1, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 -; GFX90A-NEXT: s_cselect_b32 
s8, s3, s1 -; GFX90A-NEXT: s_lshr_b32 s2, s0, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v1 +; GFX90A-NEXT: s_mul_i32 s3, s3, s5 +; GFX90A-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX90A-NEXT: s_add_i32 s5, s5, s3 +; GFX90A-NEXT: s_mul_hi_u32 s3, s0, s5 +; GFX90A-NEXT: s_mul_i32 s5, s3, s1 +; GFX90A-NEXT: s_sub_i32 s0, s0, s5 +; GFX90A-NEXT: s_add_i32 s8, s3, 1 +; GFX90A-NEXT: s_sub_i32 s5, s0, s1 +; GFX90A-NEXT: s_cmp_ge_u32 s0, s1 +; GFX90A-NEXT: s_cselect_b32 s3, s8, s3 +; GFX90A-NEXT: s_cselect_b32 s0, s5, s0 +; GFX90A-NEXT: s_add_i32 s5, s3, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s0, s1 +; GFX90A-NEXT: s_cselect_b32 s8, s5, s3 +; GFX90A-NEXT: s_lshr_b32 s2, s2, 16 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[16:17], 5 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_or_b32 s14, s14, 28 -; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 +; GFX90A-NEXT: s_or_b32 s12, s12, 28 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[8:9], 5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 ; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX90A-NEXT: s_mul_i32 s3, s5, s2 -; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX90A-NEXT: s_mul_i32 s2, s4, s2 +; GFX90A-NEXT: s_mul_i32 s3, s17, s2 +; GFX90A-NEXT: s_mul_hi_u32 s5, s16, s2 +; GFX90A-NEXT: s_mul_i32 s2, s16, s2 ; GFX90A-NEXT: s_add_i32 s3, s5, s3 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 +; GFX90A-NEXT: s_lshl_b64 s[16:17], s[2:3], 5 ; GFX90A-NEXT: s_branch .LBB3_2 -; GFX90A-NEXT: .LBB3_1: ; %Flow20 +; GFX90A-NEXT: .LBB3_1: ; %Flow21 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 @@ -734,34 +734,34 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; 
GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 -; GFX90A-NEXT: s_mov_b32 s13, s12 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1 +; GFX90A-NEXT: s_mov_b32 s5, s4 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0 +; GFX90A-NEXT: s_mov_b64 s[20:21], s[12:13] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s13, v5 -; GFX90A-NEXT: s_add_u32 s9, s9, 1 -; GFX90A-NEXT: s_addc_u32 s13, s13, 0 -; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX90A-NEXT: s_mul_i32 s13, s6, s13 -; GFX90A-NEXT: s_mul_i32 s23, s7, s9 -; GFX90A-NEXT: s_add_i32 s13, s22, s13 -; GFX90A-NEXT: s_mul_i32 s9, s6, s9 -; GFX90A-NEXT: s_add_i32 s13, s13, s23 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 +; GFX90A-NEXT: s_add_u32 s5, s5, 1 +; GFX90A-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NEXT: s_mul_hi_u32 s22, s10, s5 +; GFX90A-NEXT: s_mul_i32 s9, s10, s9 +; GFX90A-NEXT: s_mul_i32 s23, s11, s5 +; GFX90A-NEXT: s_add_i32 s9, s22, s9 +; GFX90A-NEXT: s_mul_i32 s5, s10, s5 +; GFX90A-NEXT: s_add_i32 s9, s9, s23 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: 
v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s4 -; GFX90A-NEXT: s_addc_u32 s21, s21, s5 +; GFX90A-NEXT: s_add_u32 s20, s20, s16 +; GFX90A-NEXT: s_addc_u32 s21, s21, s17 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] @@ -769,8 +769,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s22, s20, s9 -; GFX90A-NEXT: s_addc_u32 s23, s21, s13 +; GFX90A-NEXT: s_add_u32 s22, s20, s5 +; GFX90A-NEXT: s_addc_u32 s23, s21, s9 ; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -811,17 +811,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX90A-NEXT: .LBB3_10: ; %Flow19 +; GFX90A-NEXT: .LBB3_10: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_mov_b64 s[2:3], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s10, s10, s8 -; GFX90A-NEXT: s_addc_u32 s11, s11, 0 -; GFX90A-NEXT: s_add_u32 s14, s14, s16 -; GFX90A-NEXT: s_addc_u32 s15, s15, s17 +; GFX90A-NEXT: s_add_u32 s6, s6, s8 +; GFX90A-NEXT: s_addc_u32 s7, s7, 0 +; GFX90A-NEXT: s_add_u32 s12, s12, s14 +; GFX90A-NEXT: s_addc_u32 s13, s13, s15 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock diff 
--git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll index f96a6f74e3c0d..1e5ecc725de3b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll @@ -1,13 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}kernel_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_default() #0 { +; GCN-LABEL: kernel_ieee_mode_default: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; 
GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 
v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}kernel_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_on() #1 { +; GCN-LABEL: kernel_ieee_mode_on: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 
+; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: 
v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}kernel_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_off() #2 { +; GCN-LABEL: kernel_ieee_mode_off: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 0 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: 
enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr 
addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_default() #0 { +; GCN-LABEL: func_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_on() #1 { +; GCN-LABEL: func_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; 
GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_off() #2 { +; GCN-LABEL: func_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], 
[[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_default() #0 { +; GCN-LABEL: cs_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_on() #1 { +; GCN-LABEL: cs_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_off: -; GCN: 
{{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_off() #2 { +; GCN-LABEL: cs_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_default() #0 { +; GCN-LABEL: ps_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 { ret void } -; GCN-LABEL: 
{{^}}ps_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_on() #1 { +; GCN-LABEL: ps_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_off() #2 { +; GCN-LABEL: ps_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison 
%min = call float @llvm.minnum.f32(float %val0, float %val1) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index e27164c2d6d69..b157d2c3d1225 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2547,44 +2547,45 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: s_and_b32 s0, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s1, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_lshr_b32 s4, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s11, 0xffff -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; 
GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s11, 16 +; GFX6-NEXT: s_lshr_b32 s4, s9, 16 ; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_lshr_b32 s4, s9, 16 +; GFX6-NEXT: s_lshr_b32 s4, s7, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -2600,6 +2601,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2607,42 +2609,43 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_and_b32 s6, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; 
GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s0, s3, 0xffff +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: s_and_b32 s0, s1, 0xffff +; GFX9-NEXT: s_and_b32 s2, s3, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX9-NEXT: s_lshr_b32 s0, s3, 16 +; GFX9-NEXT: s_lshr_b32 s2, s7, 16 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -2650,7 +2653,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: 
v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -2659,8 +2661,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -2762,49 +2763,51 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: s_and_b32 s0, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s1, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; 
GFX6-NEXT: s_lshr_b32 s5, s6, 16 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 -; GFX6-NEXT: s_and_b32 s6, s11, 0xffff -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: s_and_b32 s5, s9, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 +; GFX6-NEXT: s_and_b32 s8, s9, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 -; GFX6-NEXT: s_lshr_b32 s4, s11, 16 +; GFX6-NEXT: s_lshr_b32 s4, s9, 16 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: s_lshr_b32 s5, s9, 16 +; GFX6-NEXT: s_lshr_b32 s5, s7, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 
vcc, |v3|, v2 @@ -2815,10 +2818,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -2830,67 +2833,67 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s4, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; 
GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s8, s3, 0xffff ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: s_lshr_b32 s6, s7, 16 ; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 -; GFX9-NEXT: v_sub_u32_e32 v4, s0, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: 
v_sub_u32_e32 v4, s2, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3000,62 +3003,64 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s7, s10 +; GFX6-NEXT: s_sext_i32_i16 s6, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7 +; GFX6-NEXT: s_xor_b32 s6, s7, s6 +; GFX6-NEXT: s_ashr_i32 s6, s6, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_or_b32 s8, s6, 1 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: s_ashr_i32 s5, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_cselect_b32 s6, s8, 0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: s_ashr_i32 s6, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s4, s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_sext_i32_i16 s5, s11 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_sext_i32_i16 s6, s5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s9 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: s_ashr_i32 s5, s11, 16 +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: 
v_cvt_f32_i32_e32 v0, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s9, 16 +; GFX6-NEXT: s_ashr_i32 s4, s11, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 @@ -3080,13 +3085,13 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 +; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -3098,61 +3103,61 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s8, 0 +; GFX9-NEXT: s_ashr_i32 s5, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 -; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: 
s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: s_sext_i32_i16 s4, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_ashr_i32 s4, s7, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 ; GFX9-NEXT: s_ashr_i32 s2, s3, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 -; GFX9-NEXT: s_ashr_i32 s0, s1, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s2, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 
s0, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3270,53 +3275,55 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s0, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX6-NEXT: s_sext_i32_i16 s1, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_or_b32 s10, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s10, 16 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; 
GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_cselect_b32 s0, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_ashr_i32 s5, s8, 16 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s6, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: s_lshr_b32 s7, s10, 16 -; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NEXT: s_lshr_b32 s8, s8, 16 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s11 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s9 +; GFX6-NEXT: s_sext_i32_i16 s5, s7 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 @@ -3330,30 +3337,30 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s11, 16 +; GFX6-NEXT: 
s_ashr_i32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_ashr_i32 s5, s9, 16 +; GFX6-NEXT: s_ashr_i32 s5, s7, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s9, 16 -; GFX6-NEXT: s_lshr_b32 s7, s11, 16 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX6-NEXT: s_lshr_b32 s6, s7, 16 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: s_lshr_b32 s8, s9, 16 +; GFX6-NEXT: s_or_b32 s9, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 +; GFX6-NEXT: s_cselect_b32 s4, s9, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s8 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -3362,13 +3369,13 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s2 +; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 +; GFX9-NEXT: s_sext_i32_i16 s9, s2 ; GFX9-NEXT: 
v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -3380,69 +3387,69 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s10, 0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 -; GFX9-NEXT: s_ashr_i32 s0, s2, 16 +; GFX9-NEXT: s_ashr_i32 s10, s2, 16 +; GFX9-NEXT: s_ashr_i32 s2, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s2, s10, s0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 -; GFX9-NEXT: s_sext_i32_i16 s8, s1 +; GFX9-NEXT: s_xor_b32 s4, s10, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s6, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: s_sext_i32_i16 s6, s7 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: s_sext_i32_i16 s8, s3 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_xor_b32 s0, s8, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s0, s0, 1 -; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: s_xor_b32 s2, s8, s6 +; 
GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX9-NEXT: s_ashr_i32 s2, s1, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_ashr_i32 s4, s7, 16 +; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX9-NEXT: s_ashr_i32 s5, s3, 16 +; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s0, s2, s3 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s5, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 +; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s8, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: 
v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3838,46 +3845,48 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: s_and_b32 s0, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s1, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_lshr_b32 s4, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s11, 0xffff -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; 
GFX6-NEXT: s_and_b32 s4, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3885,48 +3894,47 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_and_b32 s6, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; 
GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s0, s3, 0xffff +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s0, s1, 0xffff +; GFX9-NEXT: s_and_b32 s2, s3, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v6, v2, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v6, v0, s[6:7] +; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4006,52 +4014,54 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: s_lshr_b32 s6, s10, 16 +; GFX6-NEXT: s_and_b32 s1, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_and_b32 s1, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v4 +; GFX6-NEXT: s_mov_b32 s1, s9 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s6, s11, 0xffff -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: s_and_b32 s6, s9, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 
v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: v_mad_f32 v2, -v1, v4, v3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4059,33 +4069,34 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s2, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_and_b32 s8, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4094,18 +4105,17 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, 
v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v1, s0, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[6:7] +; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4191,46 +4201,47 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s7, s10 +; GFX6-NEXT: s_sext_i32_i16 s6, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7 +; GFX6-NEXT: s_xor_b32 s6, s7, s6 +; GFX6-NEXT: s_ashr_i32 s6, s6, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_or_b32 s8, s6, 1 +; GFX6-NEXT: s_sext_i32_i16 s5, s5 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 
-; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: s_ashr_i32 s5, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: s_cselect_b32 s6, s8, 0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v2 +; GFX6-NEXT: s_ashr_i32 s6, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s4, s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_sext_i32_i16 s5, s11 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s9 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 @@ -4243,6 +4254,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4253,13 +4265,13 @@ define 
amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 +; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -4271,44 +4283,44 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s8, 0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GFX9-NEXT: s_ashr_i32 s5, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s1 -; GFX9-NEXT: 
v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: s_sext_i32_i16 s4, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s2, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[6:7] +; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4400,68 +4412,70 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: 
v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s0, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX6-NEXT: s_sext_i32_i16 s1, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_or_b32 s10, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s10, 16 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_cselect_b32 s0, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_ashr_i32 s5, s8, 16 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s6, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: s_lshr_b32 s7, s10, 16 -; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NEXT: s_lshr_b32 s8, s8, 16 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 +; GFX6-NEXT: 
s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s11 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s9 +; GFX6-NEXT: s_sext_i32_i16 s5, s7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: s_or_b32 s7, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s7, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s9 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 @@ -4470,12 +4484,12 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s2 +; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 +; GFX9-NEXT: s_sext_i32_i16 s9, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v2, v0 @@ -4487,51 +4501,51 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s10, 0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 -; GFX9-NEXT: s_ashr_i32 s0, s2, 16 +; GFX9-NEXT: s_ashr_i32 s10, s2, 16 +; GFX9-NEXT: s_ashr_i32 s2, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s2, s10, s0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: s_xor_b32 s4, s10, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s6, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v3 +; GFX9-NEXT: s_sext_i32_i16 s4, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: s_xor_b32 s0, s3, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s5, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s6, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: 
v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[6:7] +; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -5653,29 +5667,31 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s0 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 ; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s8, s1 +; GFX6-NEXT: s_sub_i32 s1, s6, s1 ; GFX6-NEXT: s_sub_i32 s3, s1, s0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 @@ -5690,10 +5706,10 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s7, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 ; GFX6-NEXT: s_mul_i32 s0, s0, s2 -; GFX6-NEXT: s_sub_i32 s0, s9, s0 +; GFX6-NEXT: s_sub_i32 s0, s7, s0 ; GFX6-NEXT: s_sub_i32 s1, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2 @@ -5704,19 +5720,19 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -5728,37 +5744,37 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 ; GFX9-NEXT: s_mul_i32 s5, s4, s7 -; GFX9-NEXT: s_sub_i32 s0, s0, s5 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 ; GFX9-NEXT: s_add_i32 s9, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_sub_i32 s5, s2, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 ; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 ; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s0, s5, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 ; GFX9-NEXT: s_sub_i32 s4, 0, s6 ; GFX9-NEXT: s_mul_i32 s4, s4, s8 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 ; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8 ; GFX9-NEXT: s_mul_i32 s5, s4, s6 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 ; GFX9-NEXT: s_add_i32 s7, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s1, s6 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_sub_i32 s5, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 ; 
GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s1, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = udiv <2 x i32> %x, %shl.y @@ -5994,64 +6010,67 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_lshl_b32 s8, 0x1000, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s6, s6, s2 -; GFX6-NEXT: s_sub_i32 
s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s2, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s4, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, 0, s8 +; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s8 +; GFX6-NEXT: s_sub_i32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -6063,33 +6082,33 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 ; GFX9-NEXT: s_mul_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s0, s4, s0 -; GFX9-NEXT: s_sub_i32 s4, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s4, s2, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s2, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s0, s4, s0 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 ; GFX9-NEXT: s_sub_i32 s4, 0, s6 ; GFX9-NEXT: s_mul_i32 s4, s4, s8 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 ; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8 ; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s4, s1, s6 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: s_sub_i32 s4, s1, s6 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s4, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = urem <2 x i32> %x, %shl.y @@ -6471,136 +6490,138 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX6-NEXT: s_abs_i32 s6, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s7, 0, s6 -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s8, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_lshl_b32 s9, 0x1000, s7 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 -; GFX6-NEXT: s_abs_i32 s7, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_abs_i32 s4, s2 +; GFX6-NEXT: s_xor_b32 s2, s2, s6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s6 -; GFX6-NEXT: s_sub_i32 s2, s7, s2 -; GFX6-NEXT: s_sub_i32 s7, s2, s6 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, 
v0 +; GFX6-NEXT: s_ashr_i32 s2, s2, 31 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_mul_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s5, s4, s8 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s2, s7, s2 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_abs_i32 s2, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_abs_i32 s8, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_xor_b32 s3, s1, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_abs_i32 s1, s1 -; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_abs_i32 s1, s3 +; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_ashr_i32 s3, s3, 31 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s0, v2 +; GFX6-NEXT: s_xor_b32 s0, s3, s9 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 -; GFX6-NEXT: s_mul_i32 s0, s0, s2 -; GFX6-NEXT: s_sub_i32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s2 +; GFX6-NEXT: v_readfirstlane_b32 s2, v1 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, 
s1, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s1, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s1, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX9-NEXT: s_abs_i32 s6, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3 -; GFX9-NEXT: s_abs_i32 s3, s0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s8, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7 +; GFX9-NEXT: s_abs_i32 s5, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 -; GFX9-NEXT: s_ashr_i32 s0, s0, 31 +; GFX9-NEXT: s_sub_i32 s6, 0, s8 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2 -; GFX9-NEXT: s_add_i32 s8, s8, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s8 -; GFX9-NEXT: s_mul_i32 s8, s2, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 -; GFX9-NEXT: s_add_i32 s9, s2, 1 -; 
GFX9-NEXT: s_sub_i32 s8, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s2, s9, s2 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s2, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s6, s8, s2 -; GFX9-NEXT: s_abs_i32 s8, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: s_xor_b32 s5, s6, s0 -; GFX9-NEXT: s_sub_i32 s6, 0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 +; GFX9-NEXT: s_mul_i32 s7, s6, s8 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_sub_i32 s7, s5, s8 +; GFX9-NEXT: s_cmp_ge_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s5, s7, s6 +; GFX9-NEXT: s_abs_i32 s6, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_xor_b32 s5, s5, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 +; GFX9-NEXT: s_sub_i32 s2, s5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_xor_b32 s4, s1, s7 -; GFX9-NEXT: s_abs_i32 s1, s1 +; GFX9-NEXT: s_xor_b32 s4, s3, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5 -; GFX9-NEXT: s_mul_i32 s6, s5, s8 -; GFX9-NEXT: s_sub_i32 s1, s1, s6 +; GFX9-NEXT: s_mul_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s7 +; 
GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_sub_i32 s7, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s1, s8 -; GFX9-NEXT: s_cmp_ge_u32 s1, s8 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_cselect_b32 s1, s6, s1 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s8 -; GFX9-NEXT: s_cselect_b32 s1, s6, s5 -; GFX9-NEXT: s_xor_b32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s7, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = sdiv <2 x i32> %x, %shl.y @@ -6923,122 +6944,125 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX6-NEXT: s_abs_i32 s2, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s6, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX6-NEXT: s_abs_i32 s6, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s7 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_abs_i32 s4, s2 +; GFX6-NEXT: s_ashr_i32 s2, s2, 31 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s2 -; GFX6-NEXT: s_sub_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s7, s6, s2 -; GFX6-NEXT: s_cmp_ge_u32 s6, s2 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 -; GFX6-NEXT: s_sub_i32 s7, s6, s2 -; GFX6-NEXT: s_cmp_ge_u32 s6, s2 -; GFX6-NEXT: s_cselect_b32 s2, s7, s6 -; GFX6-NEXT: s_abs_i32 s3, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s6, 0, s3 -; GFX6-NEXT: s_abs_i32 s8, s1 -; GFX6-NEXT: s_xor_b32 s2, s2, s0 +; GFX6-NEXT: s_mul_i32 s7, s7, s6 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s7, s4, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s6 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s7, s4, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s6 +; GFX6-NEXT: s_cselect_b32 s8, s7, s4 +; GFX6-NEXT: s_abs_i32 s9, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_sub_i32 s4, 0, s9 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_ashr_i32 s1, s3, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, 31 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_abs_i32 s0, s3 +; GFX6-NEXT: s_xor_b32 s3, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_sub_i32 s2, s3, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: 
s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_xor_b32 s2, s2, s1 -; GFX6-NEXT: s_sub_i32 s1, s2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s9 +; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s3, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s3, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s7, 0, s2 -; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: s_abs_i32 s0, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s6, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s5, s2, 31 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_hi_u32 
s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s0, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s7 -; GFX9-NEXT: s_sub_i32 s7, s0, s2 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s0, s7, s0 -; GFX9-NEXT: s_sub_i32 s7, s0, s2 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s0, s7, s0 -; GFX9-NEXT: s_abs_i32 s7, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s0, s0, s6 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s2, s8 +; GFX9-NEXT: s_mul_i32 s7, s7, s6 +; GFX9-NEXT: s_sub_i32 s2, s2, s7 +; GFX9-NEXT: s_sub_i32 s7, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s2, s7, s2 +; GFX9-NEXT: s_sub_i32 s7, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s2, s7, s2 +; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s7, 0, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s0, s0, s6 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_abs_i32 s1, s1 +; GFX9-NEXT: s_ashr_i32 s6, s3, 31 +; GFX9-NEXT: s_abs_i32 s3, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_sub_i32 s5, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_sub_i32 s5, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_xor_b32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s5, s5, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = srem <2 x i32> %x, %shl.y @@ -10262,6 +10286,9 @@ define i64 @udiv_i64_9divbits(i8 %size) { } define <2 x i64> @srem_zero_zero() { +; GCN-LABEL: kernel: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_endpgm ; GFX6-LABEL: srem_zero_zero: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 29bfc253e2e7e..ab76b2ad18b89 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -697,6 +697,12 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ret void } +; FUNC-LABEL: {{^}}s_and_32_bit_constant_i64: +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}} +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, i64 %a) { ; GFX6-LABEL: s_and_32_bit_constant_i64: ; GFX6: ; %bb.0: @@ -728,6 +734,17 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, ret void } +; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64: +; SI: s_load_dwordx4 +; SI: s_load_dword +; SI: s_load_dwordx2 +; SI-NOT: and +; SI: s_lshl_b32 
[[C:s[0-9]+]], [[A:s[0-9]+]], 1 +; SI: s_lshl_b32 [[D:s[0-9]+]], [[B:s[0-9]+]], 1 +; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62 +; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) { ; GFX6-LABEL: s_and_multi_use_inline_imm_i64: ; GFX6: ; %bb.0: @@ -1179,6 +1196,14 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink: +; SI: s_load_dwordx4 +; SI: s_lshl_b32 [[B:s[0-9]+]], [[A:s[0-9]+]], 1{{$}} +; SI-NOT: and +; SI: s_and_b32 s{{[0-9]+}}, [[B:s[0-9]+]], 64 +; SI-NOT: and +; SI: s_add_u32 +; SI-NEXT: s_mov_b32 define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a, i32, i64 %b) { ; GFX6-LABEL: s_and_inline_imm_64_i64_noshrink: ; GFX6: ; %bb.0: @@ -1220,6 +1245,12 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64 +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_1_i64: ; GFX6: ; %bb.0: @@ -1251,6 +1282,14 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64 +; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 + +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_1.0_i64: ; GFX6: ; %bb.0: @@ -1282,6 +1321,14 @@ define amdgpu_kernel void 
@s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64 +; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 + +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_neg_1.0_i64: ; GFX6: ; %bb.0: @@ -1313,6 +1360,14 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64 +; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 + +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_0.5_i64: ; GFX6: ; %bb.0: @@ -1344,6 +1399,14 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64: +; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 + +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_neg_0.5_i64: ; GFX6: ; %bb.0: @@ -1375,6 +1438,12 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64: +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: 
s_and_inline_imm_2.0_i64: ; GFX6: ; %bb.0: @@ -1406,6 +1475,12 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64: +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_neg_2.0_i64: ; GFX6: ; %bb.0: @@ -1437,6 +1512,14 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64: +; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 + +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_4.0_i64: ; GFX6: ; %bb.0: @@ -1468,6 +1551,14 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64: +; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 + +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_neg_4.0_i64: ; GFX6: ; %bb.0: @@ -1502,6 +1593,12 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, p ; Test with the 64-bit integer bitpattern for a 32-bit float in the ; low 32-bits, which is not a valid 64-bit inline immmediate. 
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64: +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_f32_4.0_i64: ; GFX6: ; %bb.0: @@ -1533,6 +1630,12 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p ret void } +; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64: +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_imm_f32_neg_4.0_i64: ; GFX6: ; %bb.0: @@ -1565,7 +1668,11 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou } ; Shift into upper 32-bits - +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_high_imm_f32_4.0_i64: ; GFX6: ; %bb.0: @@ -1597,6 +1704,12 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o ret void } +; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64: +; SI: s_load_dwordx4 +; SI-NOT: and +; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 +; SI-NOT: and +; SI: buffer_store_dwordx2 define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; GFX6-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64: ; GFX6: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 763f436997c21..2a9ce083dc0e3 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ 
b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -271,13 +271,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, 16 +; GFX8-NEXT: s_lshl_b32 s1, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index c46fcde739b1c..61fb18e00917b 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff -; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fabs_v4bf16: @@ -234,23 +234,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; VI-NEXT: s_mov_b32 flat_scratch_lo, 
s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s3, 0x7fff -; VI-NEXT: s_lshr_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s5, s2, 0x7fff -; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s3, s3, 0x7fff -; VI-NEXT: s_and_b32 s2, s2, 0x7fff -; VI-NEXT: s_and_b32 s4, 0xffff, s4 -; VI-NEXT: s_and_b32 s5, 0xffff, s5 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s3, s4, s3 -; VI-NEXT: s_or_b32 s2, s5, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff +; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_fabs_v4bf16: @@ -258,14 +248,8 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s3, 0x7fff -; GFX9-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NEXT: s_and_b32 s5, s2, 0x7fff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff -; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2 +; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -275,14 +259,8 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s4, s3, 0x7fff -; GFX11-NEXT: s_lshr_b32 
s3, s3, 16 -; GFX11-NEXT: s_lshr_b32 s5, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff -; GFX11-NEXT: s_and_b32 s5, s5, 0x7fff -; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s4, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 27cf49aec8229..a77c7ae923d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff -; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fabs_v4f16: @@ -234,13 +234,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff -; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: 
v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff +; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_fabs_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 6bcb086944c91..baf9b0abf7b0c 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -115,13 +115,13 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s3, 31 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff +; VI-NEXT: s_and_b32 s1, s2, 0x7fffffff +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 0a2e758f7cf21..1d87d938cc41c 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -472,50 +472,52 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_brev_b32 s8, -2 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; 
SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfi_b32 v1, s8, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_bfi_b32 v0, s8, v0, v2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_brev_b32 s6, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_bfi_b32 v1, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_bfi_b32 v0, s6, v2, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_bfi_b32 v3, s6, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_bfi_b32 v2, s6, v2, v4 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: 
v_dual_mov_b32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index b826e6c469d8e..72e3549656327 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -938,16 +938,18 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-FASTFMA-NEXT: s_mov_b32 s4, s0 +; GFX6-FASTFMA-NEXT: s_mov_b32 s5, s1 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 +; GFX6-FASTFMA-NEXT: 
v_mov_b32_e32 v0, s9 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -956,13 +958,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s11, v1 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s9, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -972,20 +974,21 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s10, v2 -; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s8, v2 +; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-FASTFMA-NEXT: s_endpgm ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 
0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -995,14 +998,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s2 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s3, v0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v0 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1012,22 +1014,24 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr 
addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s2, v4 -; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v4 +; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s11 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1036,13 +1040,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 +; GFX7-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_fixup_f32 
v1, v0, s11, v1 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 +; GFX7-NEXT: v_div_fixup_f32 v1, v0, s9, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -1052,19 +1056,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v0, s10, v2 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: v_div_fixup_f32 v0, v0, s8, v2 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1074,13 +1079,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4 ; GFX8-NEXT: 
v_div_fmas_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-NEXT: v_div_fixup_f32 v1, v1, s3, v0 +; GFX8-NEXT: v_div_fixup_f32 v1, v1, s5, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1090,19 +1094,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_div_fixup_f32 v0, v0, s2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_div_fixup_f32 v0, v0, s4, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s6, s3, s3, s1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s7, s7, s3 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1112,12 +1117,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s6, s2, s2, s0 +; GFX10-NEXT: v_div_scale_f32 v2, s4, s6, s6, s2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, 
v1, v3 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_div_fixup_f32 v1, v0, s3, s1 -; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0 +; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s3 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s6, s2 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -1128,19 +1132,18 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, s0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 +; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s3 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s5, s3 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1151,11 +1154,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v2, null, s2, s2, s0 +; GFX11-NEXT: v_div_scale_f32 v2, null, s4, s4, s2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: v_div_fixup_f32 v1, v0, s3, s1 -; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, 
s2, s0 +; GFX11-NEXT: v_div_fixup_f32 v1, v0, s5, s3 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s4, s2 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -1167,8 +1170,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32: @@ -1193,58 +1196,60 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s2 -; GFX67-NEXT: v_rcp_f32_e32 v1, s3 -; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX67-NEXT: v_rcp_f32_e32 v0, s8 +; GFX67-NEXT: v_rcp_f32_e32 v1, s9 +; GFX67-NEXT: s_mov_b32 s4, s0 +; GFX67-NEXT: s_mov_b32 s5, s1 +; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX67-NEXT: v_mul_f32_e32 v1, s3, v1 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s2 -; GFX8-NEXT: v_rcp_f32_e32 v1, s3 -; 
GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_rcp_f32_e32 v2, s6 +; GFX8-NEXT: v_rcp_f32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s2 -; GFX10-NEXT: v_rcp_f32_e32 v1, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, s1, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_rcp_f32_e32 v0, s6 +; GFX10-NEXT: v_rcp_f32_e32 v1, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s2 -; GFX11-NEXT: v_rcp_f32_e32 v1, s3 +; GFX11-NEXT: v_rcp_f32_e32 v0, s6 +; GFX11-NEXT: v_rcp_f32_e32 v1, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, 
s3, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_ulp25_v2f32: @@ -1269,58 +1274,60 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: v_rcp_f32_e32 v2, s2 -; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX67-NEXT: v_rcp_f32_e32 v0, s9 +; GFX67-NEXT: v_rcp_f32_e32 v2, s8 +; GFX67-NEXT: s_mov_b32 s4, s0 +; GFX67-NEXT: s_mov_b32 s5, s1 +; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_rcp_f32_e32 v2, s2 -; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_rcp_f32_e32 v2, s7 +; GFX8-NEXT: v_rcp_f32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; 
GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_rcp_f32_e32 v2, s2 -; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_rcp_f32_e32 v2, s6 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 -; GFX11-NEXT: v_rcp_f32_e32 v2, s2 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 +; GFX11-NEXT: v_rcp_f32_e32 v2, s6 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_fast_math: @@ -1345,58 +1352,60 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: v_rcp_f32_e32 v2, s2 -; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX67-NEXT: v_rcp_f32_e32 v0, s9 +; GFX67-NEXT: v_rcp_f32_e32 v2, s8 +; GFX67-NEXT: s_mov_b32 s4, s0 +; GFX67-NEXT: s_mov_b32 s5, s1 +; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_rcp_f32_e32 v2, s2 -; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_rcp_f32_e32 v2, s7 +; GFX8-NEXT: v_rcp_f32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_rcp_f32_e32 v2, s2 -; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] +; GFX10-NEXT: v_rcp_f32_e32 v0, 
s7 +; GFX10-NEXT: v_rcp_f32_e32 v2, s6 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 -; GFX11-NEXT: v_rcp_f32_e32 v2, s2 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 +; GFX11-NEXT: v_rcp_f32_e32 v2, s6 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_arcp_math: diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index a025c36f620c7..3e450b785b57b 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -135,11 +135,11 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_rndne_f32_e32 v1, s3 -; VI-NEXT: v_rndne_f32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_rndne_f32_e32 v3, s3 +; VI-NEXT: v_rndne_f32_e32 v2, s2 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fnearbyint_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 
76da0aaf251b2..25010677bc19a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -589,13 +589,13 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_or_b32 s3, s3, 0x80008000 -; CI-NEXT: s_or_b32 s2, s2, 0x80008000 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_or_b32 s0, s3, 0x80008000 +; CI-NEXT: s_or_b32 s1, s2, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabs_v4bf16: @@ -605,23 +605,25 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_bitset1_b32 s3, 15 -; VI-NEXT: s_bitset1_b32 s2, 15 -; VI-NEXT: s_bitset1_b32 s5, 15 -; VI-NEXT: s_bitset1_b32 s4, 15 -; VI-NEXT: s_and_b32 s3, 0xffff, s3 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_and_b32 s2, 0xffff, s2 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_or_b32 s3, s3, s5 -; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s0, s2, 0x7fff7fff +; VI-NEXT: s_and_b32 s1, s3, 0x7fff7fff +; VI-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; VI-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; VI-NEXT: s_xor_b32 s1, s1, 0x8000 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_xor_b32 s0, s0, 0x8000 +; VI-NEXT: 
s_xor_b32 s2, s2, 0x8000 +; VI-NEXT: s_and_b32 s1, 0xffff, s1 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s0, 0xffff, s0 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fneg_fabs_v4bf16: @@ -629,14 +631,16 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_bitset1_b32 s3, 15 -; GFX9-NEXT: s_bitset1_b32 s2, 15 -; GFX9-NEXT: s_bitset1_b32 s5, 15 -; GFX9-NEXT: s_bitset1_b32 s4, 15 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s5, s3, 0x7fff7fff +; GFX9-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; GFX9-NEXT: s_xor_b32 s3, s3, 0x8000 +; GFX9-NEXT: s_xor_b32 s5, s5, 0x8000 +; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX9-NEXT: s_xor_b32 s4, s4, 0x8000 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -646,14 +650,16 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: s_lshr_b32 s5, s3, 16 -; GFX11-NEXT: s_bitset1_b32 s3, 15 -; GFX11-NEXT: s_bitset1_b32 s2, 15 -; GFX11-NEXT: s_bitset1_b32 s4, 15 -; GFX11-NEXT: s_bitset1_b32 s5, 15 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; 
GFX11-NEXT: s_and_b32 s4, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s5, s3, 0x7fff7fff +; GFX11-NEXT: s_bfe_u32 s3, s3, 0xf0010 +; GFX11-NEXT: s_bfe_u32 s2, s2, 0xf0010 +; GFX11-NEXT: s_xor_b32 s3, s3, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX11-NEXT: s_xor_b32 s4, s4, 0x8000 +; GFX11-NEXT: s_xor_b32 s5, s5, 0x8000 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 9d9a851a5507e..305f4e56184cc 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -516,13 +516,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 -; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 -; CIVI-NEXT: v_mov_b32_e32 v3, s1 -; CIVI-NEXT: v_mov_b32_e32 v0, s2 -; CIVI-NEXT: v_mov_b32_e32 v1, s3 -; CIVI-NEXT: v_mov_b32_e32 v2, s0 -; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_or_b32 s0, s3, 0x80008000 +; CIVI-NEXT: s_or_b32 s1, s2, 0x80008000 +; CIVI-NEXT: v_mov_b32_e32 v2, s1 +; CIVI-NEXT: v_mov_b32_e32 v3, s0 +; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CIVI-NEXT: s_endpgm ; ; GFX9-LABEL: fneg_fabs_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 1fa9bfa3cfa3f..b93a598cb52ae 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -215,13 +215,13 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s3, 31 -; VI-NEXT: s_bitset1_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_or_b32 s0, s3, 0x80000000 +; VI-NEXT: s_or_b32 s1, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index c3f4ebe30152b..17225b7c39f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -68,13 +68,13 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 +; VI-NEXT: s_xor_b32 s1, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index ed1ee4527ed89..9385b3c26276e 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -179,22 +179,22 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: 
; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; SI-NEXT: s_not_b32 s3, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_not_b32 s1, s4 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s0, 1 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_not_b32 s1, s7 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: v_alignbit_b32 v0, s3, v0, 1 +; SI-NEXT: s_lshr_b32 s0, s3, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_not_b32 s1, s6 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1 +; SI-NEXT: s_lshr_b32 s0, s2, 1 ; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 @@ -202,47 +202,43 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_not_b32 s7, s7 -; VI-NEXT: s_lshr_b32 s3, s1, 1 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_not_b32 s1, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: s_lshr_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; VI-NEXT: v_mov_b32_e32 
v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_not_b32 s1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 s0, s3, 1 +; VI-NEXT: v_alignbit_b32 v2, s3, v2, 1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_alignbit_b32 v3, s0, v2, v3 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_not_b32 s1, s6 +; VI-NEXT: v_alignbit_b32 v2, s2, v2, 1 +; VI-NEXT: s_lshr_b32 s0, s2, 1 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_alignbit_b32 v2, s0, v2, v4 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; GFX9-NEXT: s_not_b32 s1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_not_b32 s1, s15 +; GFX9-NEXT: s_lshr_b32 s0, s11, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s10, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -265,40 +261,34 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX10-LABEL: 
fshl_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_not_b32 s2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s11, s13, 1 +; GFX10-NEXT: v_alignbit_b32 v3, s10, s12, 1 +; GFX10-NEXT: s_lshr_b32 s0, s11, 1 +; GFX10-NEXT: s_not_b32 s1, s15 +; GFX10-NEXT: s_lshr_b32 s2, s10, 1 +; GFX10-NEXT: s_not_b32 s3, s14 +; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s2, v3, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_not_b32 s2, s7 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s3, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v0, s3, s5, 1 +; GFX11-NEXT: v_alignbit_b32 v3, s2, s4, 1 +; GFX11-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-NEXT: s_not_b32 s4, s7 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_not_b32 s5, s6 +; 
GFX11-NEXT: v_alignbit_b32 v1, s3, v0, s4 +; GFX11-NEXT: v_alignbit_b32 v0, s2, v3, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -309,43 +299,45 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v1, s3, v0, 23 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v3, s3, v2, 23 +; VI-NEXT: v_alignbit_b32 v2, s2, v4, 25 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; 
%bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, 23 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v3, 25 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -365,25 +357,25 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_alignbit_b32 v1, s3, s7, 23 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s6, 25 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 -; GFX11-NEXT: 
global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v1, s3, s5, 23 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s4, 25 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index ef68f44bac203..88c232fea5952 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -221,51 +221,47 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: 
v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_alignbit_b32 v3, s3, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v2, s2, v4, v2 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-NEXT: v_alignbit_b32 v1, s11, v0, v1 +; GFX9-NEXT: v_alignbit_b32 v0, s10, v3, v4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -285,79 +281,64 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; 
GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s15 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_alignbit_b32 v1, s11, s13, v0 +; GFX10-NEXT: v_alignbit_b32 v0, s10, s12, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fshr_v2i32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s3, s5, v0.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s2, s4, v0.h +; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fshr_v2i32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX11-FAKE16-NEXT: global_store_b64 v3, 
v[0:1], s[4:5] +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s3, s5, v0 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s2, s4, v2 +; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-TRUE16-LABEL: fshr_v2i32: ; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s3, s5, v0.l +; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s2, s4, v0.h +; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-TRUE16-NEXT: s_endpgm ; ; GFX12-FAKE16-LABEL: fshr_v2i32: ; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s3, s5, v0 +; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s2, s4, v2 +; 
GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX12-FAKE16-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -368,43 +349,45 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v1, s3, v0, 9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v3, s3, v2, 9 +; VI-NEXT: v_alignbit_b32 v2, s2, v4, 7 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, 9 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v3, 7 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -424,37 +407,37 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_alignbit_b32 v1, s3, s7, 9 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s6, 7 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v1, s3, s5, 9 +; GFX11-NEXT: 
v_alignbit_b32 v0, s2, s4, 7 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_v2i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_alignbit_b32 v1, s3, s5, 9 +; GFX12-NEXT: v_alignbit_b32 v0, s2, s4, 7 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index da132d0269e6b..580eeda73781e 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -9552,6 +9552,47 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a ; GFX9-NEXT: s_cbranch_execnz .LBB136_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s3, s0, 0x4650 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_and_b32 s0, s3, -4 +; GFX11-NEXT: s_and_b32 s3, s3, 3 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_lshl_b32 s5, s3, 3 +; GFX11-NEXT: s_and_b32 s6, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s3, s2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s4, s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1 +; GFX11-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB136_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_endpgm %gep = getelementptr i16, ptr addrspace(1) %out, i64 9000 %val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -9671,6 +9712,47 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: s_cbranch_execnz .LBB137_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s3, s0, 0x2328 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_and_b32 s0, s3, -4 +; GFX11-NEXT: s_and_b32 s3, s3, 3 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_lshl_b32 s5, s3, 3 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, 0xff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s3, s2 +; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s4, s6, s5 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1 +; GFX11-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_execnz .LBB137_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %out, i64 9000 %val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 8e427a6ef2023..87e57298f5dc6 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -98,16 +98,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_add_u32 s4, s0, 4 -; CIVI-NEXT: s_addc_u32 s5, s1, 0 -; CIVI-NEXT: v_mov_b32_e32 v2, s4 -; CIVI-NEXT: v_mov_b32_e32 v4, s3 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v3, s5 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: v_mov_b32_e32 v5, s2 -; CIVI-NEXT: flat_store_short v[2:3], v4 -; CIVI-NEXT: flat_store_dword v[0:1], v5 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: s_add_u32 s0, 
s0, 4 +; CIVI-NEXT: flat_store_dword v[0:1], v2 +; CIVI-NEXT: s_addc_u32 s1, s1, 0 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s3 +; CIVI-NEXT: flat_store_short v[0:1], v2 ; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3f16_arg: @@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v3, s3 ; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CIVI-NEXT: s_endpgm @@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out @@ -348,37 +348,21 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 } define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { -; CI-LABEL: extload_v3f16_to_v3f32_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v4, s1 -; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] -; CI-NEXT: s_endpgm -; 
-; VI-LABEL: extload_v3f16_to_v3f32_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] -; VI-NEXT: s_endpgm +; CIVI-LABEL: extload_v3f16_to_v3f32_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_lshr_b32 s4, s2, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v3, s0 +; CIVI-NEXT: v_mov_b32_e32 v4, s1 +; CIVI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] +; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: @@ -386,9 +370,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> @@ -404,14 +388,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s3, 16 -; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: 
s_lshr_b32 s4, s2, 16 +; CI-NEXT: s_lshr_b32 s5, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; @@ -424,12 +408,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -440,10 +424,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 ; GFX11-NEXT: s_lshr_b32 s5, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> @@ -724,61 +708,33 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 } define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { -; CI-LABEL: extload_v4f16_to_v4f64_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: 
s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: s_lshr_b32 s5, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; CI-NEXT: s_add_u32 s2, s0, 16 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CI-NEXT: v_mov_b32_e32 v9, s3 -; CI-NEXT: v_mov_b32_e32 v8, s2 -; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; CI-NEXT: s_endpgm -; -; VI-LABEL: extload_v4f16_to_v4f64_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v8, s2 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; VI-NEXT: s_endpgm +; CIVI-LABEL: extload_v4f16_to_v4f64_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_lshr_b32 s5, s3, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; CIVI-NEXT: s_lshr_b32 s4, s2, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CIVI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; CIVI-NEXT: s_add_u32 s2, s0, 16 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CIVI-NEXT: s_addc_u32 s3, s1, 0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CIVI-NEXT: v_mov_b32_e32 v9, s3 +; CIVI-NEXT: v_mov_b32_e32 v8, s2 +; CIVI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CIVI-NEXT: s_nop 0 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..11826aa0b360d 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -290,19 +290,19 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dword s5, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshl_b32 s0, s5, 4 +; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], 
s[4:5] +; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -418,19 +418,19 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dword s5, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x10001 -; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshl_b32 s0, s5, 4 +; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] +; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -443,18 +443,18 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101 -; GCN-NEXT: s_lshl_b32 s6, s6, 3 -; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, 
s2 +; GCN-NEXT: s_lshl_b32 s4, s4, 3 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_xor_b32 s1, s3, 0x1010101 +; GCN-NEXT: s_xor_b32 s0, s2, 0x1010101 +; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 44bd4090436ef..349806394244e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1573,13 +1573,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s0, 0x50005 +; VI-NEXT: s_lshl_b32 s0, s8, 4 +; VI-NEXT: s_mov_b32 s8, 0x50005 +; VI-NEXT: s_mov_b32 s9, s8 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_mov_b32 s1, s0 -; VI-NEXT: s_lshl_b32 s8, s8, 4 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; VI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8 -; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; VI-NEXT: s_xor_b64 s[8:9], s[2:3], s[8:9] +; VI-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index a2da8876472ab..41b5103b38e50 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; 
VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 4 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 -; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i16_arg: @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index ab0000f6831b6..23c5a079c5c6e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -139,20 +139,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; 
SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -177,39 +183,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -258,20 +276,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; 
SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -296,39 +320,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: 
v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -5419,58 
+5455,76 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; 
HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: 
v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -5519,58 +5573,76 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: 
test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; 
VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index ac356fad5b2da..9ea8771506aa2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; 
VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 +; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 -; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: 
v_sub_f32_e32 v0, v0, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 +; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index d12ebe49814d8..268e1e25f766f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 
v0, 0x40549000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x40549000 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 +; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 -; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x40549000, v6 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 
vcc, s3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x40549000, v8 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 +; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp10_v2f32: diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 883db20a867b3..e5266c8c60f23 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -225,26 +225,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v2, s2, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_cselect_b32 s3, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3 -; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec -; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v3, v4, s0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 
+; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index b5038c8f606ab..8763ed5356c34 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3f317000, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7 +; 
VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v3 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 -; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 -; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 
v3, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 7465b492d75ea..44c28e4c71675 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3e9a2000, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7 +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v3 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 -; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 -; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-SDAG-NEXT: 
v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log10_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 0854134be6f46..54765dfb8ba9e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -285,16 +285,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 ; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 0458a64991028..11f56f3d3c8a1 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll 
+++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -1067,117 +1067,117 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_ashr_i32 s6, s0, 16 -; CI-NEXT: s_ashr_i32 s7, s1, 16 -; CI-NEXT: s_sext_i32_i16 s0, s0 -; CI-NEXT: s_sext_i32_i16 s1, s1 -; CI-NEXT: s_ashr_i32 s8, s2, 16 -; CI-NEXT: s_ashr_i32 s9, s3, 16 -; CI-NEXT: s_sext_i32_i16 s2, s2 -; CI-NEXT: s_sext_i32_i16 s3, s3 -; CI-NEXT: s_min_i32 s7, s7, s9 -; CI-NEXT: s_min_i32 s1, s1, s3 -; CI-NEXT: s_min_i32 s3, s6, s8 -; CI-NEXT: s_min_i32 s0, s0, s2 -; CI-NEXT: s_lshl_b32 s7, s7, 16 -; CI-NEXT: s_and_b32 s1, s1, 0xffff -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: s_and_b32 s0, s0, 0xffff -; CI-NEXT: s_or_b32 s1, s1, s7 -; CI-NEXT: s_or_b32 s0, s0, s3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_ashr_i32 s0, s2, 16 +; CI-NEXT: s_ashr_i32 s1, s3, 16 +; CI-NEXT: s_sext_i32_i16 s2, s2 +; CI-NEXT: s_sext_i32_i16 s3, s3 +; CI-NEXT: s_ashr_i32 s6, s4, 16 +; CI-NEXT: s_ashr_i32 s7, s5, 16 +; CI-NEXT: s_sext_i32_i16 s4, s4 +; CI-NEXT: s_sext_i32_i16 s5, s5 +; CI-NEXT: s_min_i32 s1, s1, s7 +; CI-NEXT: s_min_i32 s3, s3, s5 +; CI-NEXT: s_min_i32 s0, s0, s6 +; CI-NEXT: s_min_i32 s2, s2, s4 +; CI-NEXT: s_lshl_b32 s1, s1, 16 +; CI-NEXT: s_and_b32 s3, s3, 0xffff +; CI-NEXT: s_lshl_b32 s0, s0, 16 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s1, s3, s1 +; CI-NEXT: s_or_b32 s0, s2, s0 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, 
s1 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s3, 16 -; VI-NEXT: s_ashr_i32 s7, s1, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_ashr_i32 s0, s5, 16 +; VI-NEXT: s_ashr_i32 s1, s3, 16 +; VI-NEXT: s_min_i32 s0, s1, s0 +; VI-NEXT: s_sext_i32_i16 s1, s5 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_min_i32 s6, s7, s6 -; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_min_i32 s1, s3, s1 +; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s1, s1, s6 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_ashr_i32 s1, s4, 16 ; VI-NEXT: s_ashr_i32 s3, s2, 16 -; VI-NEXT: s_ashr_i32 s6, s0, 16 +; VI-NEXT: s_min_i32 s1, s3, s1 +; VI-NEXT: s_sext_i32_i16 s3, s4 ; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_min_i32 s3, s6, s3 -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_min_i32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s1, s2, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 -; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_pk_min_i16 v1, s3, v0 +; GFX9-NEXT: v_pk_min_i16 v0, s2, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 -; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: v_pk_min_i16 v1, s3, s5 +; GFX10-NEXT: v_pk_min_i16 v0, s2, s4 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_min_i16 v1, s1, s3 -; GFX11-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_pk_min_i16 v1, s3, s5 +; GFX11-NEXT: v_pk_min_i16 v0, s2, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imin_sle_v4i16: ; GFX1250: ; %bb.0: 
; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3 -; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1250-NEXT: v_pk_min_i16 v1, s3, s7 +; GFX1250-NEXT: v_pk_min_i16 v0, s2, s6 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b @@ -1624,91 +1624,91 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_min_i32 s1, s1, s3 -; CI-NEXT: s_min_i32 s0, s0, s2 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_min_i32 s0, s3, s5 +; CI-NEXT: s_min_i32 s1, s2, s4 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_min_i32 s0, s3, s5 +; VI-NEXT: s_min_i32 s1, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_min_i32 s1, s1, s3 -; GFX9-NEXT: s_min_i32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_min_i32 s3, s3, s5 +; GFX9-NEXT: s_min_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_min_i32 s0, s0, s2 -; GFX10-NEXT: s_min_i32 s1, s1, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_min_i32 s2, s2, s4 +; GFX10-NEXT: s_min_i32 s3, s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s0, s0, s2 -; GFX11-NEXT: s_min_i32 s1, s1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_min_i32 s2, s2, s4 +; GFX11-NEXT: s_min_i32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imin_slt_v2i32: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_min_i32 s0, s0, s2 -; GFX1250-NEXT: s_min_i32 s1, s1, s3 -; GFX1250-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1250-NEXT: s_min_i32 s2, s2, s6 +; GFX1250-NEXT: s_min_i32 s3, s3, s7 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s3 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 78207c2cf605e..f3fb792f141ce 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -75,18 +75,31 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; 
PACKED-NEXT: s_endpgm ; -; GFX1250-LABEL: fadd_v2_vs: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset -; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-SDAG-LABEL: fadd_v2_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fadd_v2_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1332,18 +1345,31 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) 
%a, <2 x float> %x) { ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm ; -; GFX1250-LABEL: fmul_v2_vs: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset -; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-SDAG-LABEL: fmul_v2_vs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: fmul_v2_vs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ 
-3466,8 +3492,8 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX900-LABEL: fadd_fadd_fsub: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, s3 ; GFX900-NEXT: v_add_f32_e32 v0, s1, v0 @@ -3475,14 +3501,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX900-NEXT: v_add_f32_e32 v3, s2, v0 ; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1 ; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-NEXT: s_endpgm ; ; PACKED-SDAG-LABEL: fadd_fadd_fsub: ; PACKED-SDAG: ; %bb.0: ; %bb ; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0 @@ -3490,7 +3516,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0 ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] -; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; PACKED-SDAG-NEXT: s_endpgm ; ; GFX90A-GISEL-LABEL: fadd_fadd_fsub: diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 0a1d15bf945f9..9df798eafec3a 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -94,62 +94,64 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; 
SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_sub_i32 s2, 32, s2 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_sub_i32 s0, 32, s9 +; SI-NEXT: s_sub_i32 s1, 32, s8 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_alignbit_b32 v1, s3, s3, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 32, s2 -; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sub_i32 s1, 32, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v2 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: s_sub_i32 s2, 32, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_sub_i32 s4, 32, s7 +; GFX10-NEXT: s_sub_i32 s5, 32, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s4 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s5 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s3, 32, s3 -; GFX11-NEXT: s_sub_i32 s2, 32, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sub_i32 s4, 32, s7 +; GFX11-NEXT: s_sub_i32 s5, 32, s6 +; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s4 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %0 = shl <2 x i32> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 403a556688091..4a730eb2580df 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -83,54 +83,56 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v1, s3, s3, v0 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2 +; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v4 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_alignbit_b32 
v1, s3, s3, s7 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s6 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s5 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %tmp0 = sub <2 x i32> , %y diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll index f14a5cc19774d..401b6f20d3405 100644 --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -56,8 +56,8 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 { } ; SI-LABEL: {{^}}s_addk_v2i32_k0: -; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI: s_endpgm ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 47998767a948c..be10302c42854 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -331,39 +331,39 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s7, s2, 16 -; VI-NEXT: s_sub_i32 s7, 0, s7 -; 
VI-NEXT: s_sub_i32 s4, 0, s3 -; VI-NEXT: s_lshr_b32 s6, s3, 16 -; VI-NEXT: s_ashr_i32 s8, s2, 16 -; VI-NEXT: s_sext_i32_i16 s7, s7 -; VI-NEXT: s_sub_i32 s5, 0, s2 -; VI-NEXT: s_sub_i32 s6, 0, s6 -; VI-NEXT: s_max_i32 s7, s8, s7 -; VI-NEXT: s_ashr_i32 s8, s3, 16 -; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_sub_i32 s1, 0, s2 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_i32 s0, 0, s3 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_sub_i32 s5, 0, s5 +; VI-NEXT: s_ashr_i32 s6, s2, 16 +; VI-NEXT: s_sext_i32_i16 s1, s1 ; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_max_i32 s3, s3, s4 -; VI-NEXT: s_max_i32 s6, s8, s6 -; VI-NEXT: s_max_i32 s2, s2, s5 -; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s4, s6, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_add_i32 s2, s2, 2 -; VI-NEXT: s_or_b32 s3, s4, s3 -; VI-NEXT: s_lshl_b32 s4, s7, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s2, s4, s2 -; VI-NEXT: s_add_i32 s3, s3, 0x20000 -; VI-NEXT: s_add_i32 s2, s2, 0x20000 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_sub_i32 s4, 0, s4 +; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: s_max_i32 s1, s2, s1 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_sext_i32_i16 s2, s3 +; VI-NEXT: s_max_i32 s5, s6, s5 +; VI-NEXT: s_ashr_i32 s6, s3, 16 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_max_i32 s0, s2, s0 +; VI-NEXT: s_max_i32 s4, s6, s4 +; VI-NEXT: s_add_i32 s0, s0, 2 +; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_add_i32 s1, s1, 2 +; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: s_lshl_b32 s2, s5, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s1, s2, s1 +; VI-NEXT: s_add_i32 s0, s0, 0x20000 +; 
VI-NEXT: s_add_i32 s1, s1, 0x20000 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_abs_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll index 9b3b52012f327..d8f7f8d7fefcc 100644 --- a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll @@ -136,7 +136,8 @@ define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) { ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; CHECK-NEXT: s_endpgm store <2 x i32> %v, ptr addrspace(4) %p @@ -163,7 +164,8 @@ define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; CHECK-NEXT: s_endpgm store <2 x float> %v, ptr addrspace(4) %p diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index eaab3531824c4..fc42f476fe7d0 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -164,98 +164,102 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s6, s6, s2 -; GFX6-NEXT: s_sub_i32 s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s2, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s4, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; 
GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s9 +; GFX6-NEXT: s_sub_i32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX8-NEXT: s_sub_i32 s6, 0, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8-NEXT: s_mul_i32 s6, s6, s2 -; GFX8-NEXT: s_sub_i32 s0, s0, s6 -; GFX8-NEXT: s_sub_i32 s6, s0, s2 -; GFX8-NEXT: s_cmp_ge_u32 s0, s2 -; GFX8-NEXT: s_cselect_b32 s0, 
s6, s0 -; GFX8-NEXT: s_sub_i32 s6, s0, s2 -; GFX8-NEXT: s_cmp_ge_u32 s0, s2 -; GFX8-NEXT: s_cselect_b32 s0, s6, s0 -; GFX8-NEXT: s_sub_i32 s2, 0, s3 -; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_mul_i32 s4, s4, s6 +; GFX8-NEXT: s_sub_i32 s2, s2, s4 +; GFX8-NEXT: s_sub_i32 s4, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s4, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s4, 0, s7 +; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: s_mul_i32 s0, s0, s3 -; GFX8-NEXT: s_sub_i32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s3 -; GFX8-NEXT: s_cmp_ge_u32 s0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 +; GFX8-NEXT: s_mul_i32 s0, s0, s7 +; GFX8-NEXT: s_sub_i32 s0, s3, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s7 +; GFX8-NEXT: s_cmp_ge_u32 s0, s7 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s3 -; GFX8-NEXT: s_cmp_ge_u32 s0, s3 +; GFX8-NEXT: s_sub_i32 s1, s0, s7 +; GFX8-NEXT: s_cmp_ge_u32 s0, s7 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm %result0 = udiv <2 x i32> %x, %y store <2 x i32> %result0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index b31cc36a5f7c6..a3309481fb941 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -340,8 +340,8 @@ define 
amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll index c53f4b6d7ff2b..8d811d82efed3 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll @@ -45,13 +45,17 @@ define void @merge_f32_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2 ; CHECK-LABEL: define void @merge_f32_v2f16_type( ; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0 -; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1 -; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-NEXT: [[LOAD1_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[LOAD2_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[LOAD1_TOORIG:%.*]] = bitcast i32 [[LOAD1_MUT1]] to float +; CHECK-NEXT: [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT2]] to <2 x half> ; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 -; CHECK-NEXT: store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4 -; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store <2 x half> 
[[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4 +; CHECK-NEXT: [[LOAD1_BC:%.*]] = bitcast float [[LOAD1_TOORIG]] to i32 +; CHECK-NEXT: [[LOAD2_BC:%.*]] = bitcast <2 x half> [[LOAD2_TOORIG]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD1_BC]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD2_BC]], i32 1 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP1]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0 @@ -112,3 +116,24 @@ define void @no_merge_mixed_ptr_addrspaces(ptr addrspace(1) %ptr1, ptr addrspace store ptr addrspace(2) %load2, ptr addrspace(2) %store.gep2, align 4 ret void } + +; Stores in this test should not be vectorized as the total byte span +; from the end of %gep.a to the end of %gep.b is not a power of 2. This +; is a necessary condition for splitChainByAlignment. +define void @check_contiguity_of_base_ptrs(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define void @check_contiguity_of_base_ptrs( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) { +; CHECK-NEXT: store i32 274, ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 4 +; CHECK-NEXT: store i64 3610770474484254748, ptr addrspace(1) [[GEP_A]], align 8 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 12 +; CHECK-NEXT: store <2 x i32> , ptr addrspace(1) [[GEP_B]], align 4 +; CHECK-NEXT: ret void +; + store i32 274, ptr addrspace(1) %ptr, align 4 + %gep.a = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 4 + store i64 3610770474484254748, ptr addrspace(1) %gep.a, align 8 + %gep.b = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 12 + store <2 x i32> , ptr addrspace(1) %gep.b, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll index d6b51039d5b44..40d4fd40a0bcc 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -130,21 +130,39 @@ entry: ret void } -; Ideally this would be merged define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 { ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16( ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4 +; CHECK-NEXT: [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[LD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[LD_1_TOORIG:%.*]] = bitcast i32 [[LD_1_MUT2]] to <2 x i16> +; CHECK-NEXT: ret void +; +entry: + %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1 + + %ld.0 = load i32, ptr addrspace(1) %a + %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1 + + ret void +} + +define amdgpu_kernel void @no_merge_load_i32_v2i8(ptr addrspace(1) nocapture %a) #0 { +; CHECK-LABEL: define amdgpu_kernel void @no_merge_load_i32_v2i8( +; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1 ; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4 -; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4 +; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i8>, ptr addrspace(1) [[A_1]], align 2 ; CHECK-NEXT: ret void ; entry: %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1 %ld.0 = load i32, ptr addrspace(1) %a - %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1 + %ld.1 = load <2 x i8>, ptr addrspace(1) %a.1 ret void }