diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index d0af2d3d2e4c2..9acfd872e574b 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -431,7 +431,7 @@ LLVM_ABI void combineAAMetadata(Instruction *K, const Instruction *J); /// Copy the metadata from the source instruction to the destination (the /// replacement for the source instruction). -LLVM_ABI void copyMetadataForAccess(Instruction &Dest, Instruction &Source); +LLVM_ABI void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source); /// Patch the replacement so that it is not more restrictive than the value /// being replaced. It assumes that the replacement does not get moved from diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 98884c441096e..fdff21b6ef8df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1035,7 +1035,7 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl( LoadInst *NewLI = IRB.CreateAlignedLoad( LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset), Name + ".off." + Twine(ByteOffset)); - copyMetadataForAccess(*NewLI, OrigLI); + copyMetadataForLoad(*NewLI, OrigLI); NewLI->setAAMetadata( AANodes.adjustForAccess(ByteOffset, LoadableType, DL)); NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 3e04aeb675d2a..9491610190c10 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -415,7 +415,7 @@ void PointerReplacer::replace(Instruction *I) { LT->getAlign(), LT->getOrdering(), LT->getSyncScopeID()); NewI->takeName(LT); - copyMetadataForAccess(*NewI, *LT); + copyMetadataForLoad(*NewI, *LT); IC.InsertNewInstWith(NewI, LT->getIterator()); IC.replaceInstUsesWith(*LT, NewI); @@ -606,7 +606,7 @@ LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy, Builder.CreateAlignedLoad(NewTy, LI.getPointerOperand(), LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix); NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); - copyMetadataForAccess(*NewLoad, LI); + copyMetadataForLoad(*NewLoad, LI); return NewLoad; } diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index a7c322bfcb981..70afe833c9f47 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3272,7 +3272,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // Copy any metadata that is valid for the new load. This may require // conversion to a different kind of metadata, e.g. !nonnull might change // to !range or vice versa. - copyMetadataForAccess(*NewLI, LI); + copyMetadataForLoad(*NewLI, LI); // Do this after copyMetadataForLoad() to preserve the TBAA shift. if (AATags)
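All four call sites above change in lockstep with the renamed helper. As a reviewer aid, here is a minimal sketch of the shared pattern under the restored `copyMetadataForLoad(LoadInst &, const LoadInst &)` signature; the helper name `cloneLoadAtNewType` is illustrative and not part of this patch (InstCombine's `combineLoadToNewType` in the hunk above is the in-tree instance of it):

```cpp
// Sketch: clone a load at a different type and carry its metadata across.
static LoadInst *cloneLoadAtNewType(IRBuilder<> &Builder, LoadInst &LI,
                                    Type *NewTy, const Twine &Suffix) {
  LoadInst *NewLoad = Builder.CreateAlignedLoad(
      NewTy, LI.getPointerOperand(), LI.getAlign(), LI.isVolatile(),
      LI.getName() + Suffix);
  NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
  // Copies !tbaa, !alias.scope, !noalias, etc., and converts
  // !nonnull/!range to fit NewTy where a verbatim copy would be wrong.
  copyMetadataForLoad(*NewLoad, LI);
  return NewLoad;
}
```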
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index dec2e019333b9..a03cf6e953e35 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3100,70 +3100,54 @@ void llvm::combineAAMetadata(Instruction *K, const Instruction *J) { combineMetadata(K, J, /*DoesKMove=*/true, /*AAOnly=*/true); } -void llvm::copyMetadataForAccess(Instruction &DestI, Instruction &SourceI) { +void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) { SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - SourceI.getAllMetadata(MD); - MDBuilder MDB(DestI.getContext()); - Type *NewType = DestI.getType(); - - // Only needed for range metadata on loads. - const DataLayout *DL = nullptr; - const LoadInst *LSource = dyn_cast<LoadInst>(&SourceI); - if (LSource) - DL = &LSource->getDataLayout(); - + Source.getAllMetadata(MD); + MDBuilder MDB(Dest.getContext()); + Type *NewType = Dest.getType(); + const DataLayout &DL = Source.getDataLayout(); for (const auto &MDPair : MD) { unsigned ID = MDPair.first; MDNode *N = MDPair.second; - + // Note, essentially every kind of metadata should be preserved here! This + // routine is supposed to clone a load instruction changing *only its type*. + // The only metadata it makes sense to drop is metadata which is invalidated + // when the pointer type changes. This should essentially never be the case + // in LLVM, but we explicitly switch over only known metadata to be + // conservatively correct. If you are adding metadata to LLVM which pertains + // to loads, you almost certainly want to add it here. switch (ID) { - // Applies to both loads and stores as-is. case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: case LLVMContext::MD_prof: + case LLVMContext::MD_fpmath: case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_invariant_load: case LLVMContext::MD_alias_scope: case LLVMContext::MD_noalias: case LLVMContext::MD_nontemporal: + case LLVMContext::MD_mem_parallel_loop_access: case LLVMContext::MD_access_group: case LLVMContext::MD_noundef: case LLVMContext::MD_noalias_addrspace: - case LLVMContext::MD_mem_parallel_loop_access: - DestI.setMetadata(ID, N); - break; - - // Load-only metadata. - case LLVMContext::MD_fpmath: - case LLVMContext::MD_invariant_load: - if (isa<LoadInst>(DestI)) - DestI.setMetadata(ID, N); + // All of these directly apply. + Dest.setMetadata(ID, N); break; case LLVMContext::MD_nonnull: - if (auto *LDest = dyn_cast<LoadInst>(&DestI)) { - if (LSource) - copyNonnullMetadata(*LSource, N, *LDest); - } + copyNonnullMetadata(Source, N, Dest); break; case LLVMContext::MD_align: case LLVMContext::MD_dereferenceable: case LLVMContext::MD_dereferenceable_or_null: - // Applies to both loads and stores only if the new type is also a - // pointer. + // These only directly apply if the new type is also a pointer. if (NewType->isPointerTy()) - DestI.setMetadata(ID, N); + Dest.setMetadata(ID, N); break; case LLVMContext::MD_range: - if (auto *LDest = dyn_cast<LoadInst>(&DestI)) { - if (LSource && DL) - copyRangeMetadata(*DL, *LSource, N, *LDest); - } - break; - - case LLVMContext::MD_tbaa: - if (isa<LoadInst>(DestI)) - DestI.setMetadata(ID, N); + copyRangeMetadata(DL, Source, N, Dest); break; } }
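The two cases in the switch that cannot copy their node verbatim are `MD_nonnull` and `MD_range`. A simplified sketch of the conversion the existing `copyNonnullMetadata` helper performs, assuming null is the zero bit pattern (the in-tree helper derives the null value from the pointer type instead of assuming it):

```cpp
// Sketch: !nonnull on a pointer load survives retyping to an integer load
// as the wrapped range [1, 0), i.e. "any value except zero".
void copyNonnullSketch(const LoadInst &OldLI, MDNode *N, LoadInst &NewLI) {
  if (NewLI.getType()->isPointerTy()) {
    NewLI.setMetadata(LLVMContext::MD_nonnull, N); // still a pointer: copy.
    return;
  }
  if (auto *ITy = dyn_cast<IntegerType>(NewLI.getType())) {
    unsigned BW = ITy->getBitWidth();
    MDBuilder MDB(NewLI.getContext());
    NewLI.setMetadata(LLVMContext::MD_range,
                      MDB.createRange(APInt(BW, 1),   // lower, inclusive
                                      APInt(BW, 0))); // upper, exclusive
  }
}
```

`copyRangeMetadata` performs the reverse translation, which is why it needs the `DataLayout` that the narrowed signature can now take directly from `Source`.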
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 114df653bad83..c28314f6ab124 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -112,7 +112,6 @@ #include #include #include -#include #include #include @@ -269,6 +268,11 @@ class Vectorizer { /// isGuaranteedToTransferExecutionToSuccessor(I) == true. bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End); + /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores + /// in the same BB with the same value for getUnderlyingObject() etc. + bool runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef<Instruction *> EqClass); + /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class /// where all instructions access a known, constant offset from the first /// instruction. @@ -334,22 +338,12 @@ class Vectorizer { EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin, BasicBlock::iterator End); - /// Inserts a cast instruction to convert Inst to DstTy. - Value *insertCast(Value *Val, Type *DstTy); - /// Partitions Instrs into "chains" where every instruction has a known /// constant offset from the first instr in the chain. /// /// Postcondition: For all i, ret[i][0].second == 0, because the first instr /// in the chain is the leader, and an instr touches distance 0 from itself. std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs); - - // Helpers for chain merging. - std::optional<APInt> computeLeaderDelta(Instruction *I1, Instruction *I2); - bool chainsOverlapAfterRebase(const Chain &A, const Chain &B, - const APInt &Delta) const; - static void rebaseChain(Chain &C, const APInt &Delta); - void normalizeChainToType(Chain &C, Type *CastTy); }; class LoadStoreVectorizerLegacyPass : public FunctionPass { @@ -431,20 +425,6 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, return Changed ? PA : PreservedAnalyses::all(); } -static const Value *getUnderlyingObject(const Value *Ptr) { - const Value *ObjPtr = llvm::getUnderlyingObject(Ptr); - if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { - // The select's themselves are distinct instructions even if they share - // the same condition and evaluate to consecutive pointers for true and - // false values of the condition. Therefore using the select's themselves - // for grouping instructions would put consecutive accesses into different - // lists and they won't be even checked for being consecutive, and won't - // be vectorized. - return Sel->getCondition(); - } - return ObjPtr; -} - bool Vectorizer::run() { bool Changed = false; // Break up the BB if there are any instrs which aren't guaranteed to transfer
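The file-scope `getUnderlyingObject` wrapper deleted here is not lost; it returns further down as the `GetUnderlyingObject` lambda inside `collectEquivalenceClasses`. A hypothetical IR fragment (names invented) showing why the wrapper keys on the select condition rather than on the selects themselves:

```cpp
// Hypothetical IR: %a and %b are distinct select instructions, yet for
// either value of %cond they address adjacent i32s:
//   %a = select i1 %cond, ptr %p, ptr %q
//   %b = select i1 %cond, ptr %p.4, ptr %q.4   ; %p.4 = %p+4, %q.4 = %q+4
//   %x = load i32, ptr %a
//   %y = load i32, ptr %b
// Keying on %cond puts both loads in one equivalence class, so their
// offsets actually get compared:
const Value *selectAwareKey(const Value *Ptr) {
  const Value *Obj = llvm::getUnderlyingObject(Ptr);
  if (const auto *Sel = llvm::dyn_cast<llvm::SelectInst>(Obj))
    return Sel->getCondition(); // both %a and %b map to %cond
  return Obj;
}
```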
@@ -488,88 +468,6 @@ bool Vectorizer::run() { return Changed; } -Value *Vectorizer::insertCast(Value *Val, Type *DstTy) { - if (DL.getTypeSizeInBits(Val->getType()) == DL.getTypeSizeInBits(DstTy)) { - return Builder.CreateBitOrPointerCast(Val, DstTy, Val->getName() + ".bc"); - } - - // If the types are of different sizes and both are integers, we can use - // zext or sext to cast. - if (Val->getType()->isIntegerTy() && DstTy->isIntegerTy()) { - if (DL.getTypeSizeInBits(Val->getType()) < DL.getTypeSizeInBits(DstTy)) { - return Builder.CreateZExt(Val, DstTy, Val->getName() + ".bc"); - } - return Builder.CreateTrunc(Val, DstTy, Val->getName() + ".bc"); - } - - return nullptr; -} - -std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1, - Instruction *I2) { - assert(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || - (isa<StoreInst>(I1) && isa<StoreInst>(I2))) && - "computeLeaderDelta must be called with two load or two store " - "instructions"); - Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1; - const Value *Ptr1 = getLoadStorePointerOperand(I1); - const Value *Ptr2 = getLoadStorePointerOperand(I2); - return getConstantOffset(const_cast<Value *>(Ptr1), const_cast<Value *>(Ptr2), - CtxInst); -} - -bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B, - const APInt &Delta) const { - ConstantRange ARange( - A.front().OffsetFromLeader, - A.back().OffsetFromLeader + - DL.getTypeStoreSize(getLoadStoreType(A.back().Inst))); - ConstantRange BRange( - B.front().OffsetFromLeader + Delta, - B.back().OffsetFromLeader + Delta + - DL.getTypeStoreSize(getLoadStoreType(B.back().Inst))); - return !ARange.intersectWith(BRange).isEmptySet(); -} - -void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) { - for (ChainElem &E : C) - E.OffsetFromLeader += Delta; -} - -void Vectorizer::normalizeChainToType(Chain &C, Type *CastTy) { - for (ChainElem &Elem : C) { - Instruction *Inst = Elem.Inst; - Type *OrigValTy = getLoadStoreType(Inst); - if (OrigValTy == CastTy) - continue; - - if (auto *LI = dyn_cast<LoadInst>(Inst)) { - Builder.SetInsertPoint(LI); - LoadInst *NewLoad = Builder.CreateLoad(CastTy, LI->getPointerOperand(), - LI->getName() + ".mut"); - copyMetadataForAccess(*NewLoad, *LI); - Value *CastBack = insertCast(NewLoad, OrigValTy); - if (!CastBack) - llvm_unreachable("Failed to insert cast"); - LI->replaceAllUsesWith(CastBack); - ToErase.emplace_back(LI); - Elem.Inst = NewLoad; - } else if (auto *SI = dyn_cast<StoreInst>(Inst)) { - Builder.SetInsertPoint(SI); - Value *CastVal = insertCast(SI->getValueOperand(), CastTy); - if (!CastVal) - llvm_unreachable("Failed to insert cast"); - StoreInst *NewStore = - Builder.CreateStore(CastVal, SI->getPointerOperand()); - NewStore->setAlignment(SI->getAlign()); - NewStore->setVolatile(SI->isVolatile()); - copyMetadataForAccess(*NewStore, *SI); - ToErase.emplace_back(SI); - Elem.Inst = NewStore; - } - } -} - bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End) { LLVM_DEBUG({
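Of the helpers removed above, `chainsOverlapAfterRebase` carries the most logic: it treats each chain as a half-open byte interval in the leader's coordinate space and rejects merges whose intervals would collide. A standalone sketch of that interval test (illustrative function, not part of the patch; assumes the `APInt`s share a bit width and each range is non-degenerate, as chain offsets are):

```cpp
// Two half-open byte ranges [Start, End) overlap iff their ConstantRange
// intersection is non-empty.
bool byteRangesOverlap(const APInt &AStart, const APInt &AEnd,
                       const APInt &BStart, const APInt &BEnd) {
  ConstantRange A(AStart, AEnd);
  ConstantRange B(BStart, BEnd);
  return !A.intersectWith(B).isEmptySet();
}
```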
@@ -582,120 +480,49 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin, }); bool Changed = false; - SmallVector<Chain> ContiguousSubChains; - for (const auto &[EqClassKey, EqClass] : - collectEquivalenceClasses(Begin, End)) { - - LLVM_DEBUG({ - dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() - << " keyed on " << EqClassKey << ":\n"; - for (Instruction *I : EqClass) - dbgs() << " " << *I << "\n"; - }); - - for (Chain &C : gatherChains(EqClass)) { - - // Split up the chain into increasingly smaller chains, until we can - // finally vectorize the chains. - // - // (Don't be scared by the depth of the loop nest here. These operations - // are all at worst O(n lg n) in the number of instructions, and splitting - // chains doesn't change the number of instrs. So the whole loop nest is - // O(n lg n).) - for (auto &C : splitChainByMayAliasInstrs(C)) { - for (auto &C : splitChainByContiguity(C)) { - ContiguousSubChains.emplace_back(C); - } - } - } - } - - // Merge chains in reverse order, so that the first chain is the largest. - for (int I = ContiguousSubChains.size() - 1; I > 0; I--) { - Chain &C1 = ContiguousSubChains[I - 1]; - Chain &C2 = ContiguousSubChains[I]; + collectEquivalenceClasses(Begin, End)) + Changed |= runOnEquivalenceClass(EqClassKey, EqClass); - // If the scalar types of the chains are the same, we can merge them - // without inserting any casts. - if (getLoadStoreType(C1[0].Inst)->getScalarType() == - getLoadStoreType(C2[0].Inst)->getScalarType()) - continue; - - const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst); - const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst); - unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace(); - unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace(); - bool C1IsLoad = isa<LoadInst>(C1[0].Inst); - bool C2IsLoad = isa<LoadInst>(C2[0].Inst); - - // If the chains are mapped to different types, have distinct underlying - // pointer objects, or include both loads and stores, skip. - if (C1IsLoad != C2IsLoad || AS1 != AS2 || - ::getUnderlyingObject(C1Ptr) != ::getUnderlyingObject(C2Ptr)) - continue; - - // Compute constant offset between chain leaders; if unknown, skip. - std::optional<APInt> DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst); - if (!DeltaOpt) - continue; - - // Check that rebasing C2 into C1's coordinate space will not overlap C1. - if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt)) - continue; - - // Determine the common integer cast type for normalization and ensure total - // bitwidth matches across all elements of both chains. - Type *C1ElemTy = getLoadStoreType(C1[0].Inst); - unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy); - auto AllElemsMatchTotalBits = [&](const Chain &C) { - return llvm::all_of(C, [&](const ChainElem &E) { - return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits; - }); - }; - if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2)) - continue; + return Changed; +} - // Power-of-two span ensures we can form a legal, single vector access - // without padding or splitting. Many targets and cost models assume POT - // widths, and it guarantees an integral element count for the chosen - // VecElemTy. - APInt Sz = C2.front().OffsetFromLeader + - DL.getTypeStoreSize(getLoadStoreType(C2.front().Inst)) - - C1.back().OffsetFromLeader + *DeltaOpt; - if (!Sz.isPowerOf2()) - continue; +bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey, + ArrayRef<Instruction *> EqClass) { + bool Changed = false; - // Rebase C2's offsets into C1's coordinate space prior to merging and - // merge C2 into C1 by appending all elements of C2 to C1, then erase C2 - // from ContiguousSubChains. - rebaseChain(C2, *DeltaOpt); - C1.insert(C1.end(), C2.begin(), C2.end()); - ContiguousSubChains.erase(ContiguousSubChains.begin() + I); - - // Normalize the value operand/result type of each instruction in C1 to - // C1CastTy.
- Type *C1CastTy = - Type::getIntNTy(C1ElemTy->getContext(), DL.getTypeSizeInBits(C1ElemTy)); - normalizeChainToType(C1, C1CastTy); - } + LLVM_DEBUG({ + dbgs() << "LSV: Running on equivalence class of size " << EqClass.size() + << " keyed on " << EqClassKey << ":\n"; + for (Instruction *I : EqClass) + dbgs() << " " << *I << "\n"; + }); - for (auto &C : ContiguousSubChains) { - if (C.size() <= 1) - continue; - for (auto &AlignedSubChain : splitChainByAlignment(C)) - Changed |= vectorizeChain(AlignedSubChain); - } + std::vector<Chain> Chains = gatherChains(EqClass); + LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size() + << " nontrivial chains.\n";); + for (Chain &C : Chains) + Changed |= runOnChain(C); + return Changed; +} - // Erase all instructions scheduled for deletion in this pseudo-BB. - for (Instruction *I : ToErase) { - auto *PtrOperand = getLoadStorePointerOperand(I); - if (I->use_empty()) - I->eraseFromParent(); - RecursivelyDeleteTriviallyDeadInstructions(PtrOperand); - } - ToErase.clear(); +bool Vectorizer::runOnChain(Chain &C) { + LLVM_DEBUG({ + dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n"; + dumpChain(C); + }); + // Split up the chain into increasingly smaller chains, until we can finally + // vectorize the chains. + // + // (Don't be scared by the depth of the loop nest here. These operations are + // all at worst O(n lg n) in the number of instructions, and splitting chains + // doesn't change the number of instrs. So the whole loop nest is O(n lg n).) + bool Changed = false; + for (auto &C : splitChainByMayAliasInstrs(C)) + for (auto &C : splitChainByContiguity(C)) + for (auto &C : splitChainByAlignment(C)) + Changed |= vectorizeChain(C); return Changed; } @@ -756,7 +583,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) { LLVM_DEBUG( dbgs() << "LSV: Found intervening may-alias instrs; cannot merge " << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n"); - if (!NewChain.empty()) { + if (NewChain.size() > 1) { LLVM_DEBUG({ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; dumpChain(NewChain); @@ -768,7 +595,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) { NewChain = SmallVector<ChainElem, 1>({*ChainIt}); } } - if (!NewChain.empty()) { + if (NewChain.size() > 1) { LLVM_DEBUG({ dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n"; dumpChain(NewChain); @@ -833,6 +660,8 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) { PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd); } + // Filter out length-1 chains, these are uninteresting. + llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; }); return Ret; } @@ -852,7 +681,7 @@ Type *Vectorizer::getChainElemTy(const Chain &C) { if (any_of(C, [](const ChainElem &E) { return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy(); })) { - return IntegerType::getIntNTy( + return Type::getIntNTy( F.getContext(), DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType())); } @@ -1640,6 +1469,20 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, BasicBlock::iterator End) { EquivalenceClassMap Ret; + auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * { + const Value *ObjPtr = llvm::getUnderlyingObject(Ptr); + if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { + // The select's themselves are distinct instructions even if they share + // the same condition and evaluate to consecutive pointers for true and + // false values of the condition. Therefore using the select's themselves + // for grouping instructions would put consecutive accesses into different + // lists and they won't be even checked for being consecutive, and won't + // be vectorized. + return Sel->getCondition(); + } + return ObjPtr; + }; + for (Instruction &I : make_range(Begin, End)) { auto *LI = dyn_cast<LoadInst>(&I); auto *SI = dyn_cast<StoreInst>(&I); @@ -1687,7 +1530,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) continue; - Ret[{::getUnderlyingObject(Ptr), AS, + Ret[{GetUnderlyingObject(Ptr), AS, DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()), /*IsLoad=*/LI != nullptr}] .emplace_back(&I); @@ -1782,7 +1625,8 @@ std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) { Ret.reserve(Chains.size()); // Iterate over MRU rather than Chains so the order is deterministic. for (auto &E : MRU) - Ret.emplace_back(std::move(E.second)); + if (E.second.size() > 1) + Ret.emplace_back(std::move(E.second)); return Ret; }
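One last note on the restored pipeline before the regenerated tests below: accesses are bucketed by the `EqClassKey` indexed in `collectEquivalenceClasses`, so only plausibly-mergeable accesses are ever compared pairwise. An illustrative restatement of the four key components (field names assumed; the in-tree key is a tuple-like type):

```cpp
// Accesses disagreeing on any component can never form one vector access,
// so they land in different equivalence classes and are never compared.
struct EqClassKeySketch {
  const Value *UnderlyingObject; // per the select-aware lambda above
  unsigned AddrSpace;            // pointer address space
  unsigned ElementBits;          // scalar element width in bits
  bool IsLoad;                   // loads and stores never mix
};
```

The `size() > 1` filters added to `gatherChains` and `splitChainByContiguity` then drop singleton chains early, since a chain of one access can never be vectorized.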
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll index 7dd907e3c143f..fc236147f1238 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll @@ -20,5 +20,3 @@ define void @addi32(i32 %arg1, i32 %arg2) { store i32 %res, ptr addrspace(1) poison ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index a58766270561b..ebbeab94066d6 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -510,55 +510,53 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18 +; GFX908-NEXT: s_mov_b32 s12, 0 +; GFX908-NEXT: s_mov_b32 s9, s12 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 -; GFX908-NEXT: s_load_dword s5, s[8:9], 0x18 -; GFX908-NEXT: s_mov_b32 s4, 0 -; GFX908-NEXT: s_mov_b32 s9, s4 -; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX908-NEXT: s_sub_i32 s8, 0, s1 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s5 +; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX908-NEXT: s_sub_i32 s1, 0, s7 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 ; GFX908-NEXT: v_mov_b32_e32 v17, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX908-NEXT: v_readfirstlane_b32 s10, v0 -; GFX908-NEXT: s_mul_i32 s8, s8, s10 -; GFX908-NEXT: s_mul_hi_u32 s8, s10, s8 -; GFX908-NEXT: s_add_i32 s10, s10, s8 -; GFX908-NEXT: s_mul_hi_u32 s8, s0, s10 -; GFX908-NEXT: s_mul_i32 s10, s8, s1 -; GFX908-NEXT: s_sub_i32 s0, s0, s10 -; GFX908-NEXT: s_add_i32 s11, s8, 1 -; GFX908-NEXT: s_sub_i32 s10, s0, s1 -; GFX908-NEXT: s_cmp_ge_u32 s0, s1 -; GFX908-NEXT: s_cselect_b32 s8, s11, s8 -; GFX908-NEXT: s_cselect_b32 s0, s10, s0 -; GFX908-NEXT: s_add_i32
s10, s8, 1 -; GFX908-NEXT: s_cmp_ge_u32 s0, s1 -; GFX908-NEXT: s_cselect_b32 s8, s10, s8 -; GFX908-NEXT: s_lshr_b32 s5, s5, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[8:9], 5 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; GFX908-NEXT: v_readfirstlane_b32 s2, v0 +; GFX908-NEXT: s_mul_i32 s1, s1, s2 +; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1 +; GFX908-NEXT: s_add_i32 s2, s2, s1 +; GFX908-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX908-NEXT: s_mul_i32 s2, s1, s7 +; GFX908-NEXT: s_sub_i32 s2, s6, s2 +; GFX908-NEXT: s_add_i32 s3, s1, 1 +; GFX908-NEXT: s_sub_i32 s6, s2, s7 +; GFX908-NEXT: s_cmp_ge_u32 s2, s7 +; GFX908-NEXT: s_cselect_b32 s1, s3, s1 +; GFX908-NEXT: s_cselect_b32 s2, s6, s2 +; GFX908-NEXT: s_add_i32 s3, s1, 1 +; GFX908-NEXT: s_cmp_ge_u32 s2, s7 +; GFX908-NEXT: s_cselect_b32 s8, s3, s1 +; GFX908-NEXT: s_lshr_b32 s2, s0, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2 +; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_or_b32 s12, s12, 28 +; GFX908-NEXT: s_or_b32 s14, s14, 28 +; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s5, v16 -; GFX908-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX908-NEXT: s_mul_i32 s3, s3, s5 -; GFX908-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX908-NEXT: s_mul_i32 s2, s2, s5 -; GFX908-NEXT: s_add_i32 s3, s9, s3 -; GFX908-NEXT: s_lshl_b64 s[16:17], s[2:3], 5 +; GFX908-NEXT: v_readfirstlane_b32 s2, v16 +; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX908-NEXT: s_mul_i32 s3, s5, s2 +; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2 +; GFX908-NEXT: s_mul_i32 s2, s4, s2 +; GFX908-NEXT: s_add_i32 s3, s5, s3 +; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX908-NEXT: s_branch .LBB3_2 -; GFX908-NEXT: .LBB3_1: ; %Flow21 +; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 @@ -571,47 +569,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1 -; GFX908-NEXT: s_mov_b32 s5, s4 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX908-NEXT: s_mov_b32 s13, s12 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s12 ; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v7, s5 -; GFX908-NEXT: v_mov_b32_e32 v9, s5 -; GFX908-NEXT: v_mov_b32_e32 v5, s5 -; GFX908-NEXT: v_mov_b32_e32 v6, s4 -; GFX908-NEXT: v_mov_b32_e32 v8, s4 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s12 +; GFX908-NEXT: v_mov_b32_e32 v8, s12 +; GFX908-NEXT: v_mov_b32_e32 v5, s13 +; GFX908-NEXT: v_mov_b32_e32 v7, s13 +; GFX908-NEXT: v_mov_b32_e32 v9, s13 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[12:13] +; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s5, v2 -; GFX908-NEXT: v_readfirstlane_b32 s9, v3 -; GFX908-NEXT: s_add_u32 s5, s5, 1 -; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; 
GFX908-NEXT: s_mul_hi_u32 s22, s10, s5 -; GFX908-NEXT: s_mul_i32 s9, s10, s9 -; GFX908-NEXT: s_mul_i32 s23, s11, s5 -; GFX908-NEXT: s_add_i32 s9, s22, s9 -; GFX908-NEXT: s_mul_i32 s5, s10, s5 -; GFX908-NEXT: s_add_i32 s9, s9, s23 +; GFX908-NEXT: v_readfirstlane_b32 s9, v2 +; GFX908-NEXT: v_readfirstlane_b32 s13, v3 +; GFX908-NEXT: s_add_u32 s9, s9, 1 +; GFX908-NEXT: s_addc_u32 s13, s13, 0 +; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9 +; GFX908-NEXT: s_mul_i32 s13, s6, s13 +; GFX908-NEXT: s_mul_i32 s23, s7, s9 +; GFX908-NEXT: s_add_i32 s13, s22, s13 +; GFX908-NEXT: s_mul_i32 s9, s6, s9 +; GFX908-NEXT: s_add_i32 s13, s13, s23 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s16 +; GFX908-NEXT: s_add_u32 s20, s20, s4 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s17 +; GFX908-NEXT: s_addc_u32 s21, s21, s5 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s22, s20, s5 -; GFX908-NEXT: s_addc_u32 s23, s21, s9 +; GFX908-NEXT: s_add_u32 s22, s20, s9 +; GFX908-NEXT: s_addc_u32 s23, s21, s13 ; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc @@ -659,17 +657,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX908-NEXT: .LBB3_10: ; %Flow20 +; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_mov_b64 s[2:3], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s6, s6, s8 -; GFX908-NEXT: s_addc_u32 s7, s7, 0 -; GFX908-NEXT: s_add_u32 s12, s12, s14 -; GFX908-NEXT: s_addc_u32 s13, s13, s15 +; GFX908-NEXT: s_add_u32 s10, s10, s8 +; GFX908-NEXT: s_addc_u32 s11, s11, 0 +; GFX908-NEXT: s_add_u32 s14, s14, s16 +; GFX908-NEXT: s_addc_u32 s15, s15, s17 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock @@ -678,54 +676,52 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 +; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18 +; GFX90A-NEXT: s_mov_b32 s12, 0 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 -; GFX90A-NEXT: s_load_dword s5, s[8:9], 0x18 -; GFX90A-NEXT: s_mov_b32 s4, 0 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX90A-NEXT: s_sub_i32 s8, 0, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; 
GFX90A-NEXT: s_sub_i32 s1, 0, s7 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s5 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 -; GFX90A-NEXT: s_mul_i32 s8, s8, s10 -; GFX90A-NEXT: s_mul_hi_u32 s8, s10, s8 -; GFX90A-NEXT: s_add_i32 s10, s10, s8 -; GFX90A-NEXT: s_mul_hi_u32 s8, s0, s10 -; GFX90A-NEXT: s_mul_i32 s10, s8, s1 -; GFX90A-NEXT: s_sub_i32 s0, s0, s10 -; GFX90A-NEXT: s_add_i32 s11, s8, 1 -; GFX90A-NEXT: s_sub_i32 s10, s0, s1 -; GFX90A-NEXT: s_cmp_ge_u32 s0, s1 -; GFX90A-NEXT: s_cselect_b32 s8, s11, s8 -; GFX90A-NEXT: s_cselect_b32 s0, s10, s0 -; GFX90A-NEXT: s_add_i32 s10, s8, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s0, s1 -; GFX90A-NEXT: s_cselect_b32 s8, s10, s8 -; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[8:9], 5 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 +; GFX90A-NEXT: s_mul_i32 s1, s1, s2 +; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1 +; GFX90A-NEXT: s_add_i32 s2, s2, s1 +; GFX90A-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX90A-NEXT: s_mul_i32 s2, s1, s7 +; GFX90A-NEXT: s_sub_i32 s2, s6, s2 +; GFX90A-NEXT: s_add_i32 s3, s1, 1 +; GFX90A-NEXT: s_sub_i32 s6, s2, s7 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 +; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 +; GFX90A-NEXT: s_cselect_b32 s2, s6, s2 +; GFX90A-NEXT: s_add_i32 s3, s1, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 +; GFX90A-NEXT: s_cselect_b32 s8, s3, s1 +; GFX90A-NEXT: s_lshr_b32 s2, s0, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_or_b32 s12, s12, 28 +; GFX90A-NEXT: s_or_b32 s14, s14, 28 +; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 -; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX90A-NEXT: s_mul_i32 s3, s3, s5 -; GFX90A-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX90A-NEXT: s_mul_i32 s2, s2, s5 -; GFX90A-NEXT: s_add_i32 s3, s9, s3 -; GFX90A-NEXT: s_lshl_b64 s[16:17], s[2:3], 5 +; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 +; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX90A-NEXT: s_mul_i32 s3, s5, s2 +; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2 +; GFX90A-NEXT: s_mul_i32 s2, s4, s2 +; GFX90A-NEXT: s_add_i32 s3, s5, s3 +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX90A-NEXT: s_branch .LBB3_2 -; GFX90A-NEXT: .LBB3_1: ; %Flow21 +; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 @@ -738,34 +734,34 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1 -; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX90A-NEXT: s_mov_b32 s13, s12 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 
v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[12:13] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 +; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 -; GFX90A-NEXT: s_add_u32 s5, s5, 1 -; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s22, s10, s5 -; GFX90A-NEXT: s_mul_i32 s9, s10, s9 -; GFX90A-NEXT: s_mul_i32 s23, s11, s5 -; GFX90A-NEXT: s_add_i32 s9, s22, s9 -; GFX90A-NEXT: s_mul_i32 s5, s10, s5 -; GFX90A-NEXT: s_add_i32 s9, s9, s23 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s13, v5 +; GFX90A-NEXT: s_add_u32 s9, s9, 1 +; GFX90A-NEXT: s_addc_u32 s13, s13, 0 +; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9 +; GFX90A-NEXT: s_mul_i32 s13, s6, s13 +; GFX90A-NEXT: s_mul_i32 s23, s7, s9 +; GFX90A-NEXT: s_add_i32 s13, s22, s13 +; GFX90A-NEXT: s_mul_i32 s9, s6, s9 +; GFX90A-NEXT: s_add_i32 s13, s13, s23 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s16 -; GFX90A-NEXT: s_addc_u32 s21, s21, s17 +; GFX90A-NEXT: s_add_u32 s20, s20, s4 +; GFX90A-NEXT: s_addc_u32 s21, s21, s5 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] @@ -773,8 +769,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s22, s20, s5 -; GFX90A-NEXT: s_addc_u32 s23, s21, s9 +; GFX90A-NEXT: s_add_u32 s22, s20, s9 +; GFX90A-NEXT: s_addc_u32 s23, s21, s13 ; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -815,17 +811,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX90A-NEXT: .LBB3_10: ; %Flow20 +; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_mov_b64 s[2:3], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s6, s6, s8 -; GFX90A-NEXT: s_addc_u32 s7, s7, 0 -; GFX90A-NEXT: s_add_u32 s12, s12, s14 -; GFX90A-NEXT: s_addc_u32 s13, s13, s15 +; GFX90A-NEXT: s_add_u32 s10, s10, s8 +; GFX90A-NEXT: s_addc_u32 s11, s11, 0 +; GFX90A-NEXT: s_add_u32 s14, s14, s16 +; GFX90A-NEXT: s_addc_u32 s15, s15, s17 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 08ce28c12118b..df77e7de43bf6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2543,45 +2543,44 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_and_b32 s1, s6, 0xffff +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_lshr_b32 s4, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_and_b32 s4, s9, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s9, 16 +; GFX6-NEXT: s_lshr_b32 s4, s11, 16 ; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_lshr_b32 s4, s7, 16 +; GFX6-NEXT: s_lshr_b32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -2597,7 +2596,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2605,43 +2603,42 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: s_and_b32 s5, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_and_b32 s7, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_and_b32 s6, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: s_and_b32 s2, s3, 0xffff +; GFX9-NEXT: s_and_b32 s0, s1, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX9-NEXT: s_lshr_b32 s2, s7, 16 +; GFX9-NEXT: s_lshr_b32 s0, s3, 16 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 +; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -2649,6 +2646,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -2657,7 +2655,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -2759,51 +2758,49 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_mov_b32 
s0, s4 -; GFX6-NEXT: s_and_b32 s1, s6, 0xffff +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_lshr_b32 s5, s6, 16 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 -; GFX6-NEXT: s_and_b32 s8, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 +; GFX6-NEXT: s_and_b32 s6, s11, 0xffff +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX6-NEXT: s_and_b32 s5, s9, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_lshr_b32 s4, s9, 16 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v1 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 +; GFX6-NEXT: s_lshr_b32 s4, s11, 16 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_lshr_b32 s5, s7, 16 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 +; GFX6-NEXT: s_lshr_b32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 @@ -2814,10 +2811,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -2829,67 +2826,67 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: s_and_b32 s5, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_and_b32 s9, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: s_and_b32 s8, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: s_and_b32 s4, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_and_b32 s5, s7, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s8, s3, 0xffff +; GFX9-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 ; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX9-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 -; GFX9-NEXT: v_sub_u32_e32 v4, s2, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_sub_u32_e32 v4, s0, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; GFX9-NEXT: 
global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = urem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -2999,64 +2996,62 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s7, s10 -; GFX6-NEXT: s_sext_i32_i16 s6, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GFX6-NEXT: s_xor_b32 s6, s7, s6 -; GFX6-NEXT: s_ashr_i32 s6, s6, 30 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s0, s8 -; GFX6-NEXT: s_or_b32 s8, s6, 1 -; GFX6-NEXT: s_mov_b32 s1, s9 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: s_cselect_b32 s6, s8, 0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: s_ashr_i32 s6, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_ashr_i32 s5, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s6, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_sext_i32_i16 s6, s5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_sext_i32_i16 s5, s11 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s11 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_ashr_i32 s5, s11, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s11, 16 +; GFX6-NEXT: s_ashr_i32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 @@ -3081,13 +3076,13 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s6 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-NEXT: s_sext_i32_i16 s5, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -3099,61 +3094,61 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s8, 0 -; GFX9-NEXT: s_ashr_i32 s5, s6, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: s_or_b32 s2, s2, 1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 +; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s4, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s2, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: s_ashr_i32 s4, s7, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 ; GFX9-NEXT: s_ashr_i32 s2, s3, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 +; GFX9-NEXT: s_ashr_i32 s0, s1, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 
v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3271,55 +3266,53 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s0, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX6-NEXT: s_sext_i32_i16 s1, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s10, s0, 1 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX6-NEXT: s_cselect_b32 s0, s10, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_ashr_i32 s4, s8, 16 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s10, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_ashr_i32 s5, s6, 16 +; GFX6-NEXT: s_ashr_i32 s5, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GFX6-NEXT: s_lshr_b32 s10, s6, 16 -; GFX6-NEXT: s_lshr_b32 s8, s8, 16 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_lshr_b32 s7, s10, 16 +; GFX6-NEXT: s_or_b32 s8, s4, 1 ; GFX6-NEXT: 
v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_cselect_b32 s4, s8, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s9 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s7 +; GFX6-NEXT: s_sext_i32_i16 s5, s9 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 @@ -3333,30 +3326,30 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s9, 16 +; GFX6-NEXT: s_ashr_i32 s4, s11, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_ashr_i32 s5, s7, 16 +; GFX6-NEXT: s_ashr_i32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX6-NEXT: s_lshr_b32 s6, s7, 16 +; GFX6-NEXT: s_lshr_b32 s6, s9, 16 +; GFX6-NEXT: s_lshr_b32 s7, s11, 16 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: s_lshr_b32 s8, s9, 16 -; GFX6-NEXT: s_or_b32 s9, s4, 1 +; GFX6-NEXT: s_or_b32 s8, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s9, 0 +; GFX6-NEXT: s_cselect_b32 s4, s8, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s8 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -3365,13 +3358,13 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s6 +; GFX9-NEXT: s_sext_i32_i16 s8, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s2 +; GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -3383,69 +3376,69 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s10, 0 -; GFX9-NEXT: s_ashr_i32 s10, s2, 16 -; GFX9-NEXT: s_ashr_i32 s2, s6, 16 +; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_ashr_i32 s0, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 
v0, s0 +; GFX9-NEXT: s_xor_b32 s2, s10, s0 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s4, s10, s2 -; GFX9-NEXT: s_ashr_i32 s4, s4, 30 -; GFX9-NEXT: s_or_b32 s6, s4, 1 +; GFX9-NEXT: s_or_b32 s2, s2, 1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: s_sext_i32_i16 s8, s1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s4, s6, 0 -; GFX9-NEXT: s_sext_i32_i16 s6, s7 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s6 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 -; GFX9-NEXT: s_sext_i32_i16 s8, s3 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v4 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_xor_b32 s0, s8, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: s_xor_b32 s2, s8, s6 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: s_ashr_i32 s4, s7, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s4 -; GFX9-NEXT: s_ashr_i32 s5, s3, 16 -; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: s_ashr_i32 s3, s3, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX9-NEXT: s_ashr_i32 s2, s1, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s2, s5, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: s_xor_b32 s0, s2, s3 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_or_b32 s6, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s8, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3841,48 +3834,46 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_and_b32 s1, s6, 0xffff +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_lshr_b32 s4, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_and_b32 s4, s9, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3890,47 +3881,48 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: s_and_b32 s5, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 
s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_and_b32 s7, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_and_b32 s6, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s2, s3, 0xffff +; GFX9-NEXT: s_and_b32 s0, s1, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v6, v0, s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_short v6, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v6, v0, s[6:7] ; GFX9-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4010,54 +4002,52 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s6, s10, 16 -; GFX6-NEXT: s_and_b32 s1, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_and_b32 s1, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s7, s4, 16 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GFX6-NEXT: s_mov_b32 s0, s8 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: s_lshr_b32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 
v3, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v4 -; GFX6-NEXT: s_mov_b32 s1, s9 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 +; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s5, 0xffff -; GFX6-NEXT: v_mad_f32 v2, -v1, v4, v3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_and_b32 s4, s11, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX6-NEXT: s_and_b32 s6, s11, 0xffff +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 +; GFX6-NEXT: s_and_b32 s6, s9, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v2, -v2, v3, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 +; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4065,34 +4055,33 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: s_and_b32 s5, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_and_b32 s9, s2, 0xffff ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s8, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: 
v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_and_b32 s5, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4101,17 +4090,18 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, s0, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[6:7] ; GFX9-NEXT: s_endpgm %r = urem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4197,47 +4187,46 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s7, s10 -; GFX6-NEXT: s_sext_i32_i16 s6, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GFX6-NEXT: s_xor_b32 s6, s7, s6 -; GFX6-NEXT: s_ashr_i32 s6, s6, 30 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s0, s8 -; GFX6-NEXT: s_or_b32 s8, s6, 1 -; GFX6-NEXT: s_sext_i32_i16 s5, s5 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: s_cselect_b32 s6, s8, 0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v2 -; GFX6-NEXT: 
s_ashr_i32 s6, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_ashr_i32 s5, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s6, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| +; GFX6-NEXT: s_sext_i32_i16 s5, s11 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s11 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 @@ -4250,7 +4239,6 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4261,13 +4249,13 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s6 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-NEXT: s_sext_i32_i16 s5, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -4279,44 +4267,44 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s8, 0 -; GFX9-NEXT: s_ashr_i32 s5, s6, 16 -; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s2, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; 
GFX9-NEXT: s_sext_i32_i16 s4, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[6:7] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4408,70 +4396,68 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s0, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX6-NEXT: s_sext_i32_i16 s1, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s10, s0, 1 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX6-NEXT: s_cselect_b32 s0, s10, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_ashr_i32 s4, s8, 16 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s10, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_ashr_i32 s5, s6, 16 +; GFX6-NEXT: s_ashr_i32 s5, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; 
GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GFX6-NEXT: s_lshr_b32 s10, s6, 16 -; GFX6-NEXT: s_lshr_b32 s8, s8, 16 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_lshr_b32 s7, s10, 16 +; GFX6-NEXT: s_or_b32 s8, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_cselect_b32 s4, s8, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s9 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s7 +; GFX6-NEXT: s_sext_i32_i16 s5, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: s_or_b32 s7, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_cselect_b32 s4, s7, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s9 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s10, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 @@ -4480,12 +4466,12 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s6 +; GFX9-NEXT: s_sext_i32_i16 s8, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s2 +; GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -4497,51 +4483,51 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s10, 0 -; GFX9-NEXT: s_ashr_i32 s10, s2, 16 -; GFX9-NEXT: s_ashr_i32 s2, s6, 16 +; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_ashr_i32 s0, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s2, s10, s0 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_xor_b32 s4, s10, s2 -; GFX9-NEXT: s_ashr_i32 s4, s4, 30 -; GFX9-NEXT: s_or_b32 s6, s4, 1 +; GFX9-NEXT: s_or_b32 s2, s2, 1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; 
GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s4, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v3 -; GFX9-NEXT: s_sext_i32_i16 s4, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: s_xor_b32 s2, s5, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s3, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 -; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[6:7] ; GFX9-NEXT: s_endpgm %r = srem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -5512,16 +5498,15 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_lshr_b32 s0, s2, 12 -; GFX6-NEXT: s_lshr_b32 s1, s3, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_lshr_b32 s5, s5, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: @@ -5555,19 +5540,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 12 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; 
GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: @@ -5662,31 +5646,29 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s0 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s2, 0, s0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 ; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s6, s1 +; GFX6-NEXT: s_sub_i32 s1, s8, s1 ; GFX6-NEXT: s_sub_i32 s3, s1, s0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 @@ -5701,10 +5683,10 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 ; GFX6-NEXT: s_mul_i32 s0, s0, s2 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 +; GFX6-NEXT: s_sub_i32 s0, s9, s0 ; GFX6-NEXT: s_sub_i32 s1, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2 @@ -5715,19 +5697,19 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; 
GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3
 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT: s_sub_i32 s4, 0, s7
 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -5739,37 +5721,37 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
 ; GFX9-NEXT: s_add_i32 s5, s5, s4
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5
 ; GFX9-NEXT: s_mul_i32 s5, s4, s7
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
+; GFX9-NEXT: s_sub_i32 s0, s0, s5
 ; GFX9-NEXT: s_add_i32 s9, s4, 1
-; GFX9-NEXT: s_sub_i32 s5, s2, s7
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
+; GFX9-NEXT: s_sub_i32 s5, s0, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
 ; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_cselect_b32 s0, s5, s0
 ; GFX9-NEXT: s_add_i32 s5, s4, 1
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s2, s5, s4
+; GFX9-NEXT: s_cselect_b32 s0, s5, s4
 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
 ; GFX9-NEXT: s_mul_i32 s4, s4, s8
 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
 ; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8
 ; GFX9-NEXT: s_mul_i32 s5, s4, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s5
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
 ; GFX9-NEXT: s_add_i32 s7, s4, 1
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
 ; GFX9-NEXT: s_add_i32 s5, s4, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT: s_endpgm
 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
 %r = udiv <2 x i32> %x, %shl.y
@@ -5908,16 +5890,15 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX6-LABEL: urem_v2i32_pow2k_denom:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_and_b32 s0, s2, 0xfff
-; GFX6-NEXT: s_and_b32 s1, s3, 0xfff
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: s_and_b32 s4, s4, 0xfff
+; GFX6-NEXT: s_and_b32 s5, s5, 0xfff
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
@@ -6004,67 +5985,64 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ;
 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 -; GFX6-NEXT: s_lshl_b32 s8, 0x1000, s1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s6 -; GFX6-NEXT: s_sub_i32 s2, s2, s4 -; GFX6-NEXT: s_sub_i32 s4, s2, s6 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 -; GFX6-NEXT: s_cselect_b32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s4, s2, s6 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 -; GFX6-NEXT: s_cselect_b32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s4, 0, s8 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s2 +; GFX6-NEXT: s_sub_i32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s2, 0, s3 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s3, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT: s_sub_i32 s4, 0, s7
 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -6076,33 +6054,33 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
 ; GFX9-NEXT: s_add_i32 s5, s5, s4
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5
 ; GFX9-NEXT: s_mul_i32 s4, s4, s7
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s4, s2, s7
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s4, s2, s7
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
+; GFX9-NEXT: s_sub_i32 s0, s0, s4
+; GFX9-NEXT: s_sub_i32 s4, s0, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s4, s0, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
 ; GFX9-NEXT: s_mul_i32 s4, s4, s8
 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
 ; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8
 ; GFX9-NEXT: s_mul_i32 s4, s4, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s4, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_sub_i32 s1, s1, s4
+; GFX9-NEXT: s_sub_i32 s4, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT: s_endpgm
 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
 %r = urem <2 x i32> %x, %shl.y
@@ -6291,22 +6269,21 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_ashr_i32 s0, s2, 31
-; GFX6-NEXT: s_ashr_i32 s1, s3, 31
-; GFX6-NEXT: s_lshr_b32 s0, s0, 20
-; GFX6-NEXT: s_lshr_b32 s1, s1, 20
-; GFX6-NEXT: s_add_i32 s0, s2, s0
-; GFX6-NEXT: s_add_i32 s1, s3, s1
-; GFX6-NEXT: s_ashr_i32 s0, s0, 12
-; GFX6-NEXT: s_ashr_i32 s1, s1, 12
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: s_ashr_i32 s6, s4, 31
+; GFX6-NEXT: s_ashr_i32 s7, s5, 31
+; GFX6-NEXT: s_lshr_b32 s6, s6, 20
+; GFX6-NEXT: s_lshr_b32 s7, s7, 20
+; GFX6-NEXT: s_add_i32 s4, s4, s6
+; GFX6-NEXT: s_add_i32 s5, s5, s7
+; GFX6-NEXT: s_ashr_i32 s4, s4, 12
+; GFX6-NEXT: s_ashr_i32 s5, s5, 12
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
@@ 
-6346,22 +6323,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 12 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: @@ -6477,138 +6453,136 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s8, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 -; GFX6-NEXT: s_lshl_b32 s9, 0x1000, s7 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX6-NEXT: s_abs_i32 s6, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s7, 0, s6 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 +; GFX6-NEXT: s_abs_i32 s7, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s2 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s4, s2 -; GFX6-NEXT: s_xor_b32 s2, s2, s6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: s_ashr_i32 s2, s2, 31 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_readfirstlane_b32 s5, v0 -; GFX6-NEXT: s_mul_i32 s5, s5, s8 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 -; GFX6-NEXT: s_sub_i32 s5, s4, s8 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s6 +; GFX6-NEXT: s_sub_i32 s2, s7, s2 +; GFX6-NEXT: s_sub_i32 s7, s2, s6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s4, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cselect_b32 s2, s7, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s4, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_abs_i32 s8, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX6-NEXT: s_mov_b32 s4, s0 -; 
GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_abs_i32 s2, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_xor_b32 s3, s1, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_abs_i32 s1, s3 -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v2 -; GFX6-NEXT: s_xor_b32 s0, s3, s9 -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: s_ashr_i32 s3, s3, 31 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v1 -; GFX6-NEXT: s_mul_i32 s2, s2, s8 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_sub_i32 s2, s1, s8 +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: s_mul_i32 s0, s0, s2 +; GFX6-NEXT: s_sub_i32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s1, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s1, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s8, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7 -; GFX9-NEXT: s_abs_i32 s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX9-NEXT: s_abs_i32 s6, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3 +; GFX9-NEXT: s_abs_i32 s3, s0 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_ashr_i32 s0, s0, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 -; GFX9-NEXT: s_mul_i32 s7, s6, s8 -; GFX9-NEXT: s_sub_i32 s5, s5, s7 -; GFX9-NEXT: s_add_i32 s9, s6, 1 -; GFX9-NEXT: s_sub_i32 s7, s5, s8 -; GFX9-NEXT: s_cmp_ge_u32 s5, s8 -; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s8 -; GFX9-NEXT: s_cselect_b32 s5, s7, s6 -; GFX9-NEXT: s_abs_i32 s6, s4 -; GFX9-NEXT: 
v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s2 -; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s2, s5, s2 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2 +; GFX9-NEXT: s_add_i32 s8, s8, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s8 +; GFX9-NEXT: s_mul_i32 s8, s2, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: s_add_i32 s9, s2, 1 +; GFX9-NEXT: s_sub_i32 s8, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s2, s9, s2 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s8, s2, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s6, s8, s2 +; GFX9-NEXT: s_abs_i32 s8, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_xor_b32 s5, s6, s0 +; GFX9-NEXT: s_sub_i32 s6, 0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s3, s4 -; GFX9-NEXT: s_abs_i32 s3, s3 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_sub_i32 s0, s5, s0 +; GFX9-NEXT: s_xor_b32 s4, s1, s7 +; GFX9-NEXT: s_abs_i32 s1, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s7, s7, s5 -; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 -; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_add_i32 s8, s5, 1 -; GFX9-NEXT: s_sub_i32 s7, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s5, s8, s5 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_mul_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5 +; GFX9-NEXT: s_mul_i32 s6, s5, s8 +; GFX9-NEXT: s_sub_i32 s1, s1, s6 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s3, s7, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_sub_i32 s6, s1, s8 +; GFX9-NEXT: s_cmp_ge_u32 s1, s8 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_cselect_b32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s6, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s8 +; GFX9-NEXT: s_cselect_b32 s1, s6, s5 +; GFX9-NEXT: s_xor_b32 s1, s1, s4 +; GFX9-NEXT: s_sub_i32 s1, s1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y %r = sdiv <2 x i32> %x, %shl.y @@ -6798,24 +6772,23 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_ashr_i32 s1, s3, 31 -; GFX6-NEXT: s_lshr_b32 s0, s0, 20 -; GFX6-NEXT: s_lshr_b32 s1, s1, 20 -; GFX6-NEXT: s_add_i32 s0, s2, s0 -; GFX6-NEXT: s_add_i32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_sub_i32 s1, s3, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_add_i32 s6, s4, s6 +; GFX6-NEXT: s_lshr_b32 s7, s7, 20 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s6, s5, s7 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: @@ -6926,125 +6899,122 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s6, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: s_abs_i32 s6, s0 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s4, s2 -; GFX6-NEXT: s_ashr_i32 s2, s2, 31 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s6 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 -; GFX6-NEXT: s_sub_i32 s7, s4, s6 -; GFX6-NEXT: s_cmp_ge_u32 s4, s6 -; GFX6-NEXT: s_cselect_b32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s7, s4, s6 -; GFX6-NEXT: s_cmp_ge_u32 s4, s6 -; GFX6-NEXT: s_cselect_b32 s8, s7, s4 -; GFX6-NEXT: s_abs_i32 s9, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_sub_i32 s4, 0, s9 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_ashr_i32 s1, s3, 31 +; GFX6-NEXT: s_mul_i32 s7, s7, s2 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s2 +; GFX6-NEXT: s_cmp_ge_u32 s6, s2 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s2 +; GFX6-NEXT: s_cmp_ge_u32 s6, s2 +; GFX6-NEXT: s_cselect_b32 s2, s7, s6 +; GFX6-NEXT: s_abs_i32 s3, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s3 +; GFX6-NEXT: s_abs_i32 s8, s1 +; GFX6-NEXT: s_xor_b32 s2, s2, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_sub_i32 s0, s2, s0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 31 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_abs_i32 s0, s3 -; GFX6-NEXT: s_xor_b32 s3, s8, s2 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_sub_i32 s2, s3, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; 
GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s3, v0 -; GFX6-NEXT: s_mul_i32 s3, s3, s9 -; GFX6-NEXT: s_sub_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s3, s0, s9 -; GFX6-NEXT: s_cmp_ge_u32 s0, s9 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_sub_i32 s3, s0, s9 -; GFX6-NEXT: s_cmp_ge_u32 s0, s9 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_xor_b32 s2, s2, s1 +; GFX6-NEXT: s_sub_i32 s1, s2, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s6, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s2, 31 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 ; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 +; GFX9-NEXT: s_abs_i32 s0, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s2, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s6 -; GFX9-NEXT: s_sub_i32 s2, s2, s7 -; GFX9-NEXT: s_sub_i32 s7, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s2, s7, s2 -; GFX9-NEXT: s_sub_i32 s7, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s2, s7, s2 -; GFX9-NEXT: s_abs_i32 s4, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s7, 0, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_mul_hi_u32 s7, s0, s8 +; GFX9-NEXT: s_mul_i32 s7, s7, s2 +; GFX9-NEXT: s_sub_i32 s0, s0, s7 +; GFX9-NEXT: s_sub_i32 s7, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s0, s7, s0 +; GFX9-NEXT: s_sub_i32 s7, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s0, s7, s0 +; GFX9-NEXT: s_abs_i32 s7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_xor_b32 s0, s0, s6 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_sub_i32 s5, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s6, s3, 31 -; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: s_sub_i32 s0, s0, s6 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_abs_i32 s1, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s7, 
s7, s5 -; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 -; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s5, s3, s4 -; GFX9-NEXT: s_cmp_ge_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s4 -; GFX9-NEXT: s_cmp_ge_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_sub_i32 s1, s1, s5 +; GFX9-NEXT: s_sub_i32 s5, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_sub_i32 s5, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_xor_b32 s1, s1, s4 +; GFX9-NEXT: s_sub_i32 s1, s1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y %r = srem <2 x i32> %x, %shl.y @@ -10151,9 +10121,6 @@ define i64 @udiv_i64_9divbits(i8 %size) { } define <2 x i64> @srem_zero_zero() { -; GCN-LABEL: kernel: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm ; GFX6-LABEL: srem_zero_zero: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 56ad037f65641..37f4094806637 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -257,29 +257,28 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_lshl_b32 s1, s2, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s0, s3, 16 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: 
build_v2i32_from_v4i16_shuffle: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index 61fb18e00917b..c46fcde739b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff -; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v2, s1 -; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fabs_v4bf16: @@ -234,13 +234,23 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff -; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_and_b32 s4, s3, 0x7fff +; VI-NEXT: s_lshr_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s5, s2, 0x7fff +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_and_b32 s3, s3, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff +; VI-NEXT: s_and_b32 s4, 0xffff, s4 +; VI-NEXT: s_and_b32 s5, 0xffff, s5 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s3, s4, s3 +; VI-NEXT: s_or_b32 s2, s5, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_fabs_v4bf16: @@ -248,8 +258,14 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff -; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s3, 0x7fff +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NEXT: s_and_b32 s5, s2, 0x7fff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -259,8 +275,14 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s4, s3, 0x7fff +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_lshr_b32 s5, s2, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_and_b32 s5, s5, 0x7fff +; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s5 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index a77c7ae923d0f..27cf49aec8229 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff -; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v2, s1 -; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fabs_v4f16: @@ -234,13 +234,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff -; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_fabs_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index baf9b0abf7b0c..97e23fcdb2263 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -99,29 +99,28 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff -; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff -; VI-NEXT: s_and_b32 s1, s2, 0x7fffffff -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: s_bitset0_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; 
VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 1d87d938cc41c..0a2e758f7cf21 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -472,52 +472,50 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfi_b32 v1, s8, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_bfi_b32 v0, s8, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_brev_b32 s6, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_bfi_b32 v0, s6, v2, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_bfi_b32 v3, s6, v2, v3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfi_b32 v2, s6, v2, v4 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, ptr addrspace(1) %out, align 8 diff --git 
a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 0fc61cbe54bad..c510c40c8536c 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -932,18 +932,16 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-FASTFMA-NEXT: s_mov_b32 s4, s0 -; GFX6-FASTFMA-NEXT: s_mov_b32 s5, s1 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s9 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -952,13 +950,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s9, v1 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s11, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -968,21 +966,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s8, v2 -; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s10, v2 +; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-FASTFMA-NEXT: s_endpgm ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0 -; 
GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -992,13 +989,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s3, v0 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1008,24 +1006,22 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v4 -; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s2, v4 +; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_mov_b32 s4, s0 -; GFX7-NEXT: s_mov_b32 s5, s1 -; GFX7-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s9 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1034,13 +1030,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, 
v4 -; GFX7-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2 +; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_fixup_f32 v1, v0, s9, v1 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 +; GFX7-NEXT: v_div_fixup_f32 v1, v0, s11, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -1050,20 +1046,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v0, s8, v2 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: v_div_fixup_f32 v0, v0, s10, v2 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1073,12 +1068,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-NEXT: v_div_fixup_f32 v1, v1, s5, v0 +; GFX8-NEXT: v_div_fixup_f32 v1, v1, s3, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1088,20 +1084,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_div_fixup_f32 v0, v0, s4, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_div_fixup_f32 v0, v0, s2, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s7, s7, s3 
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3 +; GFX10-NEXT: v_div_scale_f32 v0, s6, s3, s3, s1 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1111,11 +1106,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s4, s6, s6, s2 +; GFX10-NEXT: v_div_scale_f32 v2, s6, s2, s2, s0 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s3 -; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s6, s2 +; GFX10-NEXT: v_div_fixup_f32 v1, v0, s3, s1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -1126,18 +1122,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s3 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s5, s3 +; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s1 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) @@ -1148,11 +1145,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v2, null, s4, s4, s2 +; GFX11-NEXT: v_div_scale_f32 v2, null, s2, s2, s0 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: v_div_fixup_f32 v1, v0, s5, s3 -; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s4, s2 +; GFX11-NEXT: v_div_fixup_f32 v1, v0, s3, s1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -1164,8 +1161,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32: @@ -1190,60 +1187,58 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0xd -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s8 -; GFX67-NEXT: v_rcp_f32_e32 v1, s9 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX67-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX67-NEXT: v_rcp_f32_e32 v0, s2 +; GFX67-NEXT: v_rcp_f32_e32 v1, s3 +; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v2, s6 -; GFX8-NEXT: v_rcp_f32_e32 v3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, s3, v3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_rcp_f32_e32 v0, s2 +; GFX8-NEXT: v_rcp_f32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s6 -; GFX10-NEXT: v_rcp_f32_e32 v1, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, s3, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s2 +; GFX10-NEXT: v_rcp_f32_e32 v1, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s6 -; GFX11-NEXT: v_rcp_f32_e32 v1, s7 +; GFX11-NEXT: v_rcp_f32_e32 v0, s2 +; GFX11-NEXT: v_rcp_f32_e32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) -; GFX11-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_ulp25_v2f32: @@ -1268,60 +1263,58 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0xb +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s9 -; GFX67-NEXT: v_rcp_f32_e32 v2, s8 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2 +; GFX67-NEXT: v_rcp_f32_e32 v0, s3 +; GFX67-NEXT: v_rcp_f32_e32 v2, s2 +; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v2, s7 -; GFX8-NEXT: v_rcp_f32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_rcp_f32_e32 v2, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_rcp_f32_e32 v2, s2 +; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s7 -; GFX11-NEXT: v_rcp_f32_e32 v2, s6 +; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) -; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_fast_math: @@ -1346,60 +1339,58 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 
s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s9 -; GFX67-NEXT: v_rcp_f32_e32 v2, s8 -; GFX67-NEXT: s_mov_b32 s4, s0 -; GFX67-NEXT: s_mov_b32 s5, s1 -; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2 +; GFX67-NEXT: v_rcp_f32_e32 v0, s3 +; GFX67-NEXT: v_rcp_f32_e32 v2, s2 +; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v2, s7 -; GFX8-NEXT: v_rcp_f32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_rcp_f32_e32 v2, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_rcp_f32_e32 v2, s2 +; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s7 -; GFX11-NEXT: v_rcp_f32_e32 v2, s6 +; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) -; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_arcp_math: diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 3e450b785b57b..6c2ab5fb15a20 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -121,25 +121,24 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; SICI-LABEL: fnearbyint_v2f32: ; SICI: ; %bb.0: ; %entry ; SICI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SICI-NEXT: s_mov_b32 s7, 0xf000 -; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: 
s_waitcnt lgkmcnt(0) -; SICI-NEXT: s_mov_b32 s4, s0 -; SICI-NEXT: s_mov_b32 s5, s1 -; SICI-NEXT: v_rndne_f32_e32 v1, s3 -; SICI-NEXT: v_rndne_f32_e32 v0, s2 -; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SICI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SICI-NEXT: s_mov_b32 s3, 0xf000 +; SICI-NEXT: s_mov_b32 s2, -1 +; SICI-NEXT: v_rndne_f32_e32 v1, s5 +; SICI-NEXT: v_rndne_f32_e32 v0, s4 +; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SICI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_rndne_f32_e32 v3, s3 -; VI-NEXT: v_rndne_f32_e32 v2, s2 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_rndne_f32_e32 v1, s3 +; VI-NEXT: v_rndne_f32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fnearbyint_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 468df77f5c2aa..5424ebfcffcd1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -624,13 +624,13 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_or_b32 s0, s3, 0x80008000 -; CI-NEXT: s_or_b32 s1, s2, 0x80008000 -; CI-NEXT: v_mov_b32_e32 v2, s1 -; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: s_or_b32 s3, s3, 0x80008000 +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabs_v4bf16: @@ -640,25 +640,23 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s2, 0x7fff7fff -; VI-NEXT: s_and_b32 s1, s3, 0x7fff7fff -; VI-NEXT: s_bfe_u32 s3, s3, 0xf0010 -; VI-NEXT: s_bfe_u32 s2, s2, 0xf0010 -; VI-NEXT: s_xor_b32 s1, s1, 0x8000 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 -; VI-NEXT: s_xor_b32 s0, s0, 0x8000 -; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_and_b32 s1, 0xffff, s1 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s0, 0xffff, s0 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s1, s1, s3 -; VI-NEXT: s_or_b32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_bitset1_b32 s3, 15 +; VI-NEXT: s_bitset1_b32 s2, 15 +; VI-NEXT: s_bitset1_b32 s5, 15 +; VI-NEXT: s_bitset1_b32 s4, 15 +; VI-NEXT: s_and_b32 s3, 0xffff, s3 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_and_b32 s2, 0xffff, s2 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_or_b32 s3, s3, s5 +; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; 
GFX9-LABEL: fneg_fabs_v4bf16: @@ -666,16 +664,14 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff7fff -; GFX9-NEXT: s_and_b32 s5, s3, 0x7fff7fff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0xf0010 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf0010 -; GFX9-NEXT: s_xor_b32 s3, s3, 0x8000 -; GFX9-NEXT: s_xor_b32 s5, s5, 0x8000 -; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 -; GFX9-NEXT: s_xor_b32 s4, s4, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_bitset1_b32 s3, 15 +; GFX9-NEXT: s_bitset1_b32 s2, 15 +; GFX9-NEXT: s_bitset1_b32 s5, 15 +; GFX9-NEXT: s_bitset1_b32 s4, 15 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -685,16 +681,14 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s4, s2, 0x7fff7fff -; GFX11-NEXT: s_and_b32 s5, s3, 0x7fff7fff -; GFX11-NEXT: s_bfe_u32 s3, s3, 0xf0010 -; GFX11-NEXT: s_bfe_u32 s2, s2, 0xf0010 -; GFX11-NEXT: s_xor_b32 s3, s3, 0x8000 -; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 -; GFX11-NEXT: s_xor_b32 s4, s4, 0x8000 -; GFX11-NEXT: s_xor_b32 s5, s5, 0x8000 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_bitset1_b32 s3, 15 +; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_bitset1_b32 s4, 15 +; GFX11-NEXT: s_bitset1_b32 s5, 15 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 305f4e56184cc..9d9a851a5507e 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -516,13 +516,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: s_or_b32 s0, s3, 0x80008000 -; CIVI-NEXT: s_or_b32 s1, s2, 0x80008000 -; CIVI-NEXT: v_mov_b32_e32 v2, s1 -; CIVI-NEXT: v_mov_b32_e32 v3, s0 -; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 +; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 +; CIVI-NEXT: v_mov_b32_e32 v3, s1 +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: v_mov_b32_e32 v2, s0 +; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CIVI-NEXT: s_endpgm ; ; GFX9-LABEL: fneg_fabs_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index b93a598cb52ae..214ccedd75170 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -199,29 +199,28 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; SI-LABEL: 
fneg_fabsf_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s3, 31 -; SI-NEXT: s_bitset1_b32 s2, 31 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_bitset1_b32 s5, 31 +; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_or_b32 s0, s3, 0x80000000 -; VI-NEXT: s_or_b32 s1, s2, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 17225b7c39f4f..02235151a83e1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -52,29 +52,28 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; SI-LABEL: s_fneg_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_xor_b32 s0, s3, 0x80000000 -; SI-NEXT: s_xor_b32 s1, s2, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b32 s5, s5, 0x80000000 +; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 -; VI-NEXT: s_xor_b32 s1, s2, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 35de9ccd99739..a2cd6d28e96cb 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -134,27 +134,24 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % 
; SI-LABEL: fp_to_sint_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_cvt_i32_f32_e32 v1, s3 -; SI-NEXT: v_cvt_i32_f32_e32 v0, s2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cvt_i32_f32_e32 v1, s5 +; SI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_i32_f32_e32 v1, s3 ; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fp_to_sint_v2i32: @@ -438,26 +435,25 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, 0x2f800000 +; SI-NEXT: s_mov_b32 s7, 0xcf800000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s8, 0x2f800000 -; SI-NEXT: s_mov_b32 s9, 0xcf800000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_trunc_f32_e32 v0, s7 -; SI-NEXT: v_trunc_f32_e32 v1, s6 -; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8 +; SI-NEXT: v_trunc_f32_e32 v0, s5 +; SI-NEXT: v_trunc_f32_e32 v1, s4 +; SI-NEXT: v_mul_f32_e64 v2, |v0|, s6 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8 +; SI-NEXT: v_mul_f32_e64 v4, |v1|, s6 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; SI-NEXT: v_floor_f32_e32 v2, v2 ; SI-NEXT: v_floor_f32_e32 v4, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v2 -; SI-NEXT: v_fma_f32 v0, v2, s9, |v0| +; SI-NEXT: v_fma_f32 v0, v2, s7, |v0| ; SI-NEXT: v_cvt_u32_f32_e32 v2, v4 -; SI-NEXT: v_fma_f32 v1, v4, s9, |v1| +; SI-NEXT: v_fma_f32 v1, v4, s7, |v1| ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_xor_b32_e32 v4, v6, v3 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -474,36 +470,35 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s8, 0x2f800000 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s6, 0x2f800000 +; VI-NEXT: s_mov_b32 s7, 0xcf800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s3 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8 -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b64 s[4:5], s[2:3] +; VI-NEXT: v_trunc_f32_e32 v0, s5 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s6 ; VI-NEXT: v_floor_f32_e32 v1, v1 -; VI-NEXT: s_mov_b32 s0, 0xcf800000 -; VI-NEXT: v_fma_f32 v2, v1, s0, |v0| -; VI-NEXT: v_trunc_f32_e32 v4, s2 -; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 -; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8 -; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 -; VI-NEXT: v_floor_f32_e32 v3, v3 -; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 -; VI-NEXT: v_fma_f32 v3, v3, s0, 
|v4| +; VI-NEXT: v_cvt_u32_f32_e32 v2, v1 +; VI-NEXT: v_fma_f32 v1, v1, s7, |v0| ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0 -; VI-NEXT: v_cvt_u32_f32_e32 v6, v3 -; VI-NEXT: v_xor_b32_e32 v2, v2, v0 +; VI-NEXT: v_trunc_f32_e32 v4, s4 +; VI-NEXT: v_xor_b32_e32 v3, v2, v0 +; VI-NEXT: v_mul_f32_e64 v2, |v4|, s6 +; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 +; VI-NEXT: v_floor_f32_e32 v2, v2 +; VI-NEXT: v_cvt_u32_f32_e32 v5, v2 +; VI-NEXT: v_fma_f32 v2, v2, s7, |v4| +; VI-NEXT: v_cvt_u32_f32_e32 v6, v2 ; VI-NEXT: v_xor_b32_e32 v1, v1, v0 -; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc +; VI-NEXT: v_sub_u32_e32 v2, vcc, v1, v0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc ; VI-NEXT: v_xor_b32_e32 v0, v6, v1 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fp_to_sint_v2i64: @@ -1298,32 +1293,29 @@ define amdgpu_kernel void @fp_to_sint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x ; SI-LABEL: fp_to_sint_v2f32_to_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_cvt_i32_f32_e32 v0, s3 -; SI-NEXT: v_cvt_i32_f32_e32 v1, s2 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cvt_i32_f32_e32 v0, s5 +; SI-NEXT: v_cvt_i32_f32_e32 v1, s4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v2f32_to_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_i32_f32_e32 v1, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fp_to_sint_v2f32_to_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index 106d1116c2bc6..32f80ff6c22f8 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -72,27 +72,24 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_cvt_u32_f32_e32 v1, s3 -; SI-NEXT: v_cvt_u32_f32_e32 v0, s2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 
s2, -1 +; SI-NEXT: v_cvt_u32_f32_e32 v1, s5 +; SI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_u32_f32_e32 v1, s3 ; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i32: @@ -349,32 +346,29 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; SI-LABEL: fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s8, 0xcf800000 +; SI-NEXT: s_mov_b32 s6, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_trunc_f32_e32 v0, s3 -; SI-NEXT: v_trunc_f32_e32 v2, s2 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_trunc_f32_e32 v0, s5 +; SI-NEXT: v_trunc_f32_e32 v2, s4 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_floor_f32_e32 v4, v1 ; SI-NEXT: v_floor_f32_e32 v5, v3 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v5 -; SI-NEXT: v_fma_f32 v0, v4, s8, v0 -; SI-NEXT: v_fma_f32 v4, v5, s8, v2 +; SI-NEXT: v_fma_f32 v0, v4, s6, v0 +; SI-NEXT: v_fma_f32 v4, v5, s6, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s3 ; VI-NEXT: v_trunc_f32_e32 v4, s2 @@ -389,9 +383,9 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i64: @@ -1078,31 +1072,28 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x ; SI-LABEL: fp_to_uint_v2f32_to_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_cvt_u32_f32_e32 v0, s3 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cvt_u32_f32_e32 v0, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_u32_f32_e32 v1, s2 +; SI-NEXT: v_cvt_u32_f32_e32 v1, s4 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; 
VI-LABEL: fp_to_uint_v2f32_to_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_u32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_u32_f32_e32 v1, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 0366d618249df..72c2003058a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -209,81 +209,85 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s5 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_lshr_b32 s12, s3, 1 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; SI-NEXT: s_not_b32 s3, s7 -; SI-NEXT: s_mov_b32 s1, s12 -; SI-NEXT: s_and_b32 s3, s3, 31 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; SI-NEXT: s_mov_b32 s5, s2 -; SI-NEXT: s_lshr_b32 s1, s2, 1 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 -; SI-NEXT: s_mov_b32 s3, s1 -; SI-NEXT: s_not_b32 s1, s6 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_lshr_b32 s12, s1, 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; SI-NEXT: s_not_b32 s1, s5 +; SI-NEXT: s_mov_b32 s7, s12 ; SI-NEXT: s_and_b32 s1, s1, 31 -; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b32 s5, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; SI-NEXT: s_not_b32 s2, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_mov_b32 s0, s5 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_lshr_b32 s8, s3, 1 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; VI-NEXT: s_not_b32 s3, s7 -; VI-NEXT: s_mov_b32 s1, s8 -; VI-NEXT: s_and_b32 s3, s3, 31 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; VI-NEXT: s_mov_b32 s5, s2 -; VI-NEXT: s_lshr_b32 s1, s2, 1 -; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 -; VI-NEXT: s_mov_b32 s3, s1 -; VI-NEXT: s_not_b32 s1, s6 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_lshr_b32 s10, s1, 1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 +; VI-NEXT: 
s_not_b32 s1, s5 +; VI-NEXT: s_mov_b32 s9, s10 ; VI-NEXT: s_and_b32 s1, s1, 31 -; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b32 s5, s0, 1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s13 -; GFX9-NEXT: s_mov_b32 s1, s11 -; GFX9-NEXT: s_lshr_b32 s2, s11, 1 -; GFX9-NEXT: s_not_b32 s3, s15 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_mov_b32 s1, s2 -; GFX9-NEXT: s_and_b32 s2, s3, 31 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX9-NEXT: s_mov_b32 s13, s10 -; GFX9-NEXT: s_lshr_b32 s1, s10, 1 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[12:13], 1 -; GFX9-NEXT: s_mov_b32 s3, s1 -; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_lshr_b32 s10, s1, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX9-NEXT: s_not_b32 s1, s9 +; GFX9-NEXT: s_mov_b32 s5, s10 ; GFX9-NEXT: s_and_b32 s1, s1, 31 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s2, s8 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -306,24 +310,27 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s13 -; GFX10-NEXT: s_mov_b32 s1, s11 -; GFX10-NEXT: s_not_b32 s2, s15 -; GFX10-NEXT: s_mov_b32 s13, s10 -; GFX10-NEXT: s_lshr_b32 s4, s11, 1 -; GFX10-NEXT: s_lshr_b32 s5, s10, 1 -; GFX10-NEXT: s_not_b32 s6, s14 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_and_b32 s7, s2, 31 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[12:13], 1 -; GFX10-NEXT: s_and_b32 s6, s6, 31 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s1, s4 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 1 +; GFX10-NEXT: s_not_b32 s7, s7 +; 
GFX10-NEXT: s_lshr_b32 s11, s0, 1 +; GFX10-NEXT: s_not_b32 s6, s6 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_and_b32 s4, s7, 31 +; GFX10-NEXT: s_and_b32 s5, s6, 31 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s1, s10 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -331,27 +338,30 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s5 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s5, s2 -; GFX11-NEXT: s_lshr_b32 s10, s3, 1 +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 1 ; GFX11-NEXT: s_not_b32 s7, s7 -; GFX11-NEXT: s_lshr_b32 s11, s2, 1 +; GFX11-NEXT: s_lshr_b32 s11, s0, 1 ; GFX11-NEXT: s_not_b32 s6, s6 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX11-NEXT: s_and_b32 s7, s7, 31 ; GFX11-NEXT: s_and_b32 s6, s6, 31 -; GFX11-NEXT: s_mov_b32 s5, s11 -; GFX11-NEXT: s_mov_b32 s3, s10 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s1, s10 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -362,54 +372,52 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s0, s9 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s9, s2 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 23 -; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], 25 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_mov_b32 s0, s5 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s5, s2 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 23 -; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s7 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: s_mov_b32 s7, s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 23 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -429,35 +437,35 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, s7 -; GFX10-NEXT: s_mov_b32 s7, s2 -; GFX10-NEXT: s_mov_b32 s5, s3 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[6:7], 25 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 23 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s7 -; GFX11-NEXT: s_mov_b32 s7, s2 -; GFX11-NEXT: s_mov_b32 s5, s3 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 25 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 23 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 
s[0:1], s[2:3], 25 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll index d3ceaba111848..7afb2cf317869 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -325,56 +325,60 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s5 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_and_b32 s3, s7, 31 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; SI-NEXT: s_mov_b32 s5, s2 -; SI-NEXT: s_and_b32 s1, s6, 31 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_and_b32 s1, s5, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_and_b32 s0, s4, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_and_b32 s0, s6, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_mov_b32 s0, s5 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_and_b32 s3, s7, 31 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; VI-NEXT: s_mov_b32 s5, s2 -; VI-NEXT: s_and_b32 s1, s6, 31 -; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s13 -; GFX9-NEXT: s_mov_b32 s1, s11 -; GFX9-NEXT: s_and_b32 s2, s15, 31 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX9-NEXT: s_mov_b32 s13, s10 -; GFX9-NEXT: s_and_b32 s1, s14, 31 -; 
GFX9-NEXT: s_lshr_b64 s[2:3], s[12:13], s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_and_b32 s0, s6, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -394,53 +398,62 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s13 -; GFX10-NEXT: s_mov_b32 s1, s11 -; GFX10-NEXT: s_mov_b32 s13, s10 -; GFX10-NEXT: s_and_b32 s2, s14, 31 -; GFX10-NEXT: s_and_b32 s4, s15, 31 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[12:13], s2 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_and_b32 s0, s6, 31 +; GFX10-NEXT: s_and_b32 s6, s7, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s5 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s5, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 31 +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_and_b32 s0, s6, 31 ; GFX11-NEXT: s_and_b32 s6, s7, 31 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[8:9], s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s5 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s5, s2 -; GFX12-NEXT: s_and_b32 s2, s6, 31 +; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_and_b32 s0, s6, 31 ; GFX12-NEXT: s_and_b32 s6, s7, 31 -; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX12-NEXT: s_lshr_b64 s[4:5], 
s[8:9], s6 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -451,54 +464,52 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s0, s9 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s9, s2 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 9 -; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], 7 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 9 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_mov_b32 s0, s5 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s5, s2 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 9 -; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s7 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: s_mov_b32 s7, s2 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 9 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 9 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: 
s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -518,52 +529,52 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, s7 -; GFX10-NEXT: s_mov_b32 s7, s2 -; GFX10-NEXT: s_mov_b32 s5, s3 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[6:7], 7 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 9 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 9 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s7 -; GFX11-NEXT: s_mov_b32 s7, s2 -; GFX11-NEXT: s_mov_b32 s5, s3 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 7 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 9 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_v2i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s7 -; GFX12-NEXT: s_mov_b32 s7, s2 -; GFX12-NEXT: s_mov_b32 s5, s3 -; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 7 -; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], 9 +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) @@ -574,63 +585,61 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm_src1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s8, 9 ; SI-NEXT: s_mov_b32 s10, 7 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s0, 9 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_and_b32 s3, s9, 31 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; SI-NEXT: s_mov_b32 s11, s2 -; SI-NEXT: s_and_b32 s1, s8, 31 -; SI-NEXT: s_lshr_b64 s[2:3], s[10:11], s1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_and_b32 s1, s3, 31 +; SI-NEXT: s_mov_b32 s11, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32_imm_src1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s6, 9 +; VI-NEXT: s_mov_b32 s8, 7 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_and_b32 s1, s3, 31 +; VI-NEXT: s_mov_b32 s9, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_and_b32 s0, s5, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 -; VI-NEXT: s_mov_b32 s6, 7 -; VI-NEXT: s_mov_b32 s7, s2 -; VI-NEXT: s_and_b32 s1, s4, 31 -; VI-NEXT: s_lshr_b64 s[2:3], s[6:7], s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32_imm_src1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s4, 9 ; GFX9-NEXT: s_mov_b32 s8, 7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: s_and_b32 s3, s7, 31 -; GFX9-NEXT: s_mov_b32 s9, s2 -; GFX9-NEXT: s_and_b32 s2, s6, 31 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s9, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm_src1: @@ -650,61 +659,61 @@ define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> % ; GFX10-LABEL: fshr_v2i32_imm_src1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b32 s4, 9 ; GFX10-NEXT: s_mov_b32 s8, 7 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, s3 -; GFX10-NEXT: s_mov_b32 s9, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 31 -; GFX10-NEXT: s_and_b32 s6, s7, 31 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s9, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_and_b32 s2, s3, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm_src1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, 9 ; GFX11-NEXT: s_mov_b32 s8, 7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, s3 -; GFX11-NEXT: s_mov_b32 s9, s2 -; GFX11-NEXT: s_and_b32 s2, s4, 31 -; GFX11-NEXT: s_and_b32 s4, s5, 31 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_and_b32 s2, s3, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_v2i32_imm_src1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, 9 ; GFX12-NEXT: s_mov_b32 s8, 7 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s7, s3 -; GFX12-NEXT: s_mov_b32 s9, s2 -; GFX12-NEXT: s_and_b32 s2, s4, 31 -; GFX12-NEXT: s_and_b32 s4, s5, 31 -; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 -; GFX12-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_mov_b32 s9, s0 +; GFX12-NEXT: s_and_b32 s0, s2, 31 +; GFX12-NEXT: s_and_b32 s2, s3, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> <i32 7, i32 9>, <2 x i32> %y) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 
580eeda73781e..da132d0269e6b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -9552,47 +9552,6 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a ; GFX9-NEXT: s_cbranch_execnz .LBB136_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s3, s0, 0x4650 -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_and_b32 s0, s3, -4 -; GFX11-NEXT: s_and_b32 s3, s3, 3 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_lshl_b32 s5, s3, 3 -; GFX11-NEXT: s_and_b32 s6, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s3, s2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s4, s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB136_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1 -; GFX11-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB136_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_endpgm %gep = getelementptr i16, ptr addrspace(1) %out, i64 9000 %val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void @@ -9712,47 +9671,6 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: s_cbranch_execnz .LBB137_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s3, s0, 0x2328 -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_and_b32 s0, s3, -4 -; GFX11-NEXT: s_and_b32 s3, s3, 3 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_lshl_b32 s5, s3, 3 -; GFX11-NEXT: s_and_b32 s6, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s2, 0xff, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s3, s2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: s_lshl_b32 s4, s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB137_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1 -; GFX11-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX11-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB137_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %out, i64 9000 %val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 87e57298f5dc6..8e427a6ef2023 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -98,16 +98,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_add_u32 s4, s0, 4 +; CIVI-NEXT: s_addc_u32 s5, s1, 0 +; CIVI-NEXT: v_mov_b32_e32 v2, s4 +; CIVI-NEXT: v_mov_b32_e32 v4, s3 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v3, s5 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 -; CIVI-NEXT: s_add_u32 s0, s0, 4 -; CIVI-NEXT: flat_store_dword v[0:1], v2 -; CIVI-NEXT: s_addc_u32 s1, s1, 0 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: v_mov_b32_e32 v2, s3 -; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: v_mov_b32_e32 v5, s2 +; CIVI-NEXT: flat_store_short v[2:3], v4 +; CIVI-NEXT: flat_store_dword v[0:1], v5 ; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3f16_arg: @@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: v_mov_b32_e32 v3, s3 ; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CIVI-NEXT: s_endpgm @@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out @@ -348,21 +348,37 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 } define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { -; CIVI-LABEL: extload_v3f16_to_v3f32_arg: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CIVI-NEXT: s_add_i32 s12, s12, s17 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_lshr_b32 s4, s2, 16 -; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CIVI-NEXT: v_mov_b32_e32 v3, s0 -; CIVI-NEXT: v_mov_b32_e32 v4, s1 -; CIVI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] -; CIVI-NEXT: s_endpgm +; CI-LABEL: 
extload_v3f16_to_v3f32_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] +; CI-NEXT: s_endpgm +; +; VI-LABEL: extload_v3f16_to_v3f32_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] +; VI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: @@ -370,9 +386,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> @@ -388,14 +404,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: s_lshr_b32 s5, s3, 16 +; CI-NEXT: s_lshr_b32 s4, s3, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; @@ -408,12 +424,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -424,10 +440,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 ; GFX11-NEXT: s_lshr_b32 s5, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> @@ 
-708,33 +724,61 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 } define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { -; CIVI-LABEL: extload_v4f16_to_v4f64_arg: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CIVI-NEXT: s_add_i32 s12, s12, s17 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_lshr_b32 s5, s3, 16 -; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; CIVI-NEXT: s_lshr_b32 s4, s2, 16 -; CIVI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; CIVI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; CIVI-NEXT: s_add_u32 s2, s0, 16 -; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CIVI-NEXT: s_addc_u32 s3, s1, 0 -; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CIVI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CIVI-NEXT: v_mov_b32_e32 v9, s3 -; CIVI-NEXT: v_mov_b32_e32 v8, s2 -; CIVI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; CIVI-NEXT: s_nop 0 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; CIVI-NEXT: s_endpgm +; CI-LABEL: extload_v4f16_to_v4f64_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s4, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_mov_b32_e32 v9, s3 +; CI-NEXT: v_mov_b32_e32 v8, s2 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CI-NEXT: s_endpgm +; +; VI-LABEL: extload_v4f16_to_v4f64_arg: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; VI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 11826aa0b360d..e1b4cad370f96 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -290,19 +290,19 @@ define amdgpu_kernel void 
@half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dword s5, s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] +; GCN-NEXT: s_lshl_b32 s6, s6, 4 +; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_lshl_b32 s0, s5, 4 -; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 -; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] -; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -418,19 +418,19 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dword s5, s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x10001 +; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] +; GCN-NEXT: s_lshl_b32 s6, s6, 4 +; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_lshl_b32 s0, s5, 4 -; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 -; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] -; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -443,18 +443,18 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101 +; GCN-NEXT: s_lshl_b32 s6, s6, 3 +; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101 +; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6 +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_xor_b32 s1, s3, 0x1010101 -; GCN-NEXT: s_xor_b32 s0, s2, 0x1010101 -; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index e98d04556649a..7cbf9aeacfe48 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ 
-1571,13 +1571,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_lshl_b32 s0, s8, 4 -; VI-NEXT: s_mov_b32 s8, 0x50005 -; VI-NEXT: s_mov_b32 s9, s8 +; VI-NEXT: s_mov_b32 s0, 0x50005 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 -; VI-NEXT: s_xor_b64 s[8:9], s[2:3], s[8:9] -; VI-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: s_lshl_b32 s8, s8, 4 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; VI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8 +; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 41b5103b38e50..a2da8876472ab 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s4, s0, 4 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_add_u32 s0, s0, 4 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: flat_store_short v[2:3], v4 +; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i16_arg: @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 23c5a079c5c6e..ab0000f6831b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -139,26 +139,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -183,51 +177,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 
0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -276,26 +258,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -320,51 +296,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: 
v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -5455,76 +5419,58 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NEXT: v_mov_b32_e32 v5, s13 -; GCN-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v10, s2 -; GCN-NEXT: v_mov_b32_e32 v11, s3 +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GCN-NEXT: 
v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -5573,76 +5519,58 @@ define 
amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NEXT: v_mov_b32_e32 v5, s13 -; GCN-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v10, s2 -; GCN-NEXT: v_mov_b32_e32 v11, s3 +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v4, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; 
VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 9ea8771506aa2..3897a0e028334 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 -; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 -; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4 -; 
VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 +; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6 +; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v2f32: @@ -520,42 +520,41 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5 +; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 ; 
SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 268e1e25f766f..3928ec2dd76d3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x40549000 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc -; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x40549000, v8 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 -; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: 
v_cndmask_b32_e32 v2, v7, v2, vcc -; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 +; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x40549000, v6 +; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 +; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc23369f4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp10_v2f32: @@ -522,42 +522,41 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5 +; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x421a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4 +; 
SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index c3f5146168033..dd44a1a35067e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -176,26 +176,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 -; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 -; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s6 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s4 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v2f32: @@ -225,26 +224,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 -; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v2, 
s2, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_ldexp_f32 v3, v4, s0 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_cselect_b32 s3, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 74b6c75ac4948..7c06ae2f39f45 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf +; SI-SDAG-NEXT: s_mov_b32 s8, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 -; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v1 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 -; 
SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc @@ -406,51 +405,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3f317000, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7 -; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 -; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_log_f32_e32 v7, v3 -; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; VI-SDAG-NEXT: 
v_and_b32_e32 v2, 0xfffff000, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index c4fdac3ac5b0e..24e2fb4c8d9d7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf +; SI-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 -; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4 +; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 ; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; SI-SDAG-NEXT: v_log_f32_e32 v5, v1 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2 -; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc @@ -406,51 +405,51 @@ define 
amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 -; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3e9a2000, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7 -; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 -; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_log_f32_e32 v7, v3 -; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 +; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: 
v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log10_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 444f37059406a..e24fd1f22bfa6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -221,8 +221,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec @@ -238,11 +236,11 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v1 -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v2f32: @@ -285,16 +283,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 ; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; VI-SDAG-NEXT: v_ldexp_f32 v0, s2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4 -; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index 32a644ed334cc..ac6dd30283554 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -362,16 +362,15 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x ; SI-LABEL: s_test_imax_sgt_imm_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_max_i32 s0, s3, 9 -; SI-NEXT: s_max_i32 s1, s2, 9 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_i32 
s5, s5, 9 +; SI-NEXT: s_max_i32 s4, s4, 9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imax_sgt_imm_v2i32: @@ -869,16 +868,15 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x ; SI-LABEL: s_test_umax_ugt_imm_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_max_u32 s0, s3, 23 -; SI-NEXT: s_max_u32 s1, s2, 15 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_max_u32 s5, s5, 23 +; SI-NEXT: s_max_u32 s4, s4, 15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_umax_ugt_imm_v2i32: diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index c571cfc3648e2..eff0680fe9a31 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -1074,118 +1074,118 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_ashr_i32 s0, s2, 16 -; CI-NEXT: s_ashr_i32 s1, s3, 16 +; CI-NEXT: s_ashr_i32 s6, s0, 16 +; CI-NEXT: s_ashr_i32 s7, s1, 16 +; CI-NEXT: s_sext_i32_i16 s0, s0 +; CI-NEXT: s_sext_i32_i16 s1, s1 +; CI-NEXT: s_ashr_i32 s8, s2, 16 +; CI-NEXT: s_ashr_i32 s9, s3, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 -; CI-NEXT: s_ashr_i32 s6, s4, 16 -; CI-NEXT: s_ashr_i32 s7, s5, 16 -; CI-NEXT: s_sext_i32_i16 s4, s4 -; CI-NEXT: s_sext_i32_i16 s5, s5 -; CI-NEXT: s_min_i32 s1, s1, s7 -; CI-NEXT: s_min_i32 s3, s3, s5 -; CI-NEXT: s_min_i32 s0, s0, s6 -; CI-NEXT: s_min_i32 s2, s2, s4 -; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: s_and_b32 s3, s3, 0xffff -; CI-NEXT: s_lshl_b32 s0, s0, 16 -; CI-NEXT: s_and_b32 s2, s2, 0xffff -; CI-NEXT: s_or_b32 s1, s3, s1 -; CI-NEXT: s_or_b32 s0, s2, s0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: s_min_i32 s7, s7, s9 +; CI-NEXT: s_min_i32 s1, s1, s3 +; CI-NEXT: s_min_i32 s3, s6, s8 +; CI-NEXT: s_min_i32 s0, s0, s2 +; CI-NEXT: s_lshl_b32 s7, s7, 16 +; CI-NEXT: s_and_b32 s1, s1, 0xffff +; CI-NEXT: s_lshl_b32 s3, s3, 16 +; CI-NEXT: s_and_b32 s0, s0, 0xffff +; CI-NEXT: s_or_b32 s1, s1, s7 +; CI-NEXT: s_or_b32 s0, s0, s3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 
0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_ashr_i32 s0, s5, 16 -; VI-NEXT: s_ashr_i32 s1, s3, 16 -; VI-NEXT: s_min_i32 s0, s1, s0 -; VI-NEXT: s_sext_i32_i16 s1, s5 +; VI-NEXT: s_ashr_i32 s6, s3, 16 +; VI-NEXT: s_ashr_i32 s7, s1, 16 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_min_i32 s1, s3, s1 -; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_min_i32 s6, s7, s6 +; VI-NEXT: s_min_i32 s1, s1, s3 +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: s_ashr_i32 s1, s4, 16 +; VI-NEXT: s_or_b32 s1, s1, s6 ; VI-NEXT: s_ashr_i32 s3, s2, 16 -; VI-NEXT: s_min_i32 s1, s3, s1 -; VI-NEXT: s_sext_i32_i16 s3, s4 +; VI-NEXT: s_ashr_i32 s6, s0, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s1, s2, s1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_min_i32 s3, s6, s3 +; VI-NEXT: s_min_i32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_pk_min_i16 v1, s3, v0 -; GFX9-NEXT: v_pk_min_i16 v0, s2, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 +; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_min_i16 v1, s3, s5 -; GFX10-NEXT: v_pk_min_i16 v0, s2, s4 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 +; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_min_i16 v1, s3, s5 -; GFX11-NEXT: v_pk_min_i16 v0, s2, s4 -; GFX11-NEXT: global_store_b64 v2, 
v[0:1], s[0:1] +; GFX11-NEXT: v_pk_min_i16 v1, s1, s3 +; GFX11-NEXT: v_pk_min_i16 v0, s0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imin_sle_v4i16: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_min_i16 v1, s3, s7 -; GFX1250-NEXT: v_pk_min_i16 v0, s2, s6 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3 +; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-NEXT: s_endpgm %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b @@ -1636,92 +1636,92 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_min_i32 s1, s1, s3 +; CI-NEXT: s_min_i32 s0, s0, s2 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_min_i32 s0, s3, s5 -; CI-NEXT: s_min_i32 s1, s2, s4 -; CI-NEXT: v_mov_b32_e32 v2, s1 -; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_min_i32 s1, s1, s3 +; VI-NEXT: s_min_i32 s0, s0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_min_i32 s0, s3, s5 -; VI-NEXT: s_min_i32 s1, s2, s4 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_min_i32 s3, s3, s5 -; GFX9-NEXT: s_min_i32 s2, s2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_min_i32 s1, s1, s3 +; GFX9-NEXT: s_min_i32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_min_i32 s2, s2, s4 -; GFX10-NEXT: s_min_i32 s3, s3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_min_i32 s0, s0, s2 +; GFX10-NEXT: s_min_i32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, s4 -; GFX11-NEXT: s_min_i32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_min_i32 s0, s0, s2 +; GFX11-NEXT: s_min_i32 s1, s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; GFX1250-LABEL: s_test_imin_slt_v2i32: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_min_i32 s2, s2, s6 -; GFX1250-NEXT: s_min_i32 s3, s3, s7 -; GFX1250-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-NEXT: v_mov_b32_e32 v1, s3 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_min_i32 s0, s0, s2 +; GFX1250-NEXT: s_min_i32 s1, s1, s3 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-NEXT: s_endpgm %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 6eefafa37648f..1ed024f7aed36 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -76,33 +76,19 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm ; -; GFX1250-SDAG-LABEL: fadd_v2_vs: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: fadd_v2_vs: -; GFX1250-GISEL: ; %bb.0: 
-; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] -; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: fadd_v2_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -1377,33 +1363,19 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-NEXT: s_endpgm ; -; GFX1250-SDAG-LABEL: fmul_v2_vs: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: fmul_v2_vs: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 -; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] -; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: fmul_v2_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = 
getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -3568,8 +3540,8 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX900-LABEL: fadd_fadd_fsub: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, s3 ; GFX900-NEXT: v_add_f32_e32 v0, s1, v0 @@ -3577,14 +3549,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX900-NEXT: v_add_f32_e32 v3, s2, v0 ; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1 ; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX900-NEXT: s_endpgm ; ; PACKED-SDAG-LABEL: fadd_fadd_fsub: ; PACKED-SDAG: ; %bb.0: ; %bb ; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; PACKED-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0 @@ -3592,7 +3564,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0 ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] -; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] ; PACKED-SDAG-NEXT: s_endpgm ; ; GFX90A-GISEL-LABEL: fadd_fadd_fsub: diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index bfdfce12cecf7..0a1d15bf945f9 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -94,64 +94,62 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_sub_i32 s5, 32, s5 -; SI-NEXT: s_sub_i32 s4, 32, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_alignbit_b32 v1, s7, s7, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_alignbit_b32 v0, s6, s6, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_sub_i32 s3, 32, s3 +; SI-NEXT: s_sub_i32 s2, 32, s2 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_sub_i32 s1, 32, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_sub_i32 s0, 32, 
s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v2 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_sub_i32 s2, 32, s2 +; GFX8-NEXT: s_sub_i32 s3, 32, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s4, 32, s7 -; GFX10-NEXT: s_sub_i32 s5, 32, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s4 -; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s5 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_sub_i32 s3, 32, s3 +; GFX10-NEXT: s_sub_i32 s2, 32, s2 +; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s4, 32, s7 -; GFX11-NEXT: s_sub_i32 s5, 32, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s4 -; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s5 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_sub_i32 s2, 32, s2 +; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = shl <2 x i32> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 938d24481aaf7..403a556688091 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -83,56 +83,54 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_alignbit_b32 v1, s7, s7, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_alignbit_b32 v0, s6, s6, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v4 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s7 -; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s6 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s5 -; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %tmp0 = sub <2 x i32> , %y diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll index 401b6f20d3405..f14a5cc19774d 100644 --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -56,8 +56,8 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 { } ; SI-LABEL: {{^}}s_addk_v2i32_k0: -; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index be10302c42854..76f8f484fc763 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -331,80 +331,79 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_sub_i32 s1, 0, s2 -; VI-NEXT: s_lshr_b32 s5, s2, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_sub_i32 s0, 0, s3 -; VI-NEXT: s_lshr_b32 s4, s3, 16 -; VI-NEXT: s_sub_i32 s5, 0, s5 -; VI-NEXT: s_ashr_i32 s6, s2, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sub_i32 s4, 0, s4 -; VI-NEXT: s_sext_i32_i16 
s5, s5 -; VI-NEXT: s_max_i32 s1, s2, s1 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_sext_i32_i16 s2, s3 -; VI-NEXT: s_max_i32 s5, s6, s5 -; VI-NEXT: s_ashr_i32 s6, s3, 16 +; VI-NEXT: s_lshr_b32 s7, s2, 16 +; VI-NEXT: s_sub_i32 s7, 0, s7 +; VI-NEXT: s_sub_i32 s4, 0, s3 +; VI-NEXT: s_lshr_b32 s6, s3, 16 +; VI-NEXT: s_ashr_i32 s8, s2, 16 +; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: s_sub_i32 s5, 0, s2 +; VI-NEXT: s_sub_i32 s6, 0, s6 +; VI-NEXT: s_max_i32 s7, s8, s7 +; VI-NEXT: s_ashr_i32 s8, s3, 16 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_max_i32 s0, s2, s0 -; VI-NEXT: s_max_i32 s4, s6, s4 -; VI-NEXT: s_add_i32 s0, s0, 2 -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_add_i32 s1, s1, 2 -; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_lshl_b32 s2, s5, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s1, s2, s1 -; VI-NEXT: s_add_i32 s0, s0, 0x20000 -; VI-NEXT: s_add_i32 s1, s1, 0x20000 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: s_sext_i32_i16 s6, s6 +; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_max_i32 s3, s3, s4 +; VI-NEXT: s_max_i32 s6, s8, s6 +; VI-NEXT: s_max_i32 s2, s2, s5 +; VI-NEXT: s_add_i32 s3, s3, 2 +; VI-NEXT: s_lshl_b32 s4, s6, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_or_b32 s3, s4, s3 +; VI-NEXT: s_lshl_b32 s4, s7, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s2, s4, s2 +; VI-NEXT: s_add_i32 s3, s3, 0x20000 +; VI-NEXT: s_add_i32 s2, s2, 0x20000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_abs_v4i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s0 -; CI-NEXT: s_mov_b32 s5, s1 -; CI-NEXT: s_ashr_i32 s0, s3, 16 -; CI-NEXT: s_ashr_i32 s1, s2, 16 -; CI-NEXT: s_lshr_b32 s8, s2, 16 -; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_sext_i32_i16 s10, s3 -; CI-NEXT: s_sext_i32_i16 s11, s2 -; CI-NEXT: s_sub_i32 s3, 0, s3 -; CI-NEXT: s_sub_i32 s2, 0, s2 -; CI-NEXT: s_sext_i32_i16 s3, s3 -; CI-NEXT: s_sext_i32_i16 s2, s2 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: s_ashr_i32 s6, s5, 16 +; CI-NEXT: s_lshr_b32 s9, s5, 16 +; CI-NEXT: s_sext_i32_i16 s10, s5 +; CI-NEXT: s_sub_i32 s5, 0, s5 +; CI-NEXT: s_ashr_i32 s7, s4, 16 +; CI-NEXT: s_lshr_b32 s8, s4, 16 +; CI-NEXT: s_sext_i32_i16 s11, s4 +; CI-NEXT: s_sext_i32_i16 s5, s5 +; CI-NEXT: s_sub_i32 s4, 0, s4 ; CI-NEXT: s_sub_i32 s9, 0, s9 -; CI-NEXT: s_sub_i32 s8, 0, s8 +; CI-NEXT: s_sext_i32_i16 s4, s4 ; CI-NEXT: s_sext_i32_i16 s9, s9 +; CI-NEXT: s_sub_i32 s8, 0, s8 +; CI-NEXT: s_max_i32 s5, s10, s5 ; CI-NEXT: s_sext_i32_i16 s8, s8 -; CI-NEXT: s_max_i32 s2, s11, s2 -; CI-NEXT: s_max_i32 s3, s10, s3 -; CI-NEXT: s_max_i32 s1, s1, s8 -; CI-NEXT: s_max_i32 s0, s0, s9 -; CI-NEXT: s_add_i32 s3, s3, 2 -; CI-NEXT: s_add_i32 s2, s2, 2 -; CI-NEXT: s_lshl_b32 s0, s0, 16 -; CI-NEXT: s_and_b32 s3, s3, 0xffff -; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: s_and_b32 s2, s2, 0xffff -; CI-NEXT: s_or_b32 s0, s0, s3 -; CI-NEXT: s_or_b32 s1, s1, s2 -; CI-NEXT: s_add_i32 s0, s0, 0x20000 -; CI-NEXT: s_add_i32 s1, s1, 0x20000 -; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: v_mov_b32_e32 v1, s0 -; CI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_max_i32 s6, s6, s9 +; CI-NEXT: s_max_i32 s4, s11, s4 +; CI-NEXT: s_add_i32 s5, s5, 2 +; CI-NEXT: s_max_i32 s7, s7, s8 +; CI-NEXT: s_lshl_b32 s6, s6, 16 +; CI-NEXT: s_and_b32 s5, s5, 0xffff +; CI-NEXT: s_add_i32 s4, s4, 2 +; CI-NEXT: s_or_b32 s5, s6, s5 +; CI-NEXT: s_lshl_b32 s6, s7, 16 +; CI-NEXT: s_and_b32 s4, s4, 0xffff +; CI-NEXT: s_or_b32 s4, s6, s4 +; CI-NEXT: s_add_i32 s5, s5, 0x20000 +; CI-NEXT: s_add_i32 s4, s4, 0x20000 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm %z0 = insertelement <4 x i16> poison, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll index d8f7f8d7fefcc..9b3b52012f327 100644 --- a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll @@ -136,8 +136,7 @@ define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) { ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; CHECK-NEXT: s_endpgm store <2 x i32> %v, ptr addrspace(4) %p @@ -164,8 +163,7 @@ define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; CHECK-NEXT: s_endpgm store <2 x float> %v, ptr addrspace(4) %p diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index fc42f476fe7d0..eaab3531824c4 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -164,102 +164,98 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s8 -; GFX6-NEXT: s_sub_i32 s2, s2, s4 -; GFX6-NEXT: 
s_sub_i32 s4, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s4, s2, s8 -; GFX6-NEXT: s_cmp_ge_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s2, s4, s2 -; GFX6-NEXT: s_sub_i32 s4, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 -; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s2 +; GFX6-NEXT: s_sub_i32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s2, 0, s3 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s9 -; GFX6-NEXT: s_sub_i32 s0, s3, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s9 -; GFX6-NEXT: s_cmp_ge_u32 s0, s9 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s9 -; GFX6-NEXT: s_cmp_ge_u32 s0, s9 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: s_sub_i32 s0, 0, s6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: s_sub_i32 s6, 0, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_mul_i32 s4, s4, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s4 -; GFX8-NEXT: s_sub_i32 s4, s2, s6 -; GFX8-NEXT: s_cmp_ge_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s4, s2 -; GFX8-NEXT: s_sub_i32 s4, s2, s6 -; GFX8-NEXT: s_cmp_ge_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s4, s2 -; GFX8-NEXT: s_sub_i32 s4, 0, s7 -; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8-NEXT: s_mul_i32 s6, s6, s2 +; GFX8-NEXT: s_sub_i32 s0, s0, s6 +; GFX8-NEXT: s_sub_i32 s6, s0, s2 +; GFX8-NEXT: s_cmp_ge_u32 s0, s2 +; GFX8-NEXT: s_cselect_b32 s0, s6, s0 +; GFX8-NEXT: s_sub_i32 s6, s0, s2 +; GFX8-NEXT: s_cmp_ge_u32 s0, s2 +; GFX8-NEXT: s_cselect_b32 s0, s6, s0 +; 
GFX8-NEXT: s_sub_i32 s2, 0, s3
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT: v_mul_hi_u32 v2, s3, v0
+; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v2
-; GFX8-NEXT: s_mul_i32 s0, s0, s7
-; GFX8-NEXT: s_sub_i32 s0, s3, s0
-; GFX8-NEXT: s_sub_i32 s1, s0, s7
-; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mul_i32 s0, s0, s3
+; GFX8-NEXT: s_sub_i32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s3
+; GFX8-NEXT: s_cmp_ge_u32 s0, s3
 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0
-; GFX8-NEXT: s_sub_i32 s1, s0, s7
-; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_sub_i32 s1, s0, s3
+; GFX8-NEXT: s_cmp_ge_u32 s0, s3
 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT: s_endpgm
 %result0 = udiv <2 x i32> %x, %y
 store <2 x i32> %result0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index bd311a1054a41..983acfc2c0699 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -340,8 +340,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GCN-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT: s_endpgm
 ;
diff --git a/llvm/test/Transforms/InstCombine/copy-access-metadata.ll b/llvm/test/Transforms/InstCombine/copy-access-metadata.ll
deleted file mode 100644
index c687f3796edcb..0000000000000
--- a/llvm/test/Transforms/InstCombine/copy-access-metadata.ll
+++ /dev/null
@@ -1,215 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -passes=instcombine %s | FileCheck %s
-
-@test.data = private unnamed_addr constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 4
-@test.ptrdata = private unnamed_addr constant [8 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null], align 8
-
-; Verify that InstCombine copies range metadata when cloning a load as part of
-; replacing an alloca initialized via memcpy from a constant.
-define i32 @copy_range_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_range_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !range [[RNG0:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !range !0
- ret i32 %l
-}
-
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
-
-!0 = !{i32 0, i32 100}
-
-; Verify TBAA metadata on a cloned load is preserved.
-define i32 @copy_tbaa_metadata_after_memcpy(i64 %x, ptr %sink) {
-; CHECK-LABEL: define i32 @copy_tbaa_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]], ptr [[SINK:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[SCALAR_TYPE_TBAA1:![0-9]+]]
-; CHECK-NEXT: store i32 [[L]], ptr [[SINK]], align 4
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !tbaa !1
- store i32 %l, ptr %sink, align 4
- ret i32 %l
-}
-
-!1 = !{!2, !2, i64 0}
-!2 = !{!"scalar type", !3}
-!3 = !{!"root"}
-
-; Verify dereferenceable_or_null metadata on a cloned load is preserved
-; when the loaded value type is a pointer.
-define ptr @copy_deref_or_null_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define ptr @copy_deref_or_null_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: ret ptr null
-;
-entry:
- %data = alloca [8 x ptr], align 8
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
- %arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
- %l = load ptr, ptr %arrayidx, align 8, !dereferenceable_or_null !4
- ret ptr %l
-}
-
-!4 = !{i64 8}
-
-; Verify nonnull metadata on a cloned load is preserved
-; when the loaded value type is a pointer.
-define ptr @copy_nonnull_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define ptr @copy_nonnull_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: ret ptr null
-;
-entry:
- %data = alloca [8 x ptr], align 8
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
- %arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
- %l = load ptr, ptr %arrayidx, align 8, !nonnull !5
- ret ptr %l
-}
-
-!5 = !{}
-
-; Verify invariant.load metadata on a cloned load is preserved.
-define i32 @copy_invariant_load_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_invariant_load_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !invariant.load [[META4:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !invariant.load !5
- ret i32 %l
-}
-
-; Verify alias.scope and noalias metadata on a cloned load are preserved.
-define i32 @copy_aliasscope_noalias_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_aliasscope_noalias_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META5]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !alias.scope !6, !noalias !6
- ret i32 %l
-}
-
-; Verify nontemporal metadata on a cloned load is preserved.
-define i32 @copy_nontemporal_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_nontemporal_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !nontemporal [[META8:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !nontemporal !9
- ret i32 %l
-}
-
-; Verify access group metadata on a cloned load is preserved.
-define i32 @copy_access_group_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_access_group_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !llvm.access.group !10
- ret i32 %l
-}
-
-; Verify noalias.addrspace metadata on a cloned load is preserved.
-define i32 @copy_noalias_addrspace_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_noalias_addrspace_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !noalias.addrspace [[META10:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !noalias.addrspace !12
- ret i32 %l
-}
-
-; Verify llvm.mem.parallel_loop_access metadata on a cloned load is preserved.
-define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access [[META11:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !llvm.mem.parallel_loop_access !13
- ret i32 %l
-}
-
-!6 = !{!7}
-!7 = distinct !{!7, !8}
-!8 = distinct !{!8}
-!9 = !{i32 1}
-!10 = distinct !{}
-!12 = !{i32 5, i32 6}
-!13 = !{!14}
-!14 = distinct !{}
-
-
-
-;.
-; CHECK: [[RNG0]] = !{i32 0, i32 100}
-; CHECK: [[SCALAR_TYPE_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
-; CHECK: [[META2]] = !{!"scalar type", [[META3:![0-9]+]]}
-; CHECK: [[META3]] = !{!"root"}
-; CHECK: [[META4]] = !{}
-; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
-; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
-; CHECK: [[META7]] = distinct !{[[META7]]}
-; CHECK: [[META8]] = !{i32 1}
-; CHECK: [[ACC_GRP9]] = distinct !{}
-; CHECK: [[META10]] = !{i32 5, i32 6}
-; CHECK: [[META11]] = !{[[META12:![0-9]+]]}
-; CHECK: [[META12]] = distinct !{}
-;.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/copy-metadata-load-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/copy-metadata-load-store.ll
deleted file mode 100644
index 7cb74c3cc2d2c..0000000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/copy-metadata-load-store.ll
+++ /dev/null
@@ -1,159 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
-
-; We expect the merged vector load to retain nontemporal and tbaa, and normalization to handle
-; other load-only metadata.
-define void @lsv_copy_load_metadata(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_load_metadata(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[CHAR_TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
-; CHECK-NEXT: [[LD01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-NEXT: [[LD1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: [[LD1_MUT_BC:%.*]] = bitcast i32 [[LD1_MUT2]] to <2 x i16>
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- %ld0 = load i32, ptr %p, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
- %ld1 = load <2 x i16>, ptr %p1, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
- ret void
-}
-
-; Check that metadata on stores is preserved when LSV normalizes mixed-typed
-; chains (exercises copyMetadataForAccess on stores).
-define void @lsv_copy_store_metadata(ptr %p) { -; CHECK-LABEL: define void @lsv_copy_store_metadata( -; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]] -; CHECK-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - store i32 7, ptr %p, align 4, !nontemporal !5 - store <2 x i16> , ptr %p1, align 4, !nontemporal !5 - ret void -} - -; Copy alias.scope and noalias metadata on vectorized stores. -define void @lsv_copy_store_alias_metadata(ptr %p) { -; CHECK-LABEL: define void @lsv_copy_store_alias_metadata( -; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META5]] -; CHECK-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - store i32 1, ptr %p, align 4, !alias.scope !11, !noalias !11 - store <2 x i16> , ptr %p1, align 4, !alias.scope !11, !noalias !11 - ret void -} - -; Copy access group metadata on vectorized stores. -define void @lsv_copy_store_access_group(ptr %p) { -; CHECK-LABEL: define void @lsv_copy_store_access_group( -; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4 -; CHECK-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - store i32 9, ptr %p, align 4, !llvm.access.group !14 - store <2 x i16> , ptr %p1, align 4, !llvm.access.group !14 - ret void -} - -; Copy noundef metadata on vectorized stores. -define void @lsv_copy_store_noundef(ptr %p) { -; CHECK-LABEL: define void @lsv_copy_store_noundef( -; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4 -; CHECK-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - store i32 42, ptr %p, align 4, !noundef !15 - store <2 x i16> , ptr %p1, align 4, !noundef !15 - ret void -} - -; Copy noalias.addrspace metadata on vectorized stores. -define void @lsv_copy_store_noalias_addrspace(ptr %p) { -; CHECK-LABEL: define void @lsv_copy_store_noalias_addrspace( -; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4 -; CHECK-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - store i32 11, ptr %p, align 4, !noalias.addrspace !16 - store <2 x i16> , ptr %p1, align 4, !noalias.addrspace !16 - ret void -} - -; Copy llvm.mem.parallel_loop_access metadata on vectorized stores. -define void @lsv_copy_store_mem_parallel_loop_access(ptr %p) { -; CHECK-LABEL: define void @lsv_copy_store_mem_parallel_loop_access( -; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4 -; CHECK-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - store i32 13, ptr %p, align 4, !llvm.mem.parallel_loop_access !17 - store <2 x i16> , ptr %p1, align 4, !llvm.mem.parallel_loop_access !17 - ret void -} - -; Normalized type is not a pointer in the following test, avoid copying -; dereferenceable_or_null metadata. 
-define void @lsv_no_copy_deref_or_null(ptr %p) {
-; CHECK-LABEL: define void @lsv_no_copy_deref_or_null(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P]], align 8
-; CHECK-NEXT: [[LD0_MUT1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT: [[LD12:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT: [[LD0_MUT_BC:%.*]] = inttoptr i64 [[LD0_MUT1]] to ptr
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- %ld0 = load ptr, ptr %p, align 4, !dereferenceable_or_null !7
- %ld1 = load i64, ptr %p1, align 4
- ret void
-}
-
-!0 = !{!3, !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{i32 1}
-!6 = !{}
-!7 = !{i64 8}
-!8 = !{i64 1, i64 256}
-!11 = !{!12}
-!12 = distinct !{!12, !13}
-!13 = distinct !{!13}
-!14 = distinct !{}
-!15 = !{}
-!16 = !{i32 5, i32 6}
-!17 = !{!18}
-!18 = distinct !{}
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-;.
-; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
-; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0}
-; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"}
-; CHECK: [[META3]] = !{}
-; CHECK: [[META4]] = !{i32 1}
-; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
-; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
-; CHECK: [[META7]] = distinct !{[[META7]]}
-;.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
index 64e8b1afb8c80..c53f4b6d7ff2b 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
@@ -1,273 +1,57 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
 
-define void @no_merge_i16_half(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @no_merge_i16_half(
+define void @merge_i32_v2i16_f32_v4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_i32_v2i16_f32_v4i8(
 ; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i16,
ptr addrspace(1) [[PTR1]], align 2 -; CHECK-NEXT: [[LOAD_1:%.*]] = load float, ptr addrspace(1) [[GEP_1]], align 4 -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i16 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 2 -; CHECK-NEXT: store float [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 4 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i16, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i16, ptr addrspace(1) %ptr1 - %load.1 = load float, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i16, ptr addrspace(2) %ptr2, i64 1 - store i16 %load.0, ptr addrspace(2) %ptr2 - store float %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @merge_i32_v2i16(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @merge_i32_v2i16( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[PTR1]], align 4 -; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[LOAD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[LOAD_1_MUT_BC:%.*]] = bitcast i32 [[LOAD_1_MUT2]] to <2 x i16> -; CHECK-NEXT: [[LOAD_1_BC:%.*]] = bitcast <2 x i16> [[LOAD_1_MUT_BC]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_01]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD_1_BC]], i32 1 -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(2) [[PTR2]], align 4 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i32, ptr addrspace(1) %ptr1 - %load.1 = load <2 x i16>, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1 - store i32 %load.0, ptr addrspace(2) %ptr2 - store <2 x i16> %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @no_merge_i32_ptr(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @no_merge_i32_ptr( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1 -; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4 -; CHECK-NEXT: [[LOAD_1:%.*]] = load ptr, ptr addrspace(1) [[GEP_1]], align 8 -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4 -; CHECK-NEXT: store ptr [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 8 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i32, ptr addrspace(1) %ptr1 - %load.1 = load ptr, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1 - store i32 %load.0, ptr addrspace(2) %ptr2 - store ptr %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @no_merge_i32_half(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @no_merge_i32_half( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1 -; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4 -; CHECK-NEXT: [[LOAD_1:%.*]] = load half, ptr addrspace(1) [[GEP_1]], align 2 -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = 
getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4 -; CHECK-NEXT: store half [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 2 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i32, ptr addrspace(1) %ptr1 - %load.1 = load half, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1 - store i32 %load.0, ptr addrspace(2) %ptr2 - store half %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @merge_i32_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @merge_i32_float( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[PTR1]], align 4 -; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[LOAD_12]] to float -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_01]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1 -; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(2) [[PTR2]], align 4 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i32, ptr addrspace(1) %ptr1 - %load.1 = load float, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1 - store i32 %load.0, ptr addrspace(2) %ptr2 - store float %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @no_merge_i32_double(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @no_merge_i32_double( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1 -; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4 -; CHECK-NEXT: [[LOAD_1:%.*]] = load double, ptr addrspace(1) [[GEP_1]], align 8 -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4 -; CHECK-NEXT: store double [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 8 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i32, ptr addrspace(1) %ptr1 - %load.1 = load double, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1 - store i32 %load.0, ptr addrspace(2) %ptr2 - store double %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @merge_i64_ptr(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @merge_i64_ptr( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8 -; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[LOAD_12]] to ptr -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8 -; CHECK-NEXT: store ptr 
[[TMP2]], ptr addrspace(2) [[STORE_GEP_1]], align 8 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i64, ptr addrspace(1) %ptr1 - %load.1 = load ptr, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1 - store i64 %load.0, ptr addrspace(2) %ptr2 - store ptr %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @no_merge_i64_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @no_merge_i64_float( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[PTR1]], i64 1 -; CHECK-NEXT: [[LOAD_0:%.*]] = load i64, ptr addrspace(1) [[PTR1]], align 8 -; CHECK-NEXT: [[LOAD_1:%.*]] = load float, ptr addrspace(1) [[GEP_1]], align 4 -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i64 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 8 -; CHECK-NEXT: store float [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 4 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i64, ptr addrspace(1) %ptr1 - %load.1 = load float, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1 - store i64 %load.0, ptr addrspace(2) %ptr2 - store float %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @merge_i64_double(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @merge_i64_double( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8 -; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[LOAD_12]] to double -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8 -; CHECK-NEXT: store double [[TMP2]], ptr addrspace(2) [[STORE_GEP_1]], align 8 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i64, ptr addrspace(1) %ptr1 - %load.1 = load double, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1 - store i64 %load.0, ptr addrspace(2) %ptr2 - store double %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @merge_i64_v2i32(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @merge_i64_v2i32( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8 -; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[LOAD_1_MUT2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[LOAD_1_MUT_BC:%.*]] = bitcast i64 [[LOAD_1_MUT2]] to <2 x i32> -; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1 -; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8 -; CHECK-NEXT: [[LOAD_1_BC:%.*]] = bitcast <2 x i32> [[LOAD_1_MUT_BC]] to i64 -; CHECK-NEXT: store i64 [[LOAD_1_BC]], ptr addrspace(2) [[STORE_GEP_1]], align 8 -; CHECK-NEXT: ret void -; - %gep.1 = getelementptr inbounds 
i64, ptr addrspace(1) %ptr1, i64 1 - %load.0 = load i64, ptr addrspace(1) %ptr1 - %load.1 = load <2 x i32>, ptr addrspace(1) %gep.1 - %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1 - store i64 %load.0, ptr addrspace(2) %ptr2 - store <2 x i32> %load.1, ptr addrspace(2) %store.gep.1 - ret void -} - -define void @merge_i32_v2i16_v4i8(ptr addrspace(1) %ptr1) { -; CHECK-LABEL: define void @merge_i32_v2i16_v4i8( -; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]]) { -; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP2]], align 4 -; CHECK-NEXT: [[LOAD2_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[LOAD4_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[LOAD2_MUT_BC:%.*]] = bitcast i32 [[LOAD2_MUT1]] to <2 x i16> -; CHECK-NEXT: [[LOAD4_MUT_BC:%.*]] = bitcast i32 [[LOAD4_MUT2]] to <4 x i8> -; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 3 -; CHECK-NEXT: [[LOAD_3:%.*]] = load float, ptr addrspace(1) [[GEP_3]], align 4 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR1]], i64 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 2 +; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR1]], i64 3 +; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4 +; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 +; CHECK-NEXT: store i32 [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(2) [[PTR2]], i64 1 +; CHECK-NEXT: store <2 x i16> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4 +; CHECK-NEXT: [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2 +; CHECK-NEXT: store float [[LOAD3]], ptr addrspace(2) [[STORE_GEP3]], align 4 +; CHECK-NEXT: [[STORE_GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(2) [[PTR2]], i64 3 +; CHECK-NEXT: store <4 x i8> [[LOAD4]], ptr addrspace(2) [[STORE_GEP4]], align 4 ; CHECK-NEXT: ret void ; - %load.0 = load i32, ptr addrspace(1) %ptr1, align 4 - %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1 - %load.1 = load <2 x i16>, ptr addrspace(1) %gep.1, align 4 - %gep.2 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 2 - %load.2 = load <4 x i8>, ptr addrspace(1) %gep.2, align 4 - %gep.3 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 3 - %load.3 = load float, ptr addrspace(1) %gep.3, align 4 + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0 + %load1 = load i32, ptr addrspace(1) %gep1, align 4 + %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1 + %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4 + %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2 + %load3 = load float, ptr addrspace(1) %gep3, align 4 + %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3 + %load4 
= load <4 x i8>, ptr addrspace(1) %gep4, align 4 + %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0 + store i32 %load1, ptr addrspace(2) %store.gep1, align 4 + %store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1 + store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4 + %store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2 + store float %load3, ptr addrspace(2) %store.gep3, align 4 + %store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3 + store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4 ret void } -define void @merge_float_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { -; CHECK-LABEL: define void @merge_float_v2f16_type( +define void @merge_f32_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { +; CHECK-LABEL: define void @merge_f32_v2f16_type( ; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP1]], align 4 -; CHECK-NEXT: [[LOAD1_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[LOAD2_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[LOAD1_TOORIG:%.*]] = bitcast i32 [[LOAD1_MUT1]] to float -; CHECK-NEXT: [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT2]] to <2 x half> +; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4 ; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 -; CHECK-NEXT: [[LOAD1_BC:%.*]] = bitcast float [[LOAD1_TOORIG]] to i32 -; CHECK-NEXT: [[LOAD2_BC:%.*]] = bitcast <2 x half> [[LOAD2_TOORIG]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD1_BC]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD2_BC]], i32 1 -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; CHECK-NEXT: store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1 +; CHECK-NEXT: store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0 @@ -304,3 +88,27 @@ define void @merge_v2f16_bfloat_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %p store <2 x half> %load2, ptr addrspace(2) %store.gep2, align 4 ret void } + +define void @no_merge_mixed_ptr_addrspaces(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { +; CHECK-LABEL: define void @no_merge_mixed_ptr_addrspaces( +; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[PTR1]], i64 0 +; CHECK-NEXT: [[LOAD1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) [[PTR1]], i64 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load ptr addrspace(2), ptr addrspace(1) [[GEP2]], align 4 +; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 +; CHECK-NEXT: store ptr addrspace(1) [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; 
CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) [[PTR2]], i64 1 +; CHECK-NEXT: store ptr addrspace(2) [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4 +; CHECK-NEXT: ret void +; + %gep1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %ptr1, i64 0 + %load1 = load ptr addrspace(1), ptr addrspace(1) %gep1, align 4 + %gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) %ptr1, i64 1 + %load2 = load ptr addrspace(2), ptr addrspace(1) %gep2, align 4 + %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0 + store ptr addrspace(1) %load1, ptr addrspace(2) %store.gep1, align 4 + %store.gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) %ptr2, i64 1 + store ptr addrspace(2) %load2, ptr addrspace(2) %store.gep2, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll index e6f2be25030c5..d6b51039d5b44 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -130,302 +130,24 @@ entry: ret void } +; Ideally this would be merged define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 { ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16( ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4 -; CHECK-NEXT: [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[LD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[LD_1_TOORIG:%.*]] = bitcast i32 [[LD_1_MUT2]] to <2 x i16> -; CHECK-NEXT: ret void -; -entry: - %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1 - - %ld.0 = load i32, ptr addrspace(1) %a - %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1 - - ret void -} - -define amdgpu_kernel void @no_merge_load_i32_v2i8(ptr addrspace(1) nocapture %a) #0 { -; CHECK-LABEL: define amdgpu_kernel void @no_merge_load_i32_v2i8( -; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1 ; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4 -; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i8>, ptr addrspace(1) [[A_1]], align 2 +; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4 ; CHECK-NEXT: ret void ; entry: %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1 %ld.0 = load i32, ptr addrspace(1) %a - %ld.1 = load <2 x i8>, ptr addrspace(1) %a.1 - - ret void -} - -define void @test_normalize_loads(ptr %p) { -; CHECK-OOB-RELAXED-LABEL: define void @test_normalize_loads( -; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]] -; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4 -; CHECK-OOB-RELAXED-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 -; CHECK-OOB-RELAXED-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 -; CHECK-OOB-RELAXED-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16> -; CHECK-OOB-RELAXED-NEXT: [[L0_EXT:%.*]] = zext i32 [[L01]] to i64 -; CHECK-OOB-RELAXED-NEXT: [[L1_CAST:%.*]] = bitcast <2 x i16> [[L1_MUT_BC]] to i32 -; CHECK-OOB-RELAXED-NEXT: [[L1_EXT:%.*]] = zext i32 [[L1_CAST]] to i64 -; CHECK-OOB-RELAXED-NEXT: 
[[ADD:%.*]] = add i64 [[L0_EXT]], [[L1_EXT]] -; CHECK-OOB-RELAXED-NEXT: store i64 [[ADD]], ptr null, align 8 -; CHECK-OOB-RELAXED-NEXT: ret void -; -; CHECK-OOB-STRICT-LABEL: define void @test_normalize_loads( -; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) { -; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]] -; CHECK-OOB-STRICT-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4 -; CHECK-OOB-STRICT-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 -; CHECK-OOB-STRICT-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 -; CHECK-OOB-STRICT-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16> -; CHECK-OOB-STRICT-NEXT: [[L0_EXT:%.*]] = zext i32 [[L01]] to i64 -; CHECK-OOB-STRICT-NEXT: [[L1_CAST:%.*]] = bitcast <2 x i16> [[L1_MUT_BC]] to i32 -; CHECK-OOB-STRICT-NEXT: [[L1_EXT:%.*]] = zext i32 [[L1_CAST]] to i64 -; CHECK-OOB-STRICT-NEXT: [[ADD:%.*]] = add i64 [[L0_EXT]], [[L1_EXT]] -; CHECK-OOB-STRICT-NEXT: store i64 [[ADD]], ptr null, align 8 -; CHECK-OOB-STRICT-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - %l0 = load i32, ptr %p - %l1 = load <2 x i16>, ptr %p1 - %l0_ext = zext i32 %l0 to i64 - %l1_cast = bitcast <2 x i16> %l1 to i32 - %l1_ext = zext i32 %l1_cast to i64 - %add = add i64 %l0_ext, %l1_ext - store i64 %add, ptr null - ret void -} - -define void @test_normalize_stores(ptr %p) { -; CHECK-OOB-RELAXED-LABEL: define void @test_normalize_stores( -; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR1]] { -; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]] -; CHECK-OOB-RELAXED-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4 -; CHECK-OOB-RELAXED-NEXT: ret void -; -; CHECK-OOB-STRICT-LABEL: define void @test_normalize_stores( -; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) { -; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]] -; CHECK-OOB-STRICT-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4 -; CHECK-OOB-STRICT-NEXT: ret void -; -entry: - %p1 = getelementptr i32, ptr %p, i64 1 - store i32 123, ptr %p - store <2 x i16> , ptr %p1 - ret void -} - -; TODO: Fix the below test -; Check that metadata on loads is preserved when LSV normalizes mixed-typed -; chains (exercises copyMetadataForAccess on loads). 
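; Editor's note (illustrative sketch, not part of the patch; all names below
; are hypothetical): the deleted tests that follow exercise the metadata
; propagation performed by the renamed copyMetadataForLoad() when LSV rewrites
; a scalar load as an extract from a wider vector load. Roughly:
;   before:  %v = load i32, ptr %p, align 4, !tbaa !0, !nontemporal !1
;   after:   %w = load <2 x i32>, ptr %p, align 4, !tbaa !0, !nontemporal !1
;            %v = extractelement <2 x i32> %w, i32 0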
-; TODO: Fix the below test
-; Check that metadata on loads is preserved when LSV normalizes mixed-typed
-; chains (exercises copyMetadataForAccess on loads).
-define void @lsv_copy_load_metadata(ptr %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @lsv_copy_load_metadata(
-; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
-; CHECK-OOB-RELAXED-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-OOB-RELAXED-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-OOB-RELAXED-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @lsv_copy_load_metadata(
-; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
-; CHECK-OOB-STRICT-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-OOB-STRICT-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-OOB-STRICT-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
-  %p1 = getelementptr i32, ptr %p, i64 1
-  %ld0 = load i32, ptr %p, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
-  %ld1 = load <2 x i16>, ptr %p1, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
-  ret void
-}
+  %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1
 
-; Check that metadata on stores is preserved when LSV normalizes mixed-typed
-; chains (exercises copyMetadataForAccess on stores).
-define void @lsv_copy_store_metadata(ptr %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @lsv_copy_store_metadata(
-; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @lsv_copy_store_metadata(
-; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: store <2 x i32> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
-  %p1 = getelementptr i32, ptr %p, i64 1
-  store i32 7, ptr %p, align 4, !nontemporal !5
-  store <2 x i16> , ptr %p1, align 4, !nontemporal !5
   ret void
 }
 
-!0 = !{!3, !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{i32 1}
-!6 = !{}
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
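; Editor's note (illustrative, with hypothetical constants; the original
; vector constants were lost in extraction): the deleted store test above
; checked the store-side counterpart of the same normalization, e.g.:
;   store i32 7, ptr %p, align 4, !nontemporal !5
;   store <2 x i16> <i16 1, i16 2>, ptr %p1, align 4, !nontemporal !5
; merging into one wide store that keeps the common metadata:
;   store <2 x i32> <i32 7, i32 bitcast (<2 x i16> <i16 1, i16 2> to i32)>,
;         ptr %p, align 4, !nontemporal !5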
-
-
-; Non power-of-two combined span (12 bytes) must not merge chains.
-define void @no_merge_non_pot_span(ptr addrspace(1) %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_non_pot_span(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[P:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[P8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 8
-; CHECK-OOB-RELAXED-NEXT: [[L1:%.*]] = load float, ptr addrspace(1) [[P8]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_non_pot_span(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-STRICT-NEXT: [[P8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 8
-; CHECK-OOB-STRICT-NEXT: [[L1:%.*]] = load float, ptr addrspace(1) [[P8]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
-  %l0 = load i32, ptr addrspace(1) %p, align 4
-  %p8 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
-  %l1 = load float, ptr addrspace(1) %p8, align 4
-  ret void
-}
-
-define void @no_merge_diff_ptrop(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_diff_ptrop(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[LOAD_1:%.*]] = load i32, ptr addrspace(2) [[PTR2]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_diff_ptrop(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-STRICT-NEXT: [[LOAD_1:%.*]] = load i32, ptr addrspace(2) [[PTR2]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-  %load.0 = load i32, ptr addrspace(1) %ptr1
-  %load.1 = load i32, ptr addrspace(2) %ptr2
-  ret void
-}
-
-define void @no_merge_load_store(ptr addrspace(1) %ptr1) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_load_store(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-RELAXED-NEXT: store i32 111, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_load_store(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-STRICT-NEXT: store i32 111, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-  %load.0 = load i32, ptr addrspace(1) %ptr1
-  store i32 111, ptr addrspace(1) %ptr1
-  ret void
-}
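; Editor's note (worked example of the span rule above; the arithmetic is
; mine): in @no_merge_non_pot_span the i32 covers bytes [0,4) and the float
; covers bytes [8,12), so a single access spanning both would have to cover
; 12 bytes. 12 is not a power of two, so no legal vector access covers the
; chain and the loads are left scalar.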
-
-; Stores in this test should not be vectorized as the total byte span
-; from the end of %gep.a to the end of %gep.b is not a power of 2. This
-; is a necessary condition for splitChainByAlignment.
-define void @check_contiguity_of_base_ptrs(ptr addrspace(1) %ptr) {
-; CHECK-OOB-RELAXED-LABEL: define void @check_contiguity_of_base_ptrs(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: store i32 274, ptr addrspace(1) [[PTR]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 4
-; CHECK-OOB-RELAXED-NEXT: store i64 3610770474484254748, ptr addrspace(1) [[GEP_A]], align 8
-; CHECK-OOB-RELAXED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 12
-; CHECK-OOB-RELAXED-NEXT: store <2 x i32> , ptr addrspace(1) [[GEP_B]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @check_contiguity_of_base_ptrs(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: store i32 274, ptr addrspace(1) [[PTR]], align 4
-; CHECK-OOB-STRICT-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 4
-; CHECK-OOB-STRICT-NEXT: store i64 3610770474484254748, ptr addrspace(1) [[GEP_A]], align 8
-; CHECK-OOB-STRICT-NEXT: [[GEP_B:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 12
-; CHECK-OOB-STRICT-NEXT: store <2 x i32> , ptr addrspace(1) [[GEP_B]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-  store i32 274, ptr addrspace(1) %ptr, align 4
-  %gep.a = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 4
-  store i64 3610770474484254748, ptr addrspace(1) %gep.a, align 8
-  %gep.b = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 12
-  store <2 x i32> , ptr addrspace(1) %gep.b, align 4
-  ret void
-}
-
-; Offset is unknown in the following test, LSV should fail to vectorize.
-define amdgpu_kernel void @assert_computeLeaderDelta(ptr addrspace(1) %a, i64 %idx) {
-; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @assert_computeLeaderDelta(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[A:%.*]], i64 [[IDX:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[LD0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[A]], i64 [[IDX]]
-; CHECK-OOB-RELAXED-NEXT: [[LD1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @assert_computeLeaderDelta(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[A:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[LD0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-OOB-STRICT-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[A]], i64 [[IDX]]
-; CHECK-OOB-STRICT-NEXT: [[LD1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
-  %ld0 = load i32, ptr addrspace(1) %a, align 4
-  %p1 = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %idx
-  %ld1 = load <2 x i16>, ptr addrspace(1) %p1, align 2
-  ret void
-}
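; Editor's note (illustrative sketch, assumptions mine): chain formation
; rebases each access against a leader at a compile-time-constant byte delta,
; e.g.:
;   %p1 = getelementptr inbounds i8, ptr addrspace(1) %a, i64 4      ; delta = 4
;   %p2 = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %idx   ; delta unknown
; With no constant delta, as in the deleted @assert_computeLeaderDelta above,
; the access cannot join the leader's chain and is left as-is.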
-
-; Overlapping ranges after rebasing should prevent merging across chains.
-define void @no_merge_overlap_after_rebase(ptr addrspace(1) %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_overlap_after_rebase(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[P:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 2
-; CHECK-OOB-RELAXED-NEXT: [[L1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_overlap_after_rebase(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-STRICT-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 2
-; CHECK-OOB-STRICT-NEXT: [[L1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
-  %l0 = load i32, ptr addrspace(1) %p, align 4
-  %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 2
-  %l1 = load <2 x i16>, ptr addrspace(1) %p1, align 2
-  ret void
-}
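; Editor's note (worked example, arithmetic mine): in the deleted test above
; the i32 load covers bytes [0,4) and the rebased <2 x i16> load covers bytes
; [2,6). The ranges overlap at bytes [2,4), so the two accesses cannot be laid
; out side by side in a single vector and the chains are not merged.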