From ad7773547682f7f8e11ae04b6d06e7a960cb6ad1 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 12 Jun 2025 12:20:38 +0100 Subject: [PATCH 1/2] [IA] Remove recursive [de]interleaving support Now that the loop vectorizer emits just a single llvm.vector.[de]interleaveN intrinsic, we can remove the need to recognise recursively [de]interleaved intrinsics. No in-tree target currently has instructions to emit an interleaved access with a factor > 8, and I'm not aware of any other passes that will emit recursive interleave patterns, so this code is effectively dead. Some tests have been converted from the recursive form to a single intrinsic, and some others were deleted that are no longer needed, e.g. to do with the recursive tree. This closes off the work started in #139893. --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 199 ++--------- .../rvv/fixed-vectors-deinterleave-load.ll | 67 ---- .../rvv/fixed-vectors-interleave-store.ll | 34 -- .../rvv/fixed-vectors-interleaved-access.ll | 14 +- .../RISCV/rvv/vector-deinterleave-load.ll | 67 ---- .../RISCV/rvv/vector-interleave-store.ll | 34 -- .../RISCV/rvv/vp-vector-interleaved-access.ll | 314 ++++-------------- .../AArch64/sve-deinterleave4.ll | 90 ++--- .../AArch64/sve-interleave4.ll | 17 +- .../RISCV/interleaved-accesses.ll | 196 ----------- 10 files changed, 127 insertions(+), 905 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 49f1504d244ed..9c4c86cebe7e5 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -629,173 +629,12 @@ static unsigned getIntrinsicFactor(const IntrinsicInst *II) { } } -// For an (de)interleave tree like this: -// -// A C B D -// |___| |___| -// |_____| -// | -// A B C D -// -// We will get ABCD at the end while the leaf operands/results -// are ACBD, which are also what we initially collected in -// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI -// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need -// to reorder them by interleaving these values. -static void interleaveLeafValues(MutableArrayRef SubLeaves) { - unsigned NumLeaves = SubLeaves.size(); - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); - if (NumLeaves == 2) - return; - - const unsigned HalfLeaves = NumLeaves / 2; - // Visit the sub-trees. - interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); - interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); - - SmallVector Buffer; - // a0 a1 a2 a3 b0 b1 b2 b3 - // -> a0 b0 a1 b1 a2 b2 a3 b3 - for (unsigned i = 0U; i < NumLeaves; ++i) - Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); - - llvm::copy(Buffer, SubLeaves.begin()); -} - -static bool -getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, - SmallVectorImpl &DeadInsts) { - assert(isInterleaveIntrinsic(II->getIntrinsicID())); - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - - // All the intermediate intrinsics will be deleted. - DeadInsts.push_back(Current); - - for (unsigned I = 0; I < getIntrinsicFactor(Current); ++I) { - Value *Op = Current->getOperand(I); - if (auto *OpII = dyn_cast(Op)) - if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { - Queue.push_back(OpII); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. 
- if (!Operands.empty() && Op->getType() != Operands.back()->getType()) - return false; - - Operands.push_back(Op); - } - } - - const unsigned Factor = Operands.size(); - // Currently we only recognize factors 2...8 and other powers of 2. - // FIXME: should we assert here instead? - if (Factor <= 1 || - (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) - return false; - - // Recursively interleaved factors need to have their values reordered - // TODO: Remove once the loop vectorizer no longer recursively interleaves - // factors 4 + 8 - if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) - interleaveLeafValues(Operands); - return true; -} - -static bool -getVectorDeinterleaveFactor(IntrinsicInst *II, - SmallVectorImpl &Results, - SmallVectorImpl &DeadInsts) { - assert(isDeinterleaveIntrinsic(II->getIntrinsicID())); - using namespace PatternMatch; - if (!II->hasNUses(getIntrinsicFactor(II))) - return false; - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - assert(Current->hasNUses(getIntrinsicFactor(Current))); - - // All the intermediate intrinsics will be deleted from the bottom-up. - DeadInsts.insert(DeadInsts.begin(), Current); - - SmallVector EVs(getIntrinsicFactor(Current), nullptr); - for (User *Usr : Current->users()) { - if (!isa(Usr)) - return 0; - - auto *EV = cast(Usr); - // Intermediate ExtractValue instructions will also be deleted. - DeadInsts.insert(DeadInsts.begin(), EV); - ArrayRef Indices = EV->getIndices(); - if (Indices.size() != 1) - return false; - - if (!EVs[Indices[0]]) - EVs[Indices[0]] = EV; - else - return false; - } - - // We have legal indices. At this point we're either going - // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : EVs) { - // Continue the traversal. We're playing safe here and matching only the - // expression consisting of a perfectly balanced binary tree in which all - // intermediate values are only used once. - if (EV->hasOneUse() && - match(EV->user_back(), - m_Intrinsic()) && - EV->user_back()->hasNUses(2)) { - auto *EVUsr = cast(EV->user_back()); - Queue.push_back(EVUsr); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Results.empty() && EV->getType() != Results.back()->getType()) - return false; - - // Save the leaf value. - Results.push_back(EV); - } - } - - const unsigned Factor = Results.size(); - // Currently we only recognize factors of 2...8 and other powers of 2. - // FIXME: should we assert here instead? 
- if (Factor <= 1 || - (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) - return 0; - - // Recursively interleaved factors need to have their values reordered - // TODO: Remove once the loop vectorizer no longer recursively interleaves - // factors 4 + 8 - if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) - interleaveLeafValues(Results); - return true; -} - static Value *getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { - SmallVector Operands; - SmallVector DeadInsts; - if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { - assert(!Operands.empty()); - if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; + if (isInterleaveIntrinsic(IMI->getIntrinsicID()) && + getIntrinsicFactor(IMI) == Factor && llvm::all_equal(IMI->args())) { + return IMI->getArgOperand(0); } } @@ -830,13 +669,19 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( if (!LoadedVal->hasOneUse() || !isa(LoadedVal)) return false; - SmallVector DeinterleaveValues; - SmallVector DeinterleaveDeadInsts; - if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues, - DeinterleaveDeadInsts)) + const unsigned Factor = getIntrinsicFactor(DI); + if (!DI->hasNUses(Factor)) return false; - - const unsigned Factor = DeinterleaveValues.size(); + SmallVector DeinterleaveValues(Factor); + for (auto *User : DI->users()) { + auto *Extract = dyn_cast(User); + if (!Extract || Extract->getNumIndices() != 1) + return false; + unsigned Idx = Extract->getIndices()[0]; + if (DeinterleaveValues[Idx]) + return false; + DeinterleaveValues[Idx] = Extract; + } if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) @@ -869,7 +714,9 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; } - DeadInsts.insert_range(DeinterleaveDeadInsts); + for (Value *V : DeinterleaveValues) + DeadInsts.insert(cast(V)); + DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. DeadInsts.insert(cast(LoadedVal)); return true; @@ -883,12 +730,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( if (!isa(StoredBy)) return false; - SmallVector InterleaveValues; - SmallVector InterleaveDeadInsts; - if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts)) - return false; - - const unsigned Factor = InterleaveValues.size(); + SmallVector InterleaveValues(II->args()); + const unsigned Factor = getIntrinsicFactor(II); if (auto *VPStore = dyn_cast(StoredBy)) { if (VPStore->getIntrinsicID() != Intrinsic::vp_store) @@ -922,7 +765,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // We now have a target-specific store, so delete the old one. 
DeadInsts.insert(cast(StoredBy)); - DeadInsts.insert_range(InterleaveDeadInsts); + DeadInsts.insert(II); return true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index c2ae1ce491389..3e822d357b667 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -293,31 +293,6 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 } -; TODO: Remove once recursive deinterleaving support is removed -define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4_recursive(ptr %p) { -; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: ret - %vec = load <32 x i8>, ptr %p - %d0 = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec) - %d0.0 = extractvalue { <16 x i8>, <16 x i8> } %d0, 0 - %d0.1 = extractvalue { <16 x i8>, <16 x i8> } %d0, 1 - %d1 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.0) - %t0 = extractvalue { <8 x i8>, <8 x i8> } %d1, 0 - %t2 = extractvalue { <8 x i8>, <8 x i8> } %d1, 1 - %d2 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.1) - %t1 = extractvalue { <8 x i8>, <8 x i8> } %d2, 0 - %t3 = extractvalue { <8 x i8>, <8 x i8> } %d2, 1 - - %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } undef, <8 x i8> %t0, 0 - %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 - %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 - %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 - ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 -} - define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: @@ -414,45 +389,3 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, < %res7 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6, <8 x i8> %t6, 7 ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res7 } - -; TODO: Remove once recursive deinterleaving support is removed -define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8_recursive(ptr %ptr) { -; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %vec = load <16 x i32>, ptr %ptr - %d0 = call { <8 x i32>, <8 x i32> } @llvm.vector.deinterleave2.v16i32(<16 x i32> %vec) - %d0.0 = extractvalue { <8 x i32>, <8 x i32> } %d0, 0 - %d0.1 = extractvalue { <8 x i32>, <8 x i32> } %d0, 1 - %d1 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.0) - %d1.0 = extractvalue { <4 x i32>, <4 x i32> } %d1, 0 - %d1.1 = extractvalue { <4 x i32>, <4 x i32> } %d1, 1 - %d2 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.1) - %d2.0 = extractvalue { <4 x i32>, <4 x i32> } %d2, 0 - %d2.1 = extractvalue { <4 x i32>, <4 x i32> } %d2, 1 - - %d3 = call { <2 x i32>, <2 x i32> } 
@llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d3, 0 - %t4 = extractvalue { <2 x i32>, <2 x i32> } %d3, 1 - %d4 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.1) - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d4, 0 - %t6 = extractvalue { <2 x i32>, <2 x i32> } %d4, 1 - %d5 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.0) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d5, 0 - %t5 = extractvalue { <2 x i32>, <2 x i32> } %d5, 1 - %d6 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.1) - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d6, 0 - %t7 = extractvalue { <2 x i32>, <2 x i32> } %d6, 1 - - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 - %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 - %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 - %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 - %res4 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3, <2 x i32> %t4, 4 - %res5 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res4, <2 x i32> %t5, 5 - %res6 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res5, <2 x i32> %t6, 6 - %res7 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res6, <2 x i32> %t7, 7 - ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res7 -} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index c394e7aa2e3e8..a49eeed3605c5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -203,20 +203,6 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3 ret void } -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor4_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %c) - %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %d) - %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) - store <16 x i32> %v2, ptr %p - ret void -} - define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor5: ; CHECK: # %bb.0: @@ -260,23 +246,3 @@ define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i3 store <32 x i32> %v, ptr %p ret void } - -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor8_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> 
%e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %e) - %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %c, <4 x i32> %g) - %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) - - %v3 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %f) - %v4 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %d, <4 x i32> %h) - %v5 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v3, <8 x i32> %v4) - - %v6 = call <32 x i32> @llvm.vector.interleave2.v32i32(<16 x i32> %v2, <16 x i32> %v5) - store <32 x i32> %v6, ptr %p - ret void -} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 8ac4c7447c7d4..5e3ae2faf1a53 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -302,15 +302,11 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vpload_factor4_intrinsics(p ; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) - %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) - %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 - %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 - %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 - %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + %d = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.vector.deinterleave4.v8i32(<8 x i32> %wide.masked.load) + %t0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 0 + %t1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 1 + %t2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 2 + %t3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 3 %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 9344c52098684..b11db3d61f693 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -380,31 +380,6 @@ define { , , , , , , } %res3 } -; TODO: Remove once recursive deinterleaving support is removed -define { , , , } @vector_deinterleave_load_factor4_recursive(ptr %p) { -; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: ret - %vec = load , ptr %p - %d0 = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.0) - %t0 = extractvalue 
{ , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } undef, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - define { , , , , } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: @@ -500,45 +475,3 @@ define { , , , , , , , , , , } %res6, %t7, 7 ret { , , , , , , , } %res7 } - -; TODO: Remove once recursive deinterleaving support is removed -define {, , , , , , , } @vector_deinterleave_load_factor8_recursive(ptr %ptr) { -; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t4 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t6 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t1 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t3 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 - - %res0 = insertvalue { , , , , , , , } undef, %t0, 0 - %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 - %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 - %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 - %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 - %res7 = insertvalue { , , , , , , , } %res6, %t7, 7 - ret { , , , , , , , } %res7 -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 3751967f18aa4..f0cbf6a006919 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -261,20 +261,6 @@ define void @vector_interleave_store_factor4( %a, %a, %b, %c, %d, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call @llvm.vector.interleave2.nxv8i32( %a, %c) - %v1 = call @llvm.vector.interleave2.nxv8i32( %b, %d) - %v2 = call @llvm.vector.interleave2.nxv16i32( %v0, %v1) - store %v2, ptr %p - ret void -} - define void @vector_interleave_store_factor5( %a, %b, %c, %d, %e, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor5: ; CHECK: # %bb.0: @@ -318,23 +304,3 @@ define void @vector_interleave_store_factor8( %a, %v, ptr %p ret void } - -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor8_recursive( %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) { -; CHECK-LABEL: 
vector_interleave_store_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call @llvm.vector.interleave2.nxv4i32( %a, %e) - %v1 = call @llvm.vector.interleave2.nxv4i32( %c, %g) - %v2 = call @llvm.vector.interleave2.nxv8i32( %v0, %v1) - - %v3 = call @llvm.vector.interleave2.nxv4i32( %b, %f) - %v4 = call @llvm.vector.interleave2.nxv4i32( %d, %h) - %v5 = call @llvm.vector.interleave2.nxv8i32( %v3, %v4) - - %v6 = call @llvm.vector.interleave2.nxv16i32( %v2, %v5) - store %v6, ptr %p - ret void -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 142ee5256f9e7..4e21fcf85c2c8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -84,15 +84,11 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 @@ -214,28 +210,15 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t4 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t6 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t1 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t3 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 + %d = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , , , , , , , } %d, 0 + %t1 = extractvalue { , , , , , , , } %d, 1 + %t2 = extractvalue { , , , , , , , } %d, 2 + %t3 = extractvalue { , , , , , , , } %d, 3 + %t4 = extractvalue { , , , , , , , } %d, 4 + %t5 = extractvalue { , , , , , , , } %d, 5 + %t6 = extractvalue { , , , , , , , } %d, 6 + %t7 = extractvalue { , , , , , , , } %d, 7 %res0 = insertvalue { , , , , , , , } poison, %t0, 0 %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 @@ -323,10 +306,8 @@ define void @store_factor4_v2( %v0, %v1, pt ; RV64-NEXT: vsseg4e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 8 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call 
@llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) + %interleaved.vec = call @llvm.vector.interleave4.nxv4i32( %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -430,14 +411,8 @@ define void @store_factor8_v2( %v0, %v1, pt ; RV64-NEXT: vsseg8e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 8 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - %interleaved.vec3 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) - %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, splat (i1 true), i32 %rvl) + %interleaved.vec = call @llvm.vector.interleave8.nxv8i32( %v0, %v1, %v0, %v1, %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -485,19 +460,13 @@ define {, , , @llvm.vector.interleave2.nxv4i1( %mask, %mask) - %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) - %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask2, i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %interleaved.mask = call @llvm.vector.interleave4.nxv8i1( %mask, %mask, %mask, %mask) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 @@ -677,181 +646,14 @@ define void @masked_store_factor4_v2( %mask, ; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t ; RV64-NEXT: ret %rvl = mul i32 %evl, 4 - %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) - %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) - %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, %interleaved.mask2, i32 %rvl) + %interleaved.mask = call @llvm.vector.interleave4.nxv4i1( %mask, %mask, %mask, %mask) + %interleaved.vec = call 
@llvm.vector.interleave4.nxv2i32( %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) ret void } ; Negative tests -; We should not transform this function because the deinterleave tree is not in a desired form. -define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %evl) { -; RV32-LABEL: incorrect_extract_value_index: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wi v12, v8, 0 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v9, v12, a0 -; RV32-NEXT: vnsrl.wi v8, v12, 0 -; RV32-NEXT: vmv.v.v v10, v9 -; RV32-NEXT: vmv.v.v v11, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: incorrect_extract_value_index: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wi v12, v8, 0 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v9, v12, a0 -; RV64-NEXT: vnsrl.wi v8, v12, 0 -; RV64-NEXT: vmv.v.v v10, v9 -; RV64-NEXT: vmv.v.v v11, v9 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 0 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 1 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } poison, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - -; We should not transform this function because the expression is not a balanced tree. 
-define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %evl) { -; RV32-LABEL: not_balanced_load_tree: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wx v8, v12, a0 -; RV32-NEXT: vnsrl.wi v16, v12, 0 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wi v10, v16, 0 -; RV32-NEXT: vnsrl.wx v11, v16, a0 -; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV32-NEXT: vnsrl.wx v12, v11, a0 -; RV32-NEXT: vnsrl.wi v11, v11, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: not_balanced_load_tree: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wx v8, v12, a0 -; RV64-NEXT: vnsrl.wi v16, v12, 0 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wi v10, v16, 0 -; RV64-NEXT: vnsrl.wx v11, v16, a0 -; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV64-NEXT: vnsrl.wx v12, v11, a0 -; RV64-NEXT: vnsrl.wi v11, v11, 0 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %t0 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t1 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } poison, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - -define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %evl) { -; RV32-LABEL: not_balanced_store_tree: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v12, v8, v8 -; RV32-NEXT: li a2, -1 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: vwmaccu.vx v12, a2, v8 -; RV32-NEXT: srli a3, a3, 3 -; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vx v8, v12, a3 -; RV32-NEXT: add a4, a3, a3 -; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v12, v8, a3 -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV32-NEXT: vwaddu.vv v16, v12, v9 -; RV32-NEXT: vwmaccu.vx v16, a2, v9 -; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma -; RV32-NEXT: vwaddu.vv v12, v16, v10 -; RV32-NEXT: vwmaccu.vx v12, a2, v10 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vse32.v v12, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: not_balanced_store_tree: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v12, v8, v8 -; RV64-NEXT: li a2, -1 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: vwmaccu.vx v12, a2, v8 -; RV64-NEXT: srli a3, a3, 3 -; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vx v8, v12, a3 -; RV64-NEXT: add a4, a3, a3 -; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; RV64-NEXT: vslideup.vx v12, v8, a3 -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vwaddu.vv v16, v12, v9 -; RV64-NEXT: vwmaccu.vx v16, a2, v9 -; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma -; 
RV64-NEXT: vwaddu.vv v12, v16, v10 -; RV64-NEXT: vwmaccu.vx v12, a2, v10 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vse32.v v12, (a0) -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) - ret void -} - define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: @@ -943,48 +745,58 @@ define {, } @not_same_mask( define {, , , } @invalid_evl(ptr %ptr, i32 %evl) { ; RV32-LABEL: invalid_evl: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV32-NEXT: ori a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wx v12, v8, a0 -; RV32-NEXT: vnsrl.wi v14, v8, 0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs4r.v v8, (a0) ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v10, v14, a0 -; RV32-NEXT: vnsrl.wi v8, v14, 0 -; RV32-NEXT: vnsrl.wx v11, v12, a0 -; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: vlseg4e32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: invalid_evl: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV64-NEXT: ori a1, a1, 1 ; RV64-NEXT: slli a1, a1, 32 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wx v12, v8, a0 -; RV64-NEXT: vnsrl.wi v14, v8, 0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs4r.v v8, (a0) ; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v10, v14, a0 -; RV64-NEXT: vnsrl.wi v8, v14, 0 -; RV64-NEXT: vnsrl.wx v11, v12, a0 -; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: vlseg4e32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %rvl = or i32 %evl, 1 %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %d = call { , , , } 
@llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index 61a68692ff5b9..c565066541d1d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -16,17 +16,13 @@ define void @deinterleave4(ptr %src) { ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 - %sum = add %5, %7 - %sub = sub %6, %8 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 + %sum = add %1, %2 + %sub = sub %3, %4 ret void } @@ -58,17 +54,13 @@ define void @wide_deinterleave4(ptr %src) { ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv32i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 - %sum = add %5, %7 - %sub = sub %6, %8 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv32i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 + %sum = add %1, %2 + %sub = sub %3, %4 ret void } @@ -87,52 +79,36 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) { ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 
%load2 = load , ptr %src, align 4 - %deinterleave_src2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load2) - %ld2_1 = extractvalue { , } %deinterleave_src2, 0 - %ld2_2 = extractvalue { , } %deinterleave_src2, 1 + %deinterleave2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load2) + %ld2_1 = extractvalue { , } %deinterleave2, 0 + %ld2_2 = extractvalue { , } %deinterleave2, 1 ret void } define void @negative_deinterleave4_test(ptr %src) { ; CHECK-LABEL: define void @negative_deinterleave4_test ; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[SRC]], i64 0 -; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), ptr [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr , ptr [[SRC]], i64 2 -; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), ptr [[TMP6]]) -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[LDN1]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP7]], i64 4) -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP5]], [[TMP9]], i64 4) -; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[DEINTERLEAVE_HALF1]], 0 -; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP10]]) -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[DEINTERLEAVE_HALF2]], 1 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[SRC]], align 4 +; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[LOAD]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %6 = extractvalue { , } %deinterleave_half2, 1 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 2 ret void } diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll index 085089978d8f5..a61db6577d56d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll @@ -8,9 +8,7 @@ define void @interleave4(ptr %dst, %a, %b, ; CHECK-NEXT: call void 
@llvm.aarch64.sve.st4.nxv4i32( [[A]], [[B]], [[C]], [[D]], splat (i1 true), ptr [[DST]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv8i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv16i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv16i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst, align 4 ret void } @@ -32,9 +30,7 @@ define void @wide_interleave4(ptr %dst, %a, [[TMP7]], [[TMP8]], [[TMP9]], [[TMP10]], splat (i1 true), ptr [[TMP6]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv16i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv16i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv32i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv32i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst, align 4 ret void } @@ -46,9 +42,7 @@ define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, [[A]], [[C]], splat (i1 true), ptr [[DST2]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv8i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv16i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv16i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst1, align 4 %interleaved = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) @@ -64,8 +58,7 @@ define void @duplicate_by_interleave( %A, % ; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[A]], [[A]], [[B]], [[B]], splat (i1 true), ptr [[AB_DUPLICATE]]) ; CHECK-NEXT: ret void ; - %interleave = tail call @llvm.vector.interleave2.nxv8i32( %A, %B) - %duplicate_by_interleave = tail call @llvm.vector.interleave2.nxv16i32( %interleave, %interleave) - store %duplicate_by_interleave, ptr %AB_duplicate, align 4 + %interleave = tail call @llvm.vector.interleave4.nxv16i32( %A, %A, %B, %B) + store %interleave, ptr %AB_duplicate, align 4 ret void } diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index 87b16d17aa5f0..72c1f22032bb7 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -175,53 +175,6 @@ define void @load_factor4_vscale(ptr %ptr) { ret void } -; TODO: Remove once recursive deinterleaving support is removed -define void @load_factor4_vscale_recursive(ptr %ptr) { -; RV32-LABEL: @load_factor4_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5) -; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) -; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 -; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) -; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 -; RV32-NEXT: [[TMP6:%.*]] = call 
@llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) -; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 -; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) -; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 -; RV32-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 -; RV32-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 -; RV32-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 -; RV32-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 -; RV32-NEXT: ret void -; -; RV64-LABEL: @load_factor4_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5) -; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) -; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 -; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) -; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 -; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) -; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 -; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) -; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 -; RV64-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 -; RV64-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 -; RV64-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 -; RV64-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 -; RV64-NEXT: ret void -; - %interleaved.vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %t2 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - ret void -} - define void @load_factor5(ptr %ptr) { ; RV32-LABEL: @load_factor5( ; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.mask.v4i32.p0.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4) @@ -590,91 +543,6 @@ define void @load_factor8_vscale(ptr %ptr) { ret void } -; TODO: Remove once recursive deinterleaving support is removed -define void @load_factor8_vscale_recursive(ptr %ptr) { -; RV32-LABEL: @load_factor8_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5) -; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) -; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 -; RV32-NEXT: [[TMP4:%.*]] = call 
@llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) -; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 -; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) -; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 -; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) -; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 -; RV32-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) -; RV32-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 -; RV32-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) -; RV32-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 -; RV32-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) -; RV32-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 -; RV32-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) -; RV32-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 -; RV32-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 -; RV32-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 -; RV32-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 -; RV32-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 -; RV32-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 -; RV32-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 -; RV32-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 -; RV32-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 -; RV32-NEXT: ret void -; -; RV64-LABEL: @load_factor8_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5) -; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) -; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 -; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) -; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 -; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) -; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 -; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) -; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 -; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) -; RV64-NEXT: [[TMP11:%.*]] = 
insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 -; RV64-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) -; RV64-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 -; RV64-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) -; RV64-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 -; RV64-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) -; RV64-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 -; RV64-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 -; RV64-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 -; RV64-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 -; RV64-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 -; RV64-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 -; RV64-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 -; RV64-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 -; RV64-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 -; RV64-NEXT: ret void -; - %interleaved.vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t1 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t3 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t4 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t6 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 - ret void -} - define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) { ; RV32-LABEL: @store_factor2( @@ -808,31 +676,6 @@ define void @store_factor4_vscale(ptr %ptr, %v0, %v0, %v1) { -; RV32-LABEL: @store_factor4_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) -; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) -; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) -; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3) -; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i32(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3) -; RV32-NEXT: ret void -; -; RV64-LABEL: 
@store_factor4_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) -; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) -; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) -; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3) -; RV64-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i64 -1, i64 3) -; RV64-NEXT: ret void -; - %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1) - store %i2, ptr %ptr, align 4 - ret void -} - define void @store_factor5_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4) { ; RV32-LABEL: @store_factor5_vscale( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, [[V0:%.*]], i32 0) @@ -1013,45 +856,6 @@ define void @store_factor8_vscale(ptr %ptr, %v0, %v0, %v1, %v2, %v3) { -; RV32-LABEL: @store_factor8_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) -; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1) -; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2) -; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3) -; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4) -; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5) -; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6) -; RV32-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7) -; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3) -; RV32-NEXT: ret void -; -; RV64-LABEL: @store_factor8_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) 
poison, [[V0:%.*]], i32 0)
-; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1)
-; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2)
-; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3)
-; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4)
-; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5)
-; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6)
-; RV64-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7)
-; RV64-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i64 -1, i64 3)
-; RV64-NEXT: ret void
-;
- %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
- %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
- %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1)
-
- %i3 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3)
- %i4 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3)
- %i5 = call @llvm.vector.interleave2.nxv16i8( %i3, %i4)
-
- %i6 = call @llvm.vector.interleave2.nxv32i8( %i2, %i5)
- store %i6, ptr %ptr, align 4
- ret void
-}
-
 define void @load_factor2_fp128(ptr %ptr) {
 ; RV32-LABEL: @load_factor2_fp128(
 ; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16

From 83e1cd76fa14b66e67d0c5057d59770c9d230385 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 25 Jun 2025 11:18:40 +0100
Subject: [PATCH 2/2] Update test

---
 .../RISCV/rvv/vp-vector-interleaved-access.ll | 94 +++++++++----------
 1 file changed, 44 insertions(+), 50 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 4e21fcf85c2c8..35f01f608b56e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -536,38 +536,37 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( %
 ; RV32: # %bb.0:
 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmv1r.v v8, v0
-; RV32-NEXT: slli a2, a1, 1
 ; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: li a1, -1
+; RV32-NEXT: li a2, -1
 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmerge.vim v11, v9, 1, v0
-; RV32-NEXT: srli a3, a3, 2
 ; RV32-NEXT: vwaddu.vv v12, v11, v11
-; RV32-NEXT: vwmaccu.vx v12, a1, v11
+; RV32-NEXT: vwmaccu.vx v12, a2, v11
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 2
 ; RV32-NEXT: vmsne.vi v0, v12, 0
-; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; RV32-NEXT: vslidedown.vx v11, v12, a3
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v11, v12, a2
 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v11, 0
-; RV32-NEXT: add a1, a3, a3
+; RV32-NEXT: slli a3, a1, 1
 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vx v10, v9, a3
-; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v9, a2
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v10, 0
 ; RV32-NEXT: vle32.v v10, (a0), v0.t
 ; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT: vnsrl.wx v13, v10, a1
 ; RV32-NEXT: vmv.x.s a1, v10
 ; RV32-NEXT: vnsrl.wi v12, v10, 0
-; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: srli a3, a3, 1
 ; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT: vsetvli zero, a3, e32, m1, ta, ma
 ; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t
 ; RV32-NEXT: mv a0, a1
 ; RV32-NEXT: ret
@@ -580,26 +579,24 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( %
 ; RV64-NEXT: li a2, -1
 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a4, a1, 33
-; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmerge.vim v11, v9, 1, v0
-; RV64-NEXT: srli a3, a3, 2
 ; RV64-NEXT: vwaddu.vv v12, v11, v11
 ; RV64-NEXT: vwmaccu.vx v12, a2, v11
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 2
 ; RV64-NEXT: vmsne.vi v0, v12, 0
-; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; RV64-NEXT: vslidedown.vx v11, v12, a3
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v11, v12, a2
 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0
-; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmsne.vi v0, v11, 0
-; RV64-NEXT: add a1, a3, a3
+; RV64-NEXT: slli a3, a1, 33
 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vx v10, v9, a3
 ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v9, a2
 ; RV64-NEXT: vmsne.vi v0, v10, 0
-; RV64-NEXT: srli a1, a4, 32
+; RV64-NEXT: srli a1, a3, 32
 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
 ; RV64-NEXT: vle32.v v10, (a0), v0.t
 ; RV64-NEXT: li a1, 32
@@ -607,9 +604,9 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( %
 ; RV64-NEXT: vnsrl.wx v13, v10, a1
 ; RV64-NEXT: vmv.x.s a1, v10
 ; RV64-NEXT: vnsrl.wi v12, v10, 0
-; RV64-NEXT: srli a4, a4, 33
+; RV64-NEXT: srli a3, a3, 33
 ; RV64-NEXT: vmv1r.v v0, v8
-; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; RV64-NEXT: vsetvli zero, a3, e32, m1, ta, ma
 ; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t
 ; RV64-NEXT: mv a0, a1
 ; RV64-NEXT: ret
@@ -660,29 +657,28 @@ define {, } @not_same_mask(
 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmv1r.v v9, v0
 ; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: slli a1, a1, 1
 ; RV32-NEXT: vmv.v.i v8, 0
 ; RV32-NEXT: li a2, -1
 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmerge.vim v11, v8, 1, v0
 ; RV32-NEXT: vmv1r.v v0, v9
 ; RV32-NEXT: vmerge.vim v9, v8, 1, v0
-; RV32-NEXT: srli a3, a3, 2
 ; RV32-NEXT: vwaddu.vv v12, v9, v11
 ; RV32-NEXT: vwmaccu.vx v12, a2, v11
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 2
 ; RV32-NEXT: vmsne.vi v0, v12, 0
-; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; RV32-NEXT: vslidedown.vx v9, v12, a3
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v9, v12, a2
 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v9, 0
-; RV32-NEXT: add a2, a3, a3
+; RV32-NEXT: slli a1, a1, 1
 ; RV32-NEXT: vmerge.vim v8, v8, 1, v0
-; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vx v10, v8, a3
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v8, a2
 ; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v10, 0
 ; RV32-NEXT: vle32.v v10, (a0), v0.t
@@ -701,26 +697,24 @@ define {, } @not_same_mask(
 ; RV64-NEXT: li a2, -1
 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a1, a1, 33
-; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmerge.vim v11, v8, 1, v0
 ; RV64-NEXT: vmv1r.v v0, v9
 ; RV64-NEXT: vmerge.vim v9, v8, 1, v0
-; RV64-NEXT: srli a3, a3, 2
 ; RV64-NEXT: vwaddu.vv v12, v9, v11
 ; RV64-NEXT: vwmaccu.vx v12, a2, v11
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 2
 ; RV64-NEXT: vmsne.vi v0, v12, 0
-; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; RV64-NEXT: vslidedown.vx v9, v12, a3
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v9, v12, a2
 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0
-; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmsne.vi v0, v9, 0
-; RV64-NEXT: add a2, a3, a3
+; RV64-NEXT: slli a1, a1, 33
 ; RV64-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vx v10, v8, a3
-; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v8, a2
 ; RV64-NEXT: vmsne.vi v0, v10, 0
 ; RV64-NEXT: srli a1, a1, 32
 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma