From 5afaab4d474b996b5835422085ba98924b08101d Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Thu, 22 Aug 2024 04:30:05 +0000 Subject: [PATCH 01/13] [LV]: Teach LV to recursively (de)interleave. Currently available intrinsics are only ld2/st2, which don't support interleaving factor > 2. This patch teaches the LV to use ld2/st2 recursively to support high interleaving factors. Change-Id: I96af28dc6aeca0c6929d604176cc9ba29fca17df --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 136 ++++++++++++----- .../AArch64/sve-deinterleave4.ll | 144 +++++++++++++++++- .../AArch64/sve-interleaved-accesses.ll | 2 +- 4 files changed, 249 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index de164ee434d64..cfd010a6bca18 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9159,9 +9159,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. - assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5903ad29af760..3b230d62aa67b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -35,6 +35,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include +#include using namespace llvm; @@ -2779,10 +2780,39 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + unsigned InterleaveFactor = Vals.size(); + SmallVector InterleavingValues; + unsigned InterleavingValuesCount = + InterleaveFactor + (InterleaveFactor - 2); + InterleavingValues.resize(InterleaveFactor); + // Place the values to be interleaved in the correct order for the + // interleaving + for (unsigned I = 0, J = InterleaveFactor / 2, K = 0; K < InterleaveFactor; + K++) { + if (K % 2 == 0) { + InterleavingValues[K] = Vals[I]; + I++; + } else { + InterleavingValues[K] = Vals[J]; + J++; + } + } +#ifndef NDEBUG + for (Value *Val : InterleavingValues) + assert(Val && "NULL Interleaving Value"); +#endif + for (unsigned I = 1; I < InterleavingValuesCount; I += 2) { + VectorType *InterleaveTy = + cast(InterleavingValues[I]->getType()); + VectorType *WideVecTy = + VectorType::getDoubleElementsVectorType(InterleaveTy); + auto *InterleaveRes = Builder.CreateIntrinsic( + WideVecTy, Intrinsic::vector_interleave2, + {InterleavingValues[I - 1], InterleavingValues[I]}, + /*FMFSource=*/nullptr, Name); + InterleavingValues.push_back(InterleaveRes); + } + return InterleavingValues[InterleavingValuesCount]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2868,15 +2898,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops; + Ops.resize(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -2916,35 +2943,76 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); // Scalable vectors cannot use arbitrary shufflevectors (only splats), // so must use intrinsics to deinterleave. - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); - - if (!Member) - continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); - StridedVec = - createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); - } - if (Group->isReverse()) - StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); + SmallVector DeinterleavedValues; + // If the InterleaveFactor is > 2, so we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsice + // supports only Factor of 2. DeinterleaveCount represent how many times + // we will do deinterleaving, we will do deinterleave on all nonleaf + // nodes in the deinterleave tree. + unsigned DeinterleaveCount = InterleaveFactor - 1; + std::queue TempDeinterleavedValues; + TempDeinterleavedValues.push(NewLoad); + for (unsigned I = 0; I < DeinterleaveCount; ++I) { + Value *ValueToDeinterleave = TempDeinterleavedValues.front(); + auto *DiTy = ValueToDeinterleave->getType(); + TempDeinterleavedValues.pop(); + Value *DI = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, ValueToDeinterleave, + /*FMFSource=*/nullptr, "strided.vec"); + Value *StridedVec = State.Builder.CreateExtractValue(DI, 0); + TempDeinterleavedValues.push(StridedVec); + StridedVec = State.Builder.CreateExtractValue(DI, 1); + TempDeinterleavedValues.push(StridedVec); + } - State.set(VPDefs[J], StridedVec); - ++J; - } + assert(TempDeinterleavedValues.size() == InterleaveFactor && + "Num of deinterleaved values must equals to InterleaveFactor"); + // Sort deinterleaved values + DeinterleavedValues.resize(InterleaveFactor); + for (unsigned I = 0, J = InterleaveFactor / 2, K = 0; + K < InterleaveFactor; K++) { + auto *DeinterleavedValue = TempDeinterleavedValues.front(); + TempDeinterleavedValues.pop(); + if (K % 2 == 0) { + DeinterleavedValues[I] = DeinterleavedValue; + I++; + } else { + DeinterleavedValues[J] = DeinterleavedValue; + J++; + } + } +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + static_cast(StridedVec)->eraseFromParent(); + continue; + } + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } + + if (Group->isReverse()) + StridedVec = + State.Builder.CreateVectorReverse(StridedVec, "reverse"); + + State.set(VPDefs[J], StridedVec); + ++J; + } return; } diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index 61a68692ff5b9..b2d603b0a735d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize,interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s define void @deinterleave4(ptr %src) { @@ -136,3 +136,145 @@ define void @negative_deinterleave4_test(ptr %src) { ret void } + +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: define void @interleave_deinterleave +; CHECK-SAME: (ptr writeonly [[DST:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 16384 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 16384 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i64 16384 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[LDN14:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN14]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN14]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN14]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN14]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP10]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 12 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 -3 +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP26]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 28c9c5398a7a0..30e7435c74aae 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] From d0f3ae0d3bb14aeefa0d3022eb4408d871f8f187 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Tue, 8 Oct 2024 00:08:35 +0000 Subject: [PATCH 02/13] [resolve review comments] do multiple ordering during (de)interleaving to put the nodes in the correct order for (de)interleaving. Change-Id: I151a53c459a7f69e35feb428c1dface2fe57e9ce --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 109 ++++++++++-------- 1 file changed, 63 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3b230d62aa67b..96a4f710fb1d5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2780,28 +2780,39 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { + if (Vals.size() == 2) { + VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); + return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, + Vals, + /*FMFSource=*/nullptr, Name); + } unsigned InterleaveFactor = Vals.size(); - SmallVector InterleavingValues; - unsigned InterleavingValuesCount = - InterleaveFactor + (InterleaveFactor - 2); - InterleavingValues.resize(InterleaveFactor); - // Place the values to be interleaved in the correct order for the - // interleaving - for (unsigned I = 0, J = InterleaveFactor / 2, K = 0; K < InterleaveFactor; - K++) { - if (K % 2 == 0) { - InterleavingValues[K] = Vals[I]; - I++; - } else { - InterleavingValues[K] = Vals[J]; - J++; + SmallVector InterleavingValues(Vals); + // The total number of nodes in a balanced binary tree is calculated as 2n - + // 1, where `n` is the number of leaf nodes (`InterleaveFactor`). In this + // context, we exclude the root node because it will serve as the final + // interleaved value. Thus, the number of nodes to be processed/interleaved + // is: (2n - 1) - 1 = 2n - 2. + + unsigned NumInterleavingValues = 2 * InterleaveFactor - 2; + for (unsigned I = 1; I < NumInterleavingValues; I += 2) { + // values that haven't been processed yet: + unsigned Remaining = InterleavingValues.size() - I + 1; + if (Remaining > 2 && isPowerOf2_32(Remaining)) { + + // The remaining values form a new level in the interleaving tree. + // Arrange these values in the correct interleaving order for this + // level. The interleaving order places alternating elements from the + // first and second halves, + std::vector RemainingValues(InterleavingValues.begin() + I - 1, + InterleavingValues.end()); + unsigned Middle = Remaining / 2; + for (unsigned J = I - 1, K = 0; J < InterleavingValues.size(); + J += 2, K++) { + InterleavingValues[J] = RemainingValues[K]; + InterleavingValues[J + 1] = RemainingValues[Middle + K]; + } } - } -#ifndef NDEBUG - for (Value *Val : InterleavingValues) - assert(Val && "NULL Interleaving Value"); -#endif - for (unsigned I = 1; I < InterleavingValuesCount; I += 2) { VectorType *InterleaveTy = cast(InterleavingValues[I]->getType()); VectorType *WideVecTy = @@ -2812,7 +2823,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, /*FMFSource=*/nullptr, Name); InterleavingValues.push_back(InterleaveRes); } - return InterleavingValues[InterleavingValuesCount]; + return InterleavingValues[NumInterleavingValues]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2951,42 +2962,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { SmallVector DeinterleavedValues; // If the InterleaveFactor is > 2, so we will have to do recursive - // deinterleaving, because the current available deinterleave intrinsice + // deinterleaving, because the current available deinterleave intrinsic // supports only Factor of 2. DeinterleaveCount represent how many times // we will do deinterleaving, we will do deinterleave on all nonleaf // nodes in the deinterleave tree. unsigned DeinterleaveCount = InterleaveFactor - 1; - std::queue TempDeinterleavedValues; - TempDeinterleavedValues.push(NewLoad); + std::vector TempDeinterleavedValues; + TempDeinterleavedValues.push_back(NewLoad); for (unsigned I = 0; I < DeinterleaveCount; ++I) { - Value *ValueToDeinterleave = TempDeinterleavedValues.front(); - auto *DiTy = ValueToDeinterleave->getType(); - TempDeinterleavedValues.pop(); + auto *DiTy = TempDeinterleavedValues[I]->getType(); Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, DiTy, ValueToDeinterleave, + Intrinsic::vector_deinterleave2, DiTy, TempDeinterleavedValues[I], /*FMFSource=*/nullptr, "strided.vec"); Value *StridedVec = State.Builder.CreateExtractValue(DI, 0); - TempDeinterleavedValues.push(StridedVec); + TempDeinterleavedValues.push_back(StridedVec); StridedVec = State.Builder.CreateExtractValue(DI, 1); - TempDeinterleavedValues.push(StridedVec); - } - - assert(TempDeinterleavedValues.size() == InterleaveFactor && - "Num of deinterleaved values must equals to InterleaveFactor"); - // Sort deinterleaved values - DeinterleavedValues.resize(InterleaveFactor); - for (unsigned I = 0, J = InterleaveFactor / 2, K = 0; - K < InterleaveFactor; K++) { - auto *DeinterleavedValue = TempDeinterleavedValues.front(); - TempDeinterleavedValues.pop(); - if (K % 2 == 0) { - DeinterleavedValues[I] = DeinterleavedValue; - I++; - } else { - DeinterleavedValues[J] = DeinterleavedValue; - J++; + TempDeinterleavedValues.push_back(StridedVec); + // Perform sorting at the start of each new level in the tree. + // A new level begins when the number of remaining values is a power + // of 2 and greater than 2. If a level has only 2 nodes, no sorting is + // needed as they are already in order. Number of remaining values to + // be processed: + unsigned NumRemainingValues = TempDeinterleavedValues.size() - I - 1; + if (NumRemainingValues > 2 && isPowerOf2_32(NumRemainingValues)) { + // these remaining values represent a new level in the tree, + // Reorder the values to match the correct deinterleaving order. + std::vector RemainingValues( + TempDeinterleavedValues.begin() + I + 1, + TempDeinterleavedValues.end()); + unsigned Middle = NumRemainingValues / 2; + for (unsigned J = 0, K = I + 1; J < NumRemainingValues; + J += 2, K++) { + TempDeinterleavedValues[K] = RemainingValues[J]; + TempDeinterleavedValues[Middle + K] = RemainingValues[J + 1]; + } } } + // Final deinterleaved values: + DeinterleavedValues.insert(DeinterleavedValues.begin(), + TempDeinterleavedValues.begin() + + InterleaveFactor - 1, + TempDeinterleavedValues.end()); + #ifndef NDEBUG for (Value *Val : DeinterleavedValues) assert(Val && "NULL Deinterleaved Value"); From 08f0cda250dbbaeecb3e64a582f89fc0bc5f3e55 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Thu, 31 Oct 2024 19:53:38 +0000 Subject: [PATCH 03/13] refactoring Change-Id: If2a3789ed76c98a5f1d1be729f5051a2c54af2a7 --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 111 +++++------------- 1 file changed, 32 insertions(+), 79 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 96a4f710fb1d5..c311eda86905b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -35,7 +35,6 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include -#include using namespace llvm; @@ -2780,50 +2779,22 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - if (Vals.size() == 2) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); - } unsigned InterleaveFactor = Vals.size(); SmallVector InterleavingValues(Vals); - // The total number of nodes in a balanced binary tree is calculated as 2n - - // 1, where `n` is the number of leaf nodes (`InterleaveFactor`). In this - // context, we exclude the root node because it will serve as the final - // interleaved value. Thus, the number of nodes to be processed/interleaved - // is: (2n - 1) - 1 = 2n - 2. - - unsigned NumInterleavingValues = 2 * InterleaveFactor - 2; - for (unsigned I = 1; I < NumInterleavingValues; I += 2) { - // values that haven't been processed yet: - unsigned Remaining = InterleavingValues.size() - I + 1; - if (Remaining > 2 && isPowerOf2_32(Remaining)) { - - // The remaining values form a new level in the interleaving tree. - // Arrange these values in the correct interleaving order for this - // level. The interleaving order places alternating elements from the - // first and second halves, - std::vector RemainingValues(InterleavingValues.begin() + I - 1, - InterleavingValues.end()); - unsigned Middle = Remaining / 2; - for (unsigned J = I - 1, K = 0; J < InterleavingValues.size(); - J += 2, K++) { - InterleavingValues[J] = RemainingValues[K]; - InterleavingValues[J + 1] = RemainingValues[Middle + K]; - } - } + // As we are interleaving, the values sz will be shrinked until we have the + // single final interleaved value. + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { VectorType *InterleaveTy = - cast(InterleavingValues[I]->getType()); + cast(InterleavingValues[0]->getType()); VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(InterleaveTy); - auto *InterleaveRes = Builder.CreateIntrinsic( - WideVecTy, Intrinsic::vector_interleave2, - {InterleavingValues[I - 1], InterleavingValues[I]}, - /*FMFSource=*/nullptr, Name); - InterleavingValues.push_back(InterleaveRes); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + WideVecTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); } - return InterleavingValues[NumInterleavingValues]; + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2960,49 +2931,31 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Scalable vectors cannot use arbitrary shufflevectors (only splats), // so must use intrinsics to deinterleave. - SmallVector DeinterleavedValues; - // If the InterleaveFactor is > 2, so we will have to do recursive + SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive // deinterleaving, because the current available deinterleave intrinsic - // supports only Factor of 2. DeinterleaveCount represent how many times - // we will do deinterleaving, we will do deinterleave on all nonleaf - // nodes in the deinterleave tree. - unsigned DeinterleaveCount = InterleaveFactor - 1; - std::vector TempDeinterleavedValues; - TempDeinterleavedValues.push_back(NewLoad); - for (unsigned I = 0; I < DeinterleaveCount; ++I) { - auto *DiTy = TempDeinterleavedValues[I]->getType(); - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, DiTy, TempDeinterleavedValues[I], - /*FMFSource=*/nullptr, "strided.vec"); - Value *StridedVec = State.Builder.CreateExtractValue(DI, 0); - TempDeinterleavedValues.push_back(StridedVec); - StridedVec = State.Builder.CreateExtractValue(DI, 1); - TempDeinterleavedValues.push_back(StridedVec); - // Perform sorting at the start of each new level in the tree. - // A new level begins when the number of remaining values is a power - // of 2 and greater than 2. If a level has only 2 nodes, no sorting is - // needed as they are already in order. Number of remaining values to - // be processed: - unsigned NumRemainingValues = TempDeinterleavedValues.size() - I - 1; - if (NumRemainingValues > 2 && isPowerOf2_32(NumRemainingValues)) { - // these remaining values represent a new level in the tree, - // Reorder the values to match the correct deinterleaving order. - std::vector RemainingValues( - TempDeinterleavedValues.begin() + I + 1, - TempDeinterleavedValues.end()); - unsigned Middle = NumRemainingValues / 2; - for (unsigned J = 0, K = I + 1; J < NumRemainingValues; - J += 2, K++) { - TempDeinterleavedValues[K] = RemainingValues[J]; - TempDeinterleavedValues[Middle + K] = RemainingValues[J + 1]; - } + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // As we are deinterleaving, the values will be doubled until reachingt + // to the InterleaveFactor. + for (int NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // deinterleave the elements within the vector + std::vector TempDeinterleavedValues(NumVectors); + for (int I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); } + // Extract the deinterleaved values: + for (int I = 0; I < 2; ++I) + for (int J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], + I); } - // Final deinterleaved values: - DeinterleavedValues.insert(DeinterleavedValues.begin(), - TempDeinterleavedValues.begin() + - InterleaveFactor - 1, - TempDeinterleavedValues.end()); #ifndef NDEBUG for (Value *Val : DeinterleavedValues) From 78077d4fb0774acbd079fcbc9aeaf3b165639a14 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 4 Nov 2024 11:31:21 +0000 Subject: [PATCH 04/13] resolve conflicts with upstream --- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +- .../AArch64/sve-deinterleave4.ll | 66 ++++--- .../RISCV/interleaved-accesses.ll | 162 +++++++++++------- 3 files changed, 135 insertions(+), 101 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index cfd010a6bca18..d78f19ec235c9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3523,10 +3523,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. - if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index b2d603b0a735d..b9ef7050cbd78 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -172,30 +172,26 @@ define void @interleave_deinterleave(ptr writeonly %dst, ptr readonly %a, ptr re ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP9]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 2 -; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[LDN]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[LDN14:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP15]]) -; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN14]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN14]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN14]], 2 -; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN14]], 3 -; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP10]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP11]], [[TMP17]] -; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP12]], [[TMP18]] -; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 12 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 -3 -; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP26]]) +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[LDN14:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[LDN14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[LDN14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN14]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN14]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[TMP14]], [[TMP9]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP20:%.*]] = sub nsw [[TMP10]], [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = shl [[TMP11]], [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = ashr [[TMP12]], [[TMP17]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP18]], [[TMP20]], [[TMP21]], [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP19]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -205,31 +201,31 @@ define void @interleave_deinterleave(ptr writeonly %dst, ptr readonly %a, ptr re ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y]], align 4 ; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Y11]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP26]], [[TMP27]] ; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 ; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 ; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 -; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z]], align 4 ; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 -; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Z19]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP28]], [[TMP29]] ; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 ; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 ; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 -; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T]], align 4 ; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 -; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[T27]], align 4 -; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP30]], [[TMP31]] ; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 ; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..c37badf42ed8c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -740,47 +740,66 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], shufflevector ( insertelement ( poison, i64 5, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], shufflevector ( insertelement ( poison, i64 7, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[TMP2]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -924,47 +943,66 @@ define void @load_store_factor8(ptr %p) { ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: +; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 ; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], shufflevector ( insertelement ( poison, i64 5, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], shufflevector ( insertelement ( poison, i64 7, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: loop: ; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] From ead5b28e6411ccef368ba59df0b8f011b122bd7a Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 4 Nov 2024 11:40:30 +0000 Subject: [PATCH 05/13] fix format --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 97 +++++++++---------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c311eda86905b..6cb93be4daf22 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2928,62 +2928,59 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - - SmallVector DeinterleavedValues(InterleaveFactor); - DeinterleavedValues[0] = NewLoad; - // For the case of InterleaveFactor > 2, we will have to do recursive - // deinterleaving, because the current available deinterleave intrinsic - // supports only Factor of 2, otherwise it will bailout after first - // iteration. - // As we are deinterleaving, the values will be doubled until reachingt - // to the InterleaveFactor. - for (int NumVectors = 1; NumVectors < InterleaveFactor; - NumVectors *= 2) { - // deinterleave the elements within the vector - std::vector TempDeinterleavedValues(NumVectors); - for (int I = 0; I < NumVectors; ++I) { - auto *DiTy = DeinterleavedValues[I]->getType(); - TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], - /*FMFSource=*/nullptr, "strided.vec"); - } - // Extract the deinterleaved values: - for (int I = 0; I < 2; ++I) - for (int J = 0; J < NumVectors; ++J) - DeinterleavedValues[NumVectors * I + J] = - State.Builder.CreateExtractValue(TempDeinterleavedValues[J], - I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + + SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // As we are deinterleaving, the values will be doubled until reachingt + // to the InterleaveFactor. + for (int NumVectors = 1; NumVectors < InterleaveFactor; NumVectors *= 2) { + // deinterleave the elements within the vector + std::vector TempDeinterleavedValues(NumVectors); + for (int I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); } + // Extract the deinterleaved values: + for (int I = 0; I < 2; ++I) + for (int J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } #ifndef NDEBUG - for (Value *Val : DeinterleavedValues) - assert(Val && "NULL Deinterleaved Value"); + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); #endif - for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); - Value *StridedVec = DeinterleavedValues[I]; - if (!Member) { - // This value is not needed as it's not used - static_cast(StridedVec)->eraseFromParent(); - continue; - } - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); - StridedVec = - createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); - } - - if (Group->isReverse()) - StridedVec = - State.Builder.CreateVectorReverse(StridedVec, "reverse"); - - State.set(VPDefs[J], StridedVec); - ++J; + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + static_cast(StridedVec)->eraseFromParent(); + continue; + } + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); } + if (Group->isReverse()) + StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse"); + + State.set(VPDefs[J], StridedVec); + ++J; + } + return; } From 82c05bd220d188a80bb1aff17e167cdc039842d1 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 25 Nov 2024 20:55:05 +0000 Subject: [PATCH 06/13] resolve comments --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 18 +-- .../AArch64/sve-deinterleave4.ll | 140 +---------------- .../AArch64/sve-interleaved-accesses.ll | 142 ++++++++++++++++++ .../AArch64/sve-interleave-vectorization.ll | 135 +++++++++++++++++ 4 files changed, 286 insertions(+), 149 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6cb93be4daf22..f35fe673574df 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2781,16 +2781,15 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, if (VecTy->isScalableTy()) { unsigned InterleaveFactor = Vals.size(); SmallVector InterleavingValues(Vals); - // As we are interleaving, the values sz will be shrinked until we have the + // When interleaving, the number of values will be shrunk until we have the // single final interleaved value. + VectorType *InterleaveTy = + cast(InterleavingValues[0]->getType()); for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { - VectorType *InterleaveTy = - cast(InterleavingValues[0]->getType()); - VectorType *WideVecTy = - VectorType::getDoubleElementsVectorType(InterleaveTy); + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); for (unsigned I = 0; I < Midpoint; ++I) InterleavingValues[I] = Builder.CreateIntrinsic( - WideVecTy, Intrinsic::vector_interleave2, + InterleaveTy, Intrinsic::vector_interleave2, {InterleavingValues[I], InterleavingValues[Midpoint + I]}, /*FMFSource=*/nullptr, Name); } @@ -2883,8 +2882,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops; - Ops.resize(InterleaveFactor, ResBlockInMask); + SmallVector Ops(InterleaveFactor, ResBlockInMask); return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } @@ -2937,8 +2935,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // deinterleaving, because the current available deinterleave intrinsic // supports only Factor of 2, otherwise it will bailout after first // iteration. - // As we are deinterleaving, the values will be doubled until reachingt - // to the InterleaveFactor. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". for (int NumVectors = 1; NumVectors < InterleaveFactor; NumVectors *= 2) { // deinterleave the elements within the vector std::vector TempDeinterleavedValues(NumVectors); diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index b9ef7050cbd78..61a68692ff5b9 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=loop-vectorize,interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s +; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s define void @deinterleave4(ptr %src) { @@ -136,141 +136,3 @@ define void @negative_deinterleave4_test(ptr %src) { ret void } - -%struct.xyzt = type { i32, i32, i32, i32 } - -define void @interleave_deinterleave(ptr writeonly %dst, ptr readonly %a, ptr readonly %b) { -; CHECK-LABEL: define void @interleave_deinterleave -; CHECK-SAME: (ptr writeonly [[DST:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]]) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 16384 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 16384 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i64 16384 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] -; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP8]]) -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[LDN14:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP13]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[LDN14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[LDN14]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN14]], 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN14]], 3 -; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[TMP14]], [[TMP9]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP20:%.*]] = sub nsw [[TMP10]], [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = shl [[TMP11]], [[TMP16]] -; CHECK-NEXT: [[TMP22:%.*]] = ashr [[TMP12]], [[TMP17]] -; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP18]], [[TMP20]], [[TMP21]], [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP19]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y]], align 4 -; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Y11]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 -; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 -; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z]], align 4 -; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Z19]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP28]], [[TMP29]] -; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 -; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 -; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T]], align 4 -; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[T27]], align 4 -; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 -; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv - %1 = load i32, ptr %arrayidx2, align 4 - %add = add nsw i32 %1, %0 - %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv - store i32 %add, ptr %arrayidx5, align 4 - %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 - %2 = load i32, ptr %y, align 4 - %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 - %3 = load i32, ptr %y11, align 4 - %sub = sub nsw i32 %2, %3 - %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 - store i32 %sub, ptr %y14, align 4 - %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 - %4 = load i32, ptr %z, align 4 - %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 - %5 = load i32, ptr %z19, align 4 - %shl = shl i32 %4, %5 - %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 - store i32 %shl, ptr %z22, align 4 - %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 - %6 = load i32, ptr %t, align 4 - %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 - %7 = load i32, ptr %t27, align 4 - %shr = ashr i32 %6, %7 - %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 - store i32 %shr, ptr %t30, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 30e7435c74aae..d8bf03e77e8ce 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1548,5 +1548,147 @@ end: ret void } +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..fcccbd9ce02a6 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} From 010e7988b5f6e68d0676f95abc4c444b4e283fdc Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Tue, 10 Dec 2024 18:34:40 +0000 Subject: [PATCH 07/13] update by latest changes --- .../AArch64/sve-interleaved-accesses.ll | 14 +- .../RISCV/interleaved-accesses.ll | 1232 ++++++++--------- .../AArch64/sve-interleave-vectorization.ll | 6 +- 3 files changed, 626 insertions(+), 626 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index d8bf03e77e8ce..9b24c13ae2219 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -36,14 +36,14 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[TMP3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw [[TMP4]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP6]], [[TMP7]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] @@ -127,7 +127,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP8]], i32 2, splat (i1 true), poison) ; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to ; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_GATHER1]] to ; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP11]] ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP12]]) @@ -209,7 +209,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 @@ -1615,12 +1615,12 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 ; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index c37badf42ed8c..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 +550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,21 +740,21 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 @@ -776,14 +776,14 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 ; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 ; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], shufflevector ( insertelement ( poison, i64 5, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], shufflevector ( insertelement ( poison, i64 7, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) ; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) ; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) @@ -791,24 +791,24 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) ; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) ; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -843,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -868,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -935,29 +935,29 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) ; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 @@ -979,14 +979,14 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 ; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 ; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 -; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], shufflevector ( insertelement ( poison, i64 5, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], shufflevector ( insertelement ( poison, i64 7, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) ; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) ; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) @@ -994,24 +994,24 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) ; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) ; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1046,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1118,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1126,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1222,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1230,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1301,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1309,94 +1309,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1405,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1413,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll index fcccbd9ce02a6..b400b27df0839 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -31,13 +31,13 @@ define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP7]]) +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 ; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP13]]) +; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP13]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN9]], 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN9]], 1 ; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN9]], 2 @@ -47,7 +47,7 @@ define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] ; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] ; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] -; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP21]]) +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], splat (i1 true), ptr [[TMP21]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] From 73febb78255dd2de4bc8f162d9730339923affe6 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 4 Nov 2024 11:31:21 +0000 Subject: [PATCH 08/13] resolve conflicts with upstream --- .../AArch64/sve-deinterleave4.ll | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index 61a68692ff5b9..7f615f24a6cce 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -136,3 +136,141 @@ define void @negative_deinterleave4_test(ptr %src) { ret void } + +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: define void @interleave_deinterleave +; CHECK-SAME: (ptr writeonly [[DST:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 16384 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 16384 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i64 16384 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[LDN14:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[LDN14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[LDN14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN14]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN14]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[TMP14]], [[TMP9]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP20:%.*]] = sub nsw [[TMP10]], [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = shl [[TMP11]], [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = ashr [[TMP12]], [[TMP17]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP18]], [[TMP20]], [[TMP21]], [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP19]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} From 7195ce0f18b74608081df039ee363bf69c845077 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 25 Nov 2024 20:55:05 +0000 Subject: [PATCH 09/13] resolve comments --- .../AArch64/sve-deinterleave4.ll | 137 ------------------ .../AArch64/sve-interleaved-accesses.ll | 6 +- 2 files changed, 3 insertions(+), 140 deletions(-) diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index 7f615f24a6cce..33d2bbd6062bc 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -137,140 +137,3 @@ define void @negative_deinterleave4_test(ptr %src) { ret void } -%struct.xyzt = type { i32, i32, i32, i32 } - -define void @interleave_deinterleave(ptr writeonly %dst, ptr readonly %a, ptr readonly %b) { -; CHECK-LABEL: define void @interleave_deinterleave -; CHECK-SAME: (ptr writeonly [[DST:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]]) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 16384 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 16384 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i64 16384 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] -; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP8]]) -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[LDN14:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP13]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[LDN14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[LDN14]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN14]], 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN14]], 3 -; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[TMP14]], [[TMP9]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP20:%.*]] = sub nsw [[TMP10]], [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = shl [[TMP11]], [[TMP16]] -; CHECK-NEXT: [[TMP22:%.*]] = ashr [[TMP12]], [[TMP17]] -; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP18]], [[TMP20]], [[TMP21]], [[TMP22]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP19]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP25]], [[TMP24]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y]], align 4 -; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Y11]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 -; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 -; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z]], align 4 -; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Z19]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP28]], [[TMP29]] -; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 -; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 -; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T]], align 4 -; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[T27]], align 4 -; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 -; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; -entry: - br label %for.body - -for.body: - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv - %0 = load i32, ptr %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv - %1 = load i32, ptr %arrayidx2, align 4 - %add = add nsw i32 %1, %0 - %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv - store i32 %add, ptr %arrayidx5, align 4 - %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 - %2 = load i32, ptr %y, align 4 - %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 - %3 = load i32, ptr %y11, align 4 - %sub = sub nsw i32 %2, %3 - %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 - store i32 %sub, ptr %y14, align 4 - %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 - %4 = load i32, ptr %z, align 4 - %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 - %5 = load i32, ptr %z19, align 4 - %shl = shl i32 %4, %5 - %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 - store i32 %shl, ptr %z22, align 4 - %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 - %6 = load i32, ptr %t, align 4 - %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 - %7 = load i32, ptr %t27, align 4 - %shr = ashr i32 %6, %7 - %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 - store i32 %shr, ptr %t30, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body - -for.cond.cleanup: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 9b24c13ae2219..fdfbe187c54ef 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1615,12 +1615,12 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 ; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 From 155cc1ebf3f8240471e52abccd3b127e7322aeb3 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Wed, 11 Dec 2024 08:56:17 +0000 Subject: [PATCH 10/13] rebase --- .../AArch64/sve-interleaved-accesses.ll | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index fdfbe187c54ef..9af1df7068f8e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -36,14 +36,14 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[TMP3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw [[TMP4]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP6]], [[TMP7]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] @@ -127,7 +127,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP8]], i32 2, splat (i1 true), poison) ; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to ; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_GATHER1]] to ; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP11]] ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP12]]) @@ -209,7 +209,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 @@ -1615,12 +1615,12 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 ; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 From aec4b9585a520207178919a28ab30e469da805c5 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Thu, 12 Dec 2024 05:05:53 +0000 Subject: [PATCH 11/13] [resolve comments]: add tests for masked interleaved accesses --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 +- .../AArch64/sve-interleaved-accesses.ll | 118 +++++++- .../sve-interleaved-masked-accesses.ll | 252 ++++++++++++++++++ 3 files changed, 376 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f35fe673574df..7fe6ff230ab42 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2779,11 +2779,10 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - unsigned InterleaveFactor = Vals.size(); SmallVector InterleavingValues(Vals); // When interleaving, the number of values will be shrunk until we have the // single final interleaved value. - VectorType *InterleaveTy = + auto *InterleaveTy = cast(InterleavingValues[0]->getType()); for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); @@ -2937,18 +2936,18 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // iteration. // When deinterleaving, the number of values will double until we // have "InterleaveFactor". - for (int NumVectors = 1; NumVectors < InterleaveFactor; NumVectors *= 2) { - // deinterleave the elements within the vector - std::vector TempDeinterleavedValues(NumVectors); - for (int I = 0; I < NumVectors; ++I) { + for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { auto *DiTy = DeinterleavedValues[I]->getType(); TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], /*FMFSource=*/nullptr, "strided.vec"); } // Extract the deinterleaved values: - for (int I = 0; I < 2; ++I) - for (int J = 0; J < NumVectors; ++J) + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) DeinterleavedValues[NumVectors * I + J] = State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 9af1df7068f8e..02081a20cd40d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1548,13 +1548,15 @@ end: ret void } -%struct.xyzt = type { i32, i32, i32, i32 } +; Check vectorization on an interleaved load/store groups of factor 4 + ; for (int i = 0; i < 1024; ++i) { ; dst[i].x = a[i].x + b[i].x; ; dst[i].y = a[i].y - b[i].y; ; dst[i].z = a[i].z << b[i].z; ; dst[i].t = a[i].t >> b[i].t; ; } +%struct.xyzt = type { i32, i32, i32, i32 } define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { ; CHECK-LABEL: @interleave_deinterleave( @@ -1690,5 +1692,119 @@ for.end: ret void } +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + +define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 + store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} From 89b158b58123e3ac5252f44aea85572a6c4885dc Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Thu, 12 Dec 2024 07:00:47 +0000 Subject: [PATCH 12/13] format --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7fe6ff230ab42..a39dcb043442d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2782,8 +2782,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, SmallVector InterleavingValues(Vals); // When interleaving, the number of values will be shrunk until we have the // single final interleaved value. - auto *InterleaveTy = - cast(InterleavingValues[0]->getType()); + auto *InterleaveTy = cast(InterleavingValues[0]->getType()); for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); for (unsigned I = 0; I < Midpoint; ++I) @@ -2936,7 +2935,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // iteration. // When deinterleaving, the number of values will double until we // have "InterleaveFactor". - for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; NumVectors *= 2) { + for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { // Deinterleave the elements within the vector SmallVector TempDeinterleavedValues(NumVectors); for (unsigned I = 0; I < NumVectors; ++I) { From 4766679f282352f84534607ed11f4aeb8c858df1 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Tue, 17 Dec 2024 10:11:40 +0000 Subject: [PATCH 13/13] Resolve review comments --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 ++- .../Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index a39dcb043442d..d0ed33a3bc3ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2779,6 +2779,8 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); SmallVector InterleavingValues(Vals); // When interleaving, the number of values will be shrunk until we have the // single final interleaved value. @@ -2926,7 +2928,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Scalable vectors cannot use arbitrary shufflevectors (only splats), // so must use intrinsics to deinterleave. - SmallVector DeinterleavedValues(InterleaveFactor); DeinterleavedValues[0] = NewLoad; // For the case of InterleaveFactor > 2, we will have to do recursive diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index 33d2bbd6062bc..61a68692ff5b9 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -136,4 +136,3 @@ define void @negative_deinterleave4_test(ptr %src) { ret void } -