diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 56ead92187b04..1ae14b5e565c3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -979,12 +979,14 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) const { - // The interleaved memory access pass will lower interleaved memory ops (i.e - // a load and store followed by a specific shuffle) to vlseg/vsseg - // intrinsics. - if (!UseMaskForCond && !UseMaskForGaps && + auto *VTy = cast(VecTy); + + // The interleaved memory access pass will lower (de)interleave ops combined + // with an adjacent appropriate memory op to vlseg/vsseg intrinsics. We + // currently only support masking for the scalable path. vlseg/vsseg only + // support masking per-iteration (i.e. condition), not per-segment (i.e. gap). 
+ if ((VTy->isScalableTy() || !UseMaskForCond) && !UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { - auto *VTy = cast(VecTy); std::pair LT = getTypeLegalizationCost(VTy); // Need to make sure type has't been scalarized if (LT.second.isVector()) { diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d62d99cf31899..05d504cbcb6bb 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -398,6 +398,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase { bool enableInterleavedAccessVectorization() const override { return true; } + bool enableMaskedInterleavedAccessVectorization() const override { + return ST->hasVInstructions(); + } + unsigned getMinTripCountTailFoldingThreshold() const override; enum RISCVRegisterClass { GPRRC, FPRRC, VRRC }; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index dbe6f27d61749..d56d858cfb91f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -30,26 +30,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) -; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to -; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP8]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP9]], i32 
1, [[TMP6]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = or disjoint [[TMP7]], splat (i32 1) -; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to -; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP11]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP12]], i32 1, [[TMP6]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) -; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = zext nneg [[TMP7]] to -; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP13]], [[TMP15]], i32 1, [[TMP6]]) -; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] -; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = zext nneg [[TMP10]] to -; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP17]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP16]], [[TMP18]], i32 1, [[TMP6]]) +; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 +; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] +; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP6]], [[TMP6]]) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; SCALAR_EPILOGUE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) +; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = call @llvm.smax.nxv16i8( [[TMP10]], [[TMP11]]) +; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = sext i32 [[TMP7]] to i64 +; 
SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]] +; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP12]] +; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP12]], [[TMP15]]) +; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP6]], [[TMP6]]) +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, [[INTERLEAVED_MASK3]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_EPILOGUE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_EPILOGUE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: ; SCALAR_EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -80,26 +79,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024) ; PREDICATED_DATA-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP5]], zeroinitializer -; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) -; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP8]] -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP9]], i32 1, [[TMP6]], 
poison) -; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = or disjoint [[TMP7]], splat (i32 1) -; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP11]] -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP12]], i32 1, [[TMP6]], poison) -; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) -; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] -; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP13]], [[TMP15]], i32 1, [[TMP6]]) -; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] -; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP17]] -; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP16]], [[TMP18]], i32 1, [[TMP6]]) +; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 +; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] +; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP6]], [[TMP6]]) +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_DATA-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = call @llvm.smax.nxv16i8( [[TMP10]], [[TMP11]]) +; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = sext i32 [[TMP7]] to i64 +; PREDICATED_DATA-NEXT: 
[[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]] +; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP12]] +; PREDICATED_DATA-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP12]], [[TMP15]]) +; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP6]], [[TMP6]]) +; PREDICATED_DATA-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, [[INTERLEAVED_MASK3]]) ; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_DATA-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; PREDICATED_DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_DATA: middle.block: ; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA: scalar.ph: @@ -129,28 +127,29 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP8]] -; PREDICATED_DATA-WITH-EVL-NEXT: 
[[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP9]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = or disjoint [[TMP7]], splat (i32 1) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP11]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP12]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0( [[TMP13]], align 1 [[TMP15]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP17]] -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0( [[TMP16]], align 1 [[TMP18]], [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ult [[VEC_IND]], splat (i32 1024) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = select [[TMP6]], [[TMP7]], zeroinitializer +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = shl i32 [[EVL_BASED_IV]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP8]], [[TMP8]]) +; PREDICATED_DATA-WITH-EVL-NEXT: 
[[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP11]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP12]], [[TMP13]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = sext i32 [[TMP9]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP15]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP14]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP14]], [[TMP17]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP8]], [[TMP8]]) +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP16]], i32 1, [[INTERLEAVED_MASK3]]) ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: ; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA-WITH-EVL: scalar.ph: @@ -215,42 
+214,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 2) -; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = or disjoint [[TMP7]], splat (i32 1) -; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = or disjoint [[TMP7]], splat (i32 2) -; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = or disjoint [[TMP7]], splat (i32 3) -; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = zext nneg [[TMP7]] to -; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP11]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP12]], i32 1, [[TMP6]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = zext nneg [[TMP8]] to -; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP13]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP14]], i32 1, [[TMP6]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = zext nneg [[TMP9]] to -; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP15]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP16]], i32 1, [[TMP6]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = zext nneg [[TMP10]] to -; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP17]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP18]], i32 1, [[TMP6]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) -; SCALAR_EPILOGUE-NEXT: 
[[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] -; SCALAR_EPILOGUE-NEXT: [[TMP21:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]) -; SCALAR_EPILOGUE-NEXT: [[TMP22:%.*]] = sub zeroinitializer, [[TMP21]] -; SCALAR_EPILOGUE-NEXT: [[TMP23:%.*]] = zext nneg [[TMP7]] to -; SCALAR_EPILOGUE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP23]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP19]], [[TMP24]], i32 1, [[TMP6]]) -; SCALAR_EPILOGUE-NEXT: [[TMP25:%.*]] = zext nneg [[TMP8]] to -; SCALAR_EPILOGUE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP25]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP20]], [[TMP26]], i32 1, [[TMP6]]) -; SCALAR_EPILOGUE-NEXT: [[TMP27:%.*]] = zext nneg [[TMP9]] to -; SCALAR_EPILOGUE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP27]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP21]], [[TMP28]], i32 1, [[TMP6]]) -; SCALAR_EPILOGUE-NEXT: [[TMP29:%.*]] = zext nneg [[TMP10]] to -; SCALAR_EPILOGUE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP29]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP22]], [[TMP30]], i32 1, [[TMP6]]) +; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] +; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP6]], [[TMP6]], [[TMP6]], [[TMP6]]) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; SCALAR_EPILOGUE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; SCALAR_EPILOGUE-NEXT: 
[[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP10]], [[TMP11]]) +; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] +; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = call @llvm.smax.nxv16i8( [[TMP12]], [[TMP13]]) +; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP16]] +; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = sext i32 [[TMP7]] to i64 +; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]] +; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) +; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP6]], [[TMP6]], [[TMP6]], [[TMP6]]) +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, [[INTERLEAVED_MASK3]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; SCALAR_EPILOGUE-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_EPILOGUE-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_EPILOGUE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: ; SCALAR_EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -281,42 +267,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call 
@llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024) ; PREDICATED_DATA-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP5]], zeroinitializer -; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 2) -; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = or disjoint [[TMP7]], splat (i32 1) -; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = or disjoint [[TMP7]], splat (i32 2) -; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = or disjoint [[TMP7]], splat (i32 3) -; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP11]] -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP12]], i32 1, [[TMP6]], poison) -; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = zext nneg [[TMP8]] to -; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP13]] -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP14]], i32 1, [[TMP6]], poison) -; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = zext nneg [[TMP9]] to -; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP15]] -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP16]], i32 1, [[TMP6]], poison) -; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP17]] -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP18]], i32 1, [[TMP6]], poison) -; PREDICATED_DATA-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) -; PREDICATED_DATA-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] -; PREDICATED_DATA-NEXT: [[TMP21:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]) -; 
PREDICATED_DATA-NEXT: [[TMP22:%.*]] = sub zeroinitializer, [[TMP21]] -; PREDICATED_DATA-NEXT: [[TMP23:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP23]] -; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP19]], [[TMP24]], i32 1, [[TMP6]]) -; PREDICATED_DATA-NEXT: [[TMP25:%.*]] = zext nneg [[TMP8]] to -; PREDICATED_DATA-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP25]] -; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP20]], [[TMP26]], i32 1, [[TMP6]]) -; PREDICATED_DATA-NEXT: [[TMP27:%.*]] = zext nneg [[TMP9]] to -; PREDICATED_DATA-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP27]] -; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP21]], [[TMP28]], i32 1, [[TMP6]]) -; PREDICATED_DATA-NEXT: [[TMP29:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP29]] -; PREDICATED_DATA-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP22]], [[TMP30]], i32 1, [[TMP6]]) +; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] +; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP6]], [[TMP6]], [[TMP6]], [[TMP6]]) +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_DATA-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_DATA-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; PREDICATED_DATA-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; PREDICATED_DATA-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; PREDICATED_DATA-NEXT: 
[[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP10]], [[TMP11]]) +; PREDICATED_DATA-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] +; PREDICATED_DATA-NEXT: [[TMP16:%.*]] = call @llvm.smax.nxv16i8( [[TMP12]], [[TMP13]]) +; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP16]] +; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = sext i32 [[TMP7]] to i64 +; PREDICATED_DATA-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]] +; PREDICATED_DATA-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) +; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP6]], [[TMP6]], [[TMP6]], [[TMP6]]) +; PREDICATED_DATA-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, [[INTERLEAVED_MASK3]]) ; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_DATA-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; PREDICATED_DATA-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_DATA-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_DATA: middle.block: ; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA: scalar.ph: @@ -346,44 +319,33 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = 
shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 2) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = or disjoint [[TMP7]], splat (i32 1) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = or disjoint [[TMP7]], splat (i32 2) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = or disjoint [[TMP7]], splat (i32 3) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP11]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP12]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = zext nneg [[TMP8]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP13]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP14]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = zext nneg [[TMP9]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP15]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP16]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP17]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP18]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] -; PREDICATED_DATA-WITH-EVL-NEXT: 
[[TMP21:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP22:%.*]] = sub zeroinitializer, [[TMP21]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP23:%.*]] = zext nneg [[TMP7]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP23]] -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0( [[TMP19]], align 1 [[TMP24]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP25:%.*]] = zext nneg [[TMP8]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP25]] -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0( [[TMP20]], align 1 [[TMP26]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP27:%.*]] = zext nneg [[TMP9]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP27]] -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0( [[TMP21]], align 1 [[TMP28]], [[TMP6]], i32 [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP29:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP29]] -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0( [[TMP22]], align 1 [[TMP30]], [[TMP6]], i32 [[TMP5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = icmp ult [[VEC_IND]], splat (i32 1024) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = select [[TMP6]], [[TMP7]], zeroinitializer +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = shl i32 [[EVL_BASED_IV]], 2 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( 
[[TMP8]], [[TMP8]], [[TMP8]], [[TMP8]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP11]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = call @llvm.smax.nxv16i8( [[TMP12]], [[TMP13]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP16]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = call @llvm.smax.nxv16i8( [[TMP14]], [[TMP15]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP20:%.*]] = sext i32 [[TMP9]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP8]], [[TMP8]], [[TMP8]], [[TMP8]]) +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 1, [[INTERLEAVED_MASK3]]) ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; 
PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: ; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA-WITH-EVL: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll index acfcf90b813ef..1e25947d5eaf6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll @@ -21,26 +21,29 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP12:%.*]] = mul [[TMP10]], splat (i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP13]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], [[VEC_IND]], i32 0 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP21]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[VEC_IND]], i32 1 -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP23]], splat (i1 true), i32 [[TMP11]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP12:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT2]], [[TMP12]] +; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0 +; IF-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) +; IF-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP9]], i32 4, [[INTERLEAVED_MASK]], poison) +; IF-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_MASKED_VEC]]) +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; 
IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; IF-EVL-NEXT: [[TMP26:%.*]] = add nsw [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]] ; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0 @@ -48,7 +51,6 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: