From a74859dfb7fa9d908c22f5811ffa0c0a0d53c357 Mon Sep 17 00:00:00 2001
From: Mel Chen
Date: Tue, 22 Jul 2025 01:55:27 -0700
Subject: [PATCH 1/2] enable masked interleave

---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   3 +-
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   4 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |   2 +
 .../RISCV/interleaved-masked-access.ll        | 288 ++++++++----------
 ...ectorize-force-tail-with-evl-interleave.ll |  41 ++-
 5 files changed, 147 insertions(+), 191 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fd634b50bd7d2..14a5c1955afc4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -982,8 +982,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
   // The interleaved memory access pass will lower interleaved memory ops (i.e
   // a load and store followed by a specific shuffle) to vlseg/vsseg
   // intrinsics.
-  if (!UseMaskForCond && !UseMaskForGaps &&
-      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     auto *VTy = cast<VectorType>(VecTy);
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
     // Need to make sure type has't been scalarized
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d62d99cf31899..f54da7862b686 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -398,6 +398,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
 
   bool enableInterleavedAccessVectorization() const override { return true; }
 
+  bool enableMaskedInterleavedAccessVectorization() const override {
+    return true;
+  }
+
   unsigned getMinTripCountTailFoldingThreshold() const override;
 
   enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 99a96a8beb9f4..9c9201cae1506 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1359,7 +1359,9 @@ class LoopVectorizationCostModel {
       return;
     // Override EVL styles if needed.
     // FIXME: Investigate opportunity for fixed vector factor.
+    // FIXME: Support interleave accesses.
     bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
+                      !InterleaveInfo.hasGroups() &&
                       TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
     if (EVLIsLegal)
       return;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index dbe6f27d61749..2a4cd28bbdbce 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -30,26 +30,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP15]])
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
 ; SCALAR_EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -80,26 +79,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
 ; PREDICATED_DATA-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; PREDICATED_DATA-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_DATA-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_DATA-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]]
+; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP15]])
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_DATA-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_DATA:       middle.block:
 ; PREDICATED_DATA-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_DATA:       scalar.ph:
@@ -120,37 +118,34 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_DATA-WITH-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_DATA-WITH-EVL:       vector.body:
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = icmp ult <vscale x 16 x i32> [[VEC_IND]], splat (i32 1024)
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP9]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = select <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP8]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_DATA-WITH-EVL:       middle.block:
 ; PREDICATED_DATA-WITH-EVL-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_DATA-WITH-EVL:       scalar.ph:
@@ -215,42 +210,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 2
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
 ; SCALAR_EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -281,42 +263,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
 ; PREDICATED_DATA-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; PREDICATED_DATA-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_DATA-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_DATA-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_DATA-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_DATA-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 2
+; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_DATA-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_DATA-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; PREDICATED_DATA-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_DATA-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_DATA-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_DATA-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_DATA:       middle.block:
 ; PREDICATED_DATA-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_DATA:       scalar.ph:
@@ -337,53 +306,38 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_DATA-WITH-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_DATA-WITH-EVL:       vector.body:
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = icmp ult <vscale x 16 x i32> [[VEC_IND]], splat (i32 1024)
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP16]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> align 1 [[TMP28]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = select <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_DATA-WITH-EVL:       middle.block:
 ; PREDICATED_DATA-WITH-EVL-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_DATA-WITH-EVL:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index acfcf90b813ef..556446c3e1af8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -21,36 +21,33 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
-; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
-; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
-; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP13]]
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP23]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
+; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP10]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0
+; IF-EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[TMP11]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP12]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; IF-EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr align 4 [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
-; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP11]])
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br label [[FOR_COND_CLEANUP:%.*]]
 ; IF-EVL:       scalar.ph:
@@ -67,7 +64,7 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; IF-EVL-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; IF-EVL:       for.cond.cleanup:
 ; IF-EVL-NEXT:    ret void
 ;

From 4688b4cc6effea95c6f13a76a66940e17b016ecb Mon Sep 17 00:00:00 2001
From: Mel Chen
Date: Thu, 24 Jul 2025 02:58:26 -0700
Subject: [PATCH 2/2] Pick some good change from #150074, thanks

Co-authored-by: Philip Reames
---
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 11 +++++++----
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h   |  2 +-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 14a5c1955afc4..1624f12a102e3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -979,10 +979,13 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) const {
 
-  // The interleaved memory access pass will lower interleaved memory ops (i.e
-  // a load and store followed by a specific shuffle) to vlseg/vsseg
-  // intrinsics.
-  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  // The interleaved memory access pass will lower (de)interleave ops combined
+  // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
+  // only support masking per-iteration (i.e. condition), not per-segment (i.e.
+  // gap).
+  // TODO: Support masked interleaved access for fixed length vector.
+  if ((isa<ScalableVectorType>(VecTy) || !UseMaskForCond) && !UseMaskForGaps &&
+      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     auto *VTy = cast<VectorType>(VecTy);
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
     // Need to make sure type has't been scalarized
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index f54da7862b686..05d504cbcb6bb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -399,7 +399,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   bool enableInterleavedAccessVectorization() const override { return true; }
 
   bool enableMaskedInterleavedAccessVectorization() const override {
-    return true;
+    return ST->hasVInstructions();
   }
 
   unsigned getMinTripCountTailFoldingThreshold() const override;
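
A note for readers following the new CHECK lines above: the following is a minimal hand-written module in the shape the vectorizer now emits for the masked factor-2 case. It is an illustrative sketch, not part of the patch -- the function name @masked_factor2_sketch and its signature are invented for this note -- but the intrinsics are the ones the tests check for: the per-iteration condition mask is interleaved so both fields of a segment share their lane's bit, one wide masked load replaces the two masked gathers, and the results are re-interleaved and stored under the same interleaved mask.

declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr, i32 immarg, <vscale x 32 x i1>, <vscale x 32 x i8>)
declare void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8>, ptr, i32 immarg, <vscale x 32 x i1>)
declare <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)

define void @masked_factor2_sketch(ptr %p, ptr %q, <vscale x 16 x i1> %m) {
entry:
  ; Duplicate the per-lane condition across both fields of each segment.
  %im = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %m, <vscale x 16 x i1> %m)
  ; One wide masked load instead of two masked gathers.
  %wide = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 1, <vscale x 32 x i1> %im, <vscale x 32 x i8> poison)
  ; Split the segments back into the two strided streams.
  %strided = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide)
  %v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %strided, 0
  %v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %strided, 1
  %max = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1)
  %neg = sub <vscale x 16 x i8> zeroinitializer, %max
  ; Re-interleave the results and store them under the same interleaved mask.
  %ivec = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %max, <vscale x 16 x i8> %neg)
  call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %ivec, ptr %q, i32 1, <vscale x 32 x i1> %im)
  ret void
}

Assuming a build containing these two commits, running the interleaved-access pass over such a module (e.g. opt -mtriple=riscv64 -mattr=+v -passes=interleaved-access -S, the same setup the existing InterleavedAccess/RISCV tests use) should fold each masked load/store plus (de)interleave pair into a masked vlseg2/vsseg2 segment access. That per-iteration masking (UseMaskForCond) is exactly what the cost-model guard above now allows, while masking across gaps (UseMaskForGaps) remains unsupported.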