diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index db77d6c955792..791006c48b5dd 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1713,7 +1713,12 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost << "\n"); - if (NewCost >= OldCost) + + // If either shuffle will constant fold away, then fold for the same cost as + // we will reduce the instruction count. + bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) || + (isa<Constant>(Y) && isa<Constant>(W)); + if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost)) return false; Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); diff --git a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll index b158d4a3676b3..057d9af314ba3 100644 --- a/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll +++ b/llvm/test/Transforms/VectorCombine/X86/concat-boolmasks.ll @@ -80,30 +80,13 @@ define i64 @movmsk_i64_v8i32_v4i32(<4 x i32> %v0, <4 x i32> %v1) { } define i64 @movmsk_i64_v64i8_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; SSE-LABEL: @movmsk_i64_v64i8_v16i8( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <64 x i32> -; SSE-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer -; SSE-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 -; SSE-NEXT: ret i64 [[OR]] -; -; AVX2-LABEL: @movmsk_i64_v64i8_v16i8( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = icmp slt <32 x i8> [[TMP1]], zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = icmp slt <32 x i8> [[TMP3]], zeroinitializer -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <32 x i1> [[TMP4]], <32 x i1> [[TMP2]], <64 x i32> -; AVX2-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP5]] to i64 -; AVX2-NEXT: ret i64 [[OR]] -; -; AVX512-LABEL: @movmsk_i64_v64i8_v16i8( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> -; AVX512-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer -; AVX512-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 -; AVX512-NEXT: ret i64 [[OR]] +; CHECK-LABEL: @movmsk_i64_v64i8_v16i8( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> [[V0:%.*]], <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V3:%.*]], <16 x i8> [[V2:%.*]], <32 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> [[TMP1]], <64 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <64 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[OR:%.*]] = bitcast <64 x i1> [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[OR]] ; %c0 = icmp slt <16 x i8> %v0, zeroinitializer %c1 = icmp slt <16 x i8> %v1, zeroinitializer