From 1d031ea0234499e5cd42df4933508379a2e286fd Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 1 May 2025 15:32:07 +0800 Subject: [PATCH 01/10] Precommit tests --- .../RISCV/intrinsic-scalarize.ll | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll new file mode 100644 index 0000000000000..55b78c4716bc0 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -S -p vector-combine | FileCheck %s + +define <4 x i32> @umax_fixed(i32 %x, i32 %y) { +; CHECK-LABEL: define <4 x i32> @umax_fixed( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> [[Y_INSERT]]) +; CHECK-NEXT: ret <4 x i32> [[V]] +; + %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 + %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0 + %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert) + ret <4 x i32> %v +} + +define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) { +; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[X_INSERT]], <vscale x 4 x i32> [[Y_INSERT]]) +; CHECK-NEXT: ret <vscale x 4 x i32> [[V]] +; + %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0 + %y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0 + %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert) + ret <vscale x 4 x i32> %v
+} + +define <4 x i32> @umax_fixed_lhs_const(i32 %x) { +; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> , <4 x i32> [[X_INSERT]]) +; CHECK-NEXT: ret <4 x i32> [[V]] +; + %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 + %v = call <4 x i32> @llvm.umax(<4 x i32> , <4 x i32> %x.insert) + ret <4 x i32> %v +} + +define <4 x i32> @umax_fixed_rhs_const(i32 %x) { +; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> ) +; CHECK-NEXT: ret <4 x i32> [[V]] +; + %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 + %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> ) + ret <4 x i32> %v +} + +define @umax_scalable_lhs_const(i32 %x) { +; CHECK-LABEL: define @umax_scalable_lhs_const( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call @llvm.umax.nxv4i32( splat (i32 42), [[X_INSERT]]) +; CHECK-NEXT: ret [[V]] +; + %x.insert = insertelement poison, i32 %x, i32 0 + %v = call @llvm.umax( splat (i32 42), %x.insert) + ret %v +} + +define @umax_scalable_rhs_const(i32 %x) { +; CHECK-LABEL: define @umax_scalable_rhs_const( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call @llvm.umax.nxv4i32( [[X_INSERT]], splat (i32 42)) +; CHECK-NEXT: ret [[V]] +; + %x.insert = insertelement poison, i32 %x, i32 0 + %v = call @llvm.umax( %x.insert, splat (i32 42)) + ret %v +} + +; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic. 
+define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) { +; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]]) +; CHECK-NEXT: ret <4 x i32> [[V]] +; + %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 + %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0 + %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert) + ret <4 x i32> %v +} From ebfcbe452b7657e54c4c4797b452136afb87a9b3 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 1 May 2025 15:32:38 +0800 Subject: [PATCH 02/10] [VectorCombine] Scalarize binop-like intrinsics --- .../Transforms/Vectorize/VectorCombine.cpp | 64 ++++++++++++++----- .../RISCV/intrinsic-scalarize.ll | 32 ++++++---- 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 04c084ffdda97..7a7c533267f6f 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast"); STATISTIC(NumScalarBO, "Number of scalar binops formed"); STATISTIC(NumScalarCmp, "Number of scalar compares formed"); +STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed"); static cl::opt<bool> DisableVectorCombine( "disable-vector-combine", cl::init(false), cl::Hidden, @@ -1016,21 +1017,29 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { return true; } -/// Match a vector binop or compare instruction with at
least one inserted -/// scalar operand and convert to scalar binop/cmp followed by insertelement. +/// Match a vector binop, compare or binop-like intrinsic with at least one +/// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed +/// by insertelement. bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; Value *Ins0, *Ins1; if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && - !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) - return false; + !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) { + if (auto *II = dyn_cast<IntrinsicInst>(&I); + II && II->arg_size() == 2 && + isTriviallyVectorizable(II->getIntrinsicID())) { + Ins0 = II->getArgOperand(0); + Ins1 = II->getArgOperand(1); + } else { + return false; + } + } // Do not convert the vector condition of a vector select into a scalar // condition. That may cause problems for codegen because of differences in // boolean formats and register-file transfers. // TODO: Can we account for that in the cost model?
- bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; - if (IsCmp) + if (isa<CmpInst>(I)) for (User *U : I.users()) if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value()))) return false; @@ -1085,15 +1094,24 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { unsigned Opcode = I.getOpcode(); InstructionCost ScalarOpCost, VectorOpCost; - if (IsCmp) { + if (isa<CmpInst>(I)) { CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); ScalarOpCost = TTI.getCmpSelInstrCost( Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind); VectorOpCost = TTI.getCmpSelInstrCost( Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind); - } else { + } else if (isa<BinaryOperator>(I)) { ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind); VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind); + } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) { + IntrinsicCostAttributes ScalarICA( + II->getIntrinsicID(), ScalarTy, + SmallVector<Type *>(II->arg_size(), ScalarTy)); + ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind); + IntrinsicCostAttributes VectorICA( + II->getIntrinsicID(), VecTy, + SmallVector<Type *>(II->arg_size(), VecTy)); + VectorOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind); } // Get cost estimate for the insert element. This cost will factor into @@ -1112,10 +1130,12 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> // inselt NewVecC, (scalar_op V0, V1), Index - if (IsCmp) + if (isa<CmpInst>(I)) ++NumScalarCmp; - else + else if (isa<BinaryOperator>(I)) ++NumScalarBO; + else if (isa<IntrinsicInst>(I)) + ++NumScalarIntrinsic; // For constant cases, extract the scalar element, this should constant fold. if (IsConst0) @@ -1123,9 +1143,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { if (IsConst1) V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index)); - Value *Scalar = - IsCmp ?
Builder.CreateCmp(Pred, V0, V1) - : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); + Value *Scalar; + if (isa<CmpInst>(I)) + Scalar = Builder.CreateCmp(Pred, V0, V1); + else if (isa<BinaryOperator>(I)) + Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); + else if (auto *II = dyn_cast<IntrinsicInst>(&I)) + Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), {V0, V1}); + else + llvm_unreachable("Unexpected instruction type"); Scalar->setName(I.getName() + ".scalar"); @@ -1135,9 +1161,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { ScalarInst->copyIRFlags(&I); // Fold the vector constants in the original vectors into a new base vector. - Value *NewVecC = - IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1) - : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); + Value *NewVecC; + if (isa<CmpInst>(I)) + NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1); + else if (isa<BinaryOperator>(I)) + NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); + else if (auto *II = dyn_cast<IntrinsicInst>(&I)) + NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1}); + else + llvm_unreachable("Unexpected instruction type"); Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); replaceValue(I, *Insert); return true; diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll index 55b78c4716bc0..5a25f5faf8911 100644 --- a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll +++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll @@ -4,9 +4,9 @@ define <4 x i32> @umax_fixed(i32 %x, i32 %y) { ; CHECK-LABEL: define <4 x i32> @umax_fixed( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { -; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 -; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i32 0 -; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x
i32> [[Y_INSERT]]) +; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison) +; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0 ; CHECK-NEXT: ret <4 x i32> [[V]] ; %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 @@ -18,9 +18,9 @@ define <4 x i32> @umax_fixed(i32 %x, i32 %y) { define @umax_scalable(i32 %x, i32 %y) { ; CHECK-LABEL: define @umax_scalable( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { -; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement poison, i32 [[X]], i32 0 -; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement poison, i32 [[Y]], i32 0 -; CHECK-NEXT: [[V:%.*]] = call @llvm.umax.nxv4i32( [[X_INSERT]], [[Y_INSERT]]) +; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.umax.nxv4i32( poison, poison) +; CHECK-NEXT: [[V:%.*]] = insertelement [[TMP1]], i32 [[V_SCALAR]], i64 0 ; CHECK-NEXT: ret [[V]] ; %x.insert = insertelement poison, i32 %x, i32 0 @@ -32,8 +32,9 @@ define @umax_scalable(i32 %x, i32 %y) { define <4 x i32> @umax_fixed_lhs_const(i32 %x) { ; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const( ; CHECK-SAME: i32 [[X:%.*]]) { -; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 -; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> , <4 x i32> [[X_INSERT]]) +; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> , <4 x i32> poison) +; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0 ; CHECK-NEXT: ret <4 x i32> [[V]] ; %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 @@ -44,8 +45,9 @@ define <4 x i32> @umax_fixed_lhs_const(i32 %x) { define <4 x i32> @umax_fixed_rhs_const(i32 %x) { ; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const( ; CHECK-SAME: i32 
[[X:%.*]]) { -; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 -; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> ) +; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> ) +; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0 ; CHECK-NEXT: ret <4 x i32> [[V]] ; %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 @@ -56,8 +58,9 @@ define <4 x i32> @umax_fixed_rhs_const(i32 %x) { define @umax_scalable_lhs_const(i32 %x) { ; CHECK-LABEL: define @umax_scalable_lhs_const( ; CHECK-SAME: i32 [[X:%.*]]) { -; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement poison, i32 [[X]], i32 0 -; CHECK-NEXT: [[V:%.*]] = call @llvm.umax.nxv4i32( splat (i32 42), [[X_INSERT]]) +; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]]) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.umax.nxv4i32( splat (i32 42), poison) +; CHECK-NEXT: [[V:%.*]] = insertelement [[TMP1]], i32 [[V_SCALAR]], i64 0 ; CHECK-NEXT: ret [[V]] ; %x.insert = insertelement poison, i32 %x, i32 0 @@ -68,8 +71,9 @@ define @umax_scalable_lhs_const(i32 %x) { define @umax_scalable_rhs_const(i32 %x) { ; CHECK-LABEL: define @umax_scalable_rhs_const( ; CHECK-SAME: i32 [[X:%.*]]) { -; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement poison, i32 [[X]], i32 0 -; CHECK-NEXT: [[V:%.*]] = call @llvm.umax.nxv4i32( [[X_INSERT]], splat (i32 42)) +; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.umax.nxv4i32( poison, splat (i32 42)) +; CHECK-NEXT: [[V:%.*]] = insertelement [[TMP1]], i32 [[V_SCALAR]], i64 0 ; CHECK-NEXT: ret [[V]] ; %x.insert = insertelement poison, i32 %x, i32 0 From 43743048ab11bfd108b377b8d6ba4f6d55472fd9 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 1 May 2025 15:50:24 +0800 Subject: [PATCH 03/10] clang-format --- 
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 7a7c533267f6f..4f018f5af03a5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1167,7 +1167,8 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { else if (isa<BinaryOperator>(I)) NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); else if (auto *II = dyn_cast<IntrinsicInst>(&I)) - NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1}); + NewVecC = + Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1}); else llvm_unreachable("Unexpected instruction type"); Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); From 968f0613d4960ad4bceb556a3cee90193f28d621 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 1 May 2025 17:21:14 +0800 Subject: [PATCH 04/10] Check isVectorIntrinsicWithScalarOpAtArg --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6 +++++- .../VectorCombine/RISCV/intrinsic-scalarize.ll | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 4f018f5af03a5..345283862ec60 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1027,7 +1027,11 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) { if (auto *II = dyn_cast<IntrinsicInst>(&I); II && II->arg_size() == 2 && - isTriviallyVectorizable(II->getIntrinsicID())) { + isTriviallyVectorizable(II->getIntrinsicID()) && + none_of(index_range(0, II->arg_size()), [this, &II](size_t OpIdx) { + return isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), OpIdx, + &TTI); + })) { Ins0 = II->getArgOperand(0); Ins1 =
II->getArgOperand(1); } else { diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll index 5a25f5faf8911..e12b1ca99c6d1 100644 --- a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll +++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll @@ -95,3 +95,16 @@ define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) { %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert) ret <4 x i32> %v } + +; TODO: We should be able to scalarize this if we preserve the scalar argument. +define <4 x float> @scalar_argument(float %x) { +; CHECK-LABEL: define <4 x float> @scalar_argument( +; CHECK-SAME: float [[X:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X_INSERT]], i32 42) +; CHECK-NEXT: ret <4 x float> [[V]] +; + %x.insert = insertelement <4 x float> poison, float %x, i32 0 + %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42) + ret <4 x float> %v +} From d559e157d34c3a90921ead867f4f576e5826cb7d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 1 May 2025 17:30:59 +0800 Subject: [PATCH 05/10] Just check all arguments have same type as return --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 9 +++++---- .../VectorCombine/RISCV/intrinsic-scalarize.ll | 12 ++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 345283862ec60..57a0ca80361bf 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1025,13 +1025,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { Value *Ins0, *Ins1; if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && !match(&I, m_Cmp(Pred, m_Value(Ins0), 
m_Value(Ins1)))) { + // TODO: Allow unary and ternary intrinsics + // TODO: Allow intrinsics with different arguments types + // TODO: Allow intrinsics with scalar arguments if (auto *II = dyn_cast<IntrinsicInst>(&I); II && II->arg_size() == 2 && isTriviallyVectorizable(II->getIntrinsicID()) && - none_of(index_range(0, II->arg_size()), [this, &II](size_t OpIdx) { - return isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), OpIdx, - &TTI); - })) { + all_of(II->args(), + [&II](Value *Arg) { return Arg->getType() == II->getType(); })) { Ins0 = II->getArgOperand(0); Ins1 = II->getArgOperand(1); } else { diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll index e12b1ca99c6d1..e7683d72a052d 100644 --- a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll +++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll @@ -108,3 +108,15 @@ define <4 x float> @scalar_argument(float %x) { %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42) ret <4 x float> %v } + +define <4 x i2> @scmp(i32 %x) { +; CHECK-LABEL: define <4 x i2> @scmp( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[V:%.*]] = call <4 x i2> @llvm.scmp.v4i2.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> zeroinitializer) +; CHECK-NEXT: ret <4 x i2> [[V]] +; + %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0 + %v = call <4 x i2> @llvm.scmp(<4 x i32> %x.insert, <4 x i32> splat (i32 0)) + ret <4 x i2> %v +} From 3c3f7e32184e6c89919a6a59ca9c8ded35bd6b10 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 1 May 2025 17:47:43 +0800 Subject: [PATCH 06/10] Fix vector ICA type, add llvm_unreachable --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index
57a0ca80361bf..3d9aac56d959b 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1116,8 +1116,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { IntrinsicCostAttributes VectorICA( II->getIntrinsicID(), VecTy, SmallVector<Type *>(II->arg_size(), VecTy)); - VectorOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind); - } + VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind); + } else + llvm_unreachable("Unexpected instrucion type"); // Get cost estimate for the insert element. This cost will factor into // both sequences. From fea2417523b0bc6bf7ddecde509f8258b23e1d72 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 1 May 2025 17:48:57 +0800 Subject: [PATCH 07/10] Fix comment typo --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 3d9aac56d959b..39dd5141b245d 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1026,7 +1026,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) { // TODO: Allow unary and ternary intrinsics - // TODO: Allow intrinsics with different arguments types + // TODO: Allow intrinsics with different argument types // TODO: Allow intrinsics with scalar arguments if (auto *II = dyn_cast<IntrinsicInst>(&I); II && II->arg_size() == 2 && From fbce2ad422a90c5d5fb307e43168788ab7ac7120 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sat, 3 May 2025 20:50:26 +0800 Subject: [PATCH 08/10] Replace llvm_unreachable with cast --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 39dd5141b245d..a2ffbdacb26d1 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1108,7 +1108,8 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { } else if (isa<BinaryOperator>(I)) { ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind); VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind); - } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) { + } else { + auto *II = cast<IntrinsicInst>(&I); IntrinsicCostAttributes ScalarICA( II->getIntrinsicID(), ScalarTy, SmallVector<Type *>(II->arg_size(), ScalarTy)); @@ -1117,8 +1118,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { II->getIntrinsicID(), VecTy, SmallVector<Type *>(II->arg_size(), VecTy)); VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind); - } else - llvm_unreachable("Unexpected instrucion type"); + } // Get cost estimate for the insert element. This cost will factor into // both sequences. @@ -1154,10 +1154,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { Scalar = Builder.CreateCmp(Pred, V0, V1); else if (isa<BinaryOperator>(I)) Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); - else if (auto *II = dyn_cast<IntrinsicInst>(&I)) - Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), {V0, V1}); else - llvm_unreachable("Unexpected instruction type"); + Scalar = Builder.CreateIntrinsic( + ScalarTy, cast<IntrinsicInst>(I).getIntrinsicID(), {V0, V1}); Scalar->setName(I.getName() + ".scalar"); @@ -1172,11 +1171,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1); else if (isa<BinaryOperator>(I)) NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); - else if (auto *II = dyn_cast<IntrinsicInst>(&I)) - NewVecC = - Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1}); else - llvm_unreachable("Unexpected instruction type"); + NewVecC = Builder.CreateIntrinsic( + VecTy, cast<IntrinsicInst>(I).getIntrinsicID(), {VecC0, VecC1}); Value *Insert =
Builder.CreateInsertElement(NewVecC, Scalar, Index); replaceValue(I, *Insert); return true; From c2f403d0c11065d748e85a5eecf4aa0c8236702b Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sat, 3 May 2025 23:00:52 +0800 Subject: [PATCH 09/10] Move tests out of RISC-V folder --- .../Transforms/VectorCombine/{RISCV => }/intrinsic-scalarize.ll | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/Transforms/VectorCombine/{RISCV => }/intrinsic-scalarize.ll (100%) diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll similarity index 100% rename from llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll rename to llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll From 3f305fd070fc18f0e8e31b1f9adf0a83d08da481 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 15 May 2025 10:40:55 +0100 Subject: [PATCH 10/10] Add test case for x86 maxnum Should be profitable on AVX2 but not SSE2 --- .../VectorCombine/X86/intrinsic-scalarize.ll | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll diff --git a/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll new file mode 100644 index 0000000000000..5f3229398792a --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 + +define <2 x float> @maxnum(float %x, float %y) { +; SSE2-LABEL: define <2 x float> @maxnum( +; SSE2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE2-NEXT: [[X_INSERT:%.*]] = 
insertelement <2 x float> poison, float [[X]], i32 0 +; SSE2-NEXT: [[Y_INSERT:%.*]] = insertelement <2 x float> poison, float [[Y]], i32 0 +; SSE2-NEXT: [[V:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X_INSERT]], <2 x float> [[Y_INSERT]]) +; SSE2-NEXT: ret <2 x float> [[V]] +; +; AVX2-LABEL: define <2 x float> @maxnum( +; AVX2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX2-NEXT: [[V_SCALAR:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]]) +; AVX2-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> poison, <2 x float> poison) +; AVX2-NEXT: [[V:%.*]] = insertelement <2 x float> [[TMP1]], float [[V_SCALAR]], i64 0 +; AVX2-NEXT: ret <2 x float> [[V]] +; + %x.insert = insertelement <2 x float> poison, float %x, i32 0 + %y.insert = insertelement <2 x float> poison, float %y, i32 0 + %v = call <2 x float> @llvm.maxnum(<2 x float> %x.insert, <2 x float> %y.insert) + ret <2 x float> %v +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}}