[PowerPC] Replace vspltisw+vadduwm instructions with xxleqv+vsubuwm for adding the vector {1, 1, 1, 1} (#160882)

Himadhith · himadhith · tonykuttai · web-flow · commit e4a4bb0f6d3b · 2025-11-21T12:26:58.000+05:30
This patch optimizes vector additions in which one operand is a constant
vector with every element equal to 1 (called **`all-ones`** vectors here).
Instead of generating a vector of 1s (using `vspltisw`), it generates a
vector of -1s (using `xxleqv`), which is cheaper, and subtracts it. The
affected vector types and patterns are:
`v2i64`: **`A + vector {1, 1}`**
`v4i32`: **`A + vector {1, 1, 1, 1}`**
`v8i16`: **`A + vector {1, 1, 1, 1, 1, 1, 1, 1}`**
`v16i8`: **`A + vector {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1}`**

The optimized version replaces `vspltisw` (4 cycles) with `xxleqv` (2
cycles) and turns the add into a subtract, using the following identity:
`A - (-1) = A + 1`.

---------

Co-authored-by: himadhith <himadhith.v@ibm.com>
Co-authored-by: Tony Varghese <tonypalampalliyil@gmail.com>
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19280,13 +19280,58 @@ static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
   return MatPCRel;
 }
 
+// Transform (add X, (build_vector (T 1), (T 1), ...)) -> (sub X, (XXLEQVOnes))
+// XXLEQVOnes is an all-bits-set vector (each element is -1) built by xxleqv
+// Mathematical identity: X + 1 = X - (-1)
+// Applies to v4i32, v2i64, v8i16, v16i8 where all elements are constant 1
+// Requirement: VSX feature for efficient xxleqv generation
+static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
+                               const PPCSubtarget &Subtarget) {
+
+  EVT VT = N->getValueType(0);
+  if (!Subtarget.hasVSX())
+    return SDValue();
+
+  // Handle v2i64, v4i32, v8i16 and v16i8 types
+  if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
+        VT == MVT::v2i64))
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Check if RHS is BUILD_VECTOR
+  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check if all the elements are 1
+  unsigned NumOfEles = RHS.getNumOperands();
+  for (unsigned i = 0; i < NumOfEles; ++i) {
+    auto *CN = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
+    if (!CN || CN->getSExtValue() != 1)
+      return SDValue();
+  }
+  SDLoc DL(N);
+
+  SDValue MinusOne = DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32);
+  SmallVector<SDValue, 4> Ops(4, MinusOne);
+  SDValue AllOnesVec = DAG.getBuildVector(MVT::v4i32, DL, Ops);
+
+  // Bitcast to the target vector type
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT, AllOnesVec);
+
+  return DAG.getNode(ISD::SUB, DL, VT, LHS, Bitcast);
+}
+
 SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
     return Value;
 
   if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
     return Value;
 
+  if (auto Value = combineADDToSUB(N, DCI.DAG, Subtarget))
+    return Value;
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
@@ -8,15 +8,14 @@
 ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
 ; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
 
-; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation.
+; Optimized version which uses `xxleqv` to generate a vector of -1s and `vsubu*m` to subtract it, leveraging the identity A - (-1) = A + 1.
 
 ; Function for the vector type v2i64 `a + {1, 1}`
 define <2 x i64> @test_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: test_v2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltisw v3, 1
-; CHECK-NEXT:    vupklsw v3, v3
-; CHECK-NEXT:    vaddudm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsubudm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <2 x i64> %a, splat (i64 1)
@@ -27,8 +26,8 @@ entry:
 define <4 x i32> @test_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: test_v4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltisw v3, 1
-; CHECK-NEXT:    vadduwm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsubuwm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <4 x i32> %a, splat (i32 1)
@@ -39,8 +38,8 @@ entry:
 define <8 x i16> @test_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: test_v8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltish v3, 1
-; CHECK-NEXT:    vadduhm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsubuhm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <8 x i16> %a, splat (i16 1)
@@ -51,8 +50,8 @@ entry:
 define <16 x i8> @test_16i8(<16 x i8> %a) {
 ; CHECK-LABEL: test_16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxspltib v3, 1
-; CHECK-NEXT:    vaddubm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsububm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <16 x i8> %a, splat (i8 1)
diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
@@ -16,9 +16,8 @@ define <2 x i64> @test_add(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind {
 ; VSX-LABEL: increment_by_one:
 ; VSX:       # %bb.0:
-; VSX-NEXT:    vspltisw 3, 1
-; VSX-NEXT:    vupklsw 3, 3
-; VSX-NEXT:    vaddudm 2, 2, 3
+; VSX-NEXT:    xxleqv 35, 35, 35
+; VSX-NEXT:    vsubudm 2, 2, 3
 ; VSX-NEXT:    blr
 ;
 ; NOVSX-LABEL: increment_by_one: