Skip to content

Commit e4a4bb0

Browse files
Himadhithhimadhithtonykuttai
authored
[PowerPC] Replace vspltisw+vadduwm instructions with xxleqv+vsubuwm for adding the vector {1, 1, 1, 1} (#160882)
This patch optimizes vector additions whose second operand is a vector with every element equal to 1. Instead of materializing the vector of 1s (using `vspltisw`), it generates a vector of -1s (using `xxleqv`), which is cheaper, and subtracts it. These are the respective vector types. `v2i64`: **`A + vector {1, 1}`** `v4i32`: **`A + vector {1, 1, 1, 1}`** `v8i16`: **`A + vector {1, 1, 1, 1, 1, 1, 1, 1}`** `v16i8`: **`A + vector {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}`** The optimized version replaces `vspltisw` (4 cycles) with `xxleqv` (2 cycles) using the following identity: `A - (-1) = A + 1`. --------- Co-authored-by: himadhith <[email protected]> Co-authored-by: Tony Varghese <[email protected]>
1 parent cc5185b commit e4a4bb0

File tree

3 files changed

+56
-13
lines changed

3 files changed

+56
-13
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19280,13 +19280,58 @@ static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
1928019280
return MatPCRel;
1928119281
}
1928219282

19283+
// Transform (add X, (build_vector (T 1), (T 1), ...)) -> (sub X, (XXLEQVOnes))
19284+
// XXLEQVOnes creates an all-1s vector (0xFFFFFFFF...) efficiently via xxleqv
19285+
// Mathematical identity: X + 1 = X - (-1)
19286+
// Applies to v4i32, v2i64, v8i16, v16i8 where all elements are constant 1
19287+
// Requirement: VSX feature for efficient xxleqv generation
19288+
static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
19289+
const PPCSubtarget &Subtarget) {
19290+
19291+
EVT VT = N->getValueType(0);
19292+
if (!Subtarget.hasVSX())
19293+
return SDValue();
19294+
19295+
// Handle v2i64, v4i32, v8i16 and v16i8 types
19296+
if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
19297+
VT == MVT::v2i64))
19298+
return SDValue();
19299+
19300+
SDValue LHS = N->getOperand(0);
19301+
SDValue RHS = N->getOperand(1);
19302+
19303+
// Check if RHS is BUILD_VECTOR
19304+
if (RHS.getOpcode() != ISD::BUILD_VECTOR)
19305+
return SDValue();
19306+
19307+
// Check if all the elements are 1
19308+
unsigned NumOfEles = RHS.getNumOperands();
19309+
for (unsigned i = 0; i < NumOfEles; ++i) {
19310+
auto *CN = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
19311+
if (!CN || CN->getSExtValue() != 1)
19312+
return SDValue();
19313+
}
19314+
SDLoc DL(N);
19315+
19316+
SDValue MinusOne = DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32);
19317+
SmallVector<SDValue, 4> Ops(4, MinusOne);
19318+
SDValue AllOnesVec = DAG.getBuildVector(MVT::v4i32, DL, Ops);
19319+
19320+
// Bitcast to the target vector type
19321+
SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT, AllOnesVec);
19322+
19323+
return DAG.getNode(ISD::SUB, DL, VT, LHS, Bitcast);
19324+
}
19325+
1928319326
// Tries the add combines on N in a fixed order: ADDZE, then MAT_PCREL_ADDR,
// then the add-to-sub rewrite. The first helper that produces a value decides
// the result; an empty SDValue is returned when none of them applies.
SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  if (SDValue ViaADDZE = combineADDToADDZE(N, DAG, Subtarget))
    return ViaADDZE;

  if (SDValue ViaPCRel = combineADDToMAT_PCREL_ADDR(N, DAG, Subtarget))
    return ViaPCRel;

  // Last candidate: its result, empty or not, is the overall result.
  return combineADDToSUB(N, DAG, Subtarget);
}
1929219337

llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,14 @@
88
; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
99
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
1010

11-
; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation.
11+
; Optimized version which uses `xxleqv` to generate a vector of -1s and a `vsubu*m` subtract, leveraging the identity A - (-1) = A + 1.
1212

1313
; Function for the vector type v2i64 `a + {1, 1}`
1414
define <2 x i64> @test_v2i64(<2 x i64> %a) {
1515
; CHECK-LABEL: test_v2i64:
1616
; CHECK: # %bb.0: # %entry
17-
; CHECK-NEXT: vspltisw v3, 1
18-
; CHECK-NEXT: vupklsw v3, v3
19-
; CHECK-NEXT: vaddudm v2, v2, v3
17+
; CHECK-NEXT: xxleqv v3, v3, v3
18+
; CHECK-NEXT: vsubudm v2, v2, v3
2019
; CHECK-NEXT: blr
2120
entry:
2221
%add = add <2 x i64> %a, splat (i64 1)
@@ -27,8 +26,8 @@ entry:
2726
define <4 x i32> @test_v4i32(<4 x i32> %a) {
2827
; CHECK-LABEL: test_v4i32:
2928
; CHECK: # %bb.0: # %entry
30-
; CHECK-NEXT: vspltisw v3, 1
31-
; CHECK-NEXT: vadduwm v2, v2, v3
29+
; CHECK-NEXT: xxleqv v3, v3, v3
30+
; CHECK-NEXT: vsubuwm v2, v2, v3
3231
; CHECK-NEXT: blr
3332
entry:
3433
%add = add <4 x i32> %a, splat (i32 1)
@@ -39,8 +38,8 @@ entry:
3938
define <8 x i16> @test_v8i16(<8 x i16> %a) {
4039
; CHECK-LABEL: test_v8i16:
4140
; CHECK: # %bb.0: # %entry
42-
; CHECK-NEXT: vspltish v3, 1
43-
; CHECK-NEXT: vadduhm v2, v2, v3
41+
; CHECK-NEXT: xxleqv v3, v3, v3
42+
; CHECK-NEXT: vsubuhm v2, v2, v3
4443
; CHECK-NEXT: blr
4544
entry:
4645
%add = add <8 x i16> %a, splat (i16 1)
@@ -51,8 +50,8 @@ entry:
5150
define <16 x i8> @test_16i8(<16 x i8> %a) {
5251
; CHECK-LABEL: test_16i8:
5352
; CHECK: # %bb.0: # %entry
54-
; CHECK-NEXT: xxspltib v3, 1
55-
; CHECK-NEXT: vaddubm v2, v2, v3
53+
; CHECK-NEXT: xxleqv v3, v3, v3
54+
; CHECK-NEXT: vsububm v2, v2, v3
5655
; CHECK-NEXT: blr
5756
entry:
5857
%add = add <16 x i8> %a, splat (i8 1)

llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ define <2 x i64> @test_add(<2 x i64> %x, <2 x i64> %y) nounwind {
1616
define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind {
1717
; VSX-LABEL: increment_by_one:
1818
; VSX: # %bb.0:
19-
; VSX-NEXT: vspltisw 3, 1
20-
; VSX-NEXT: vupklsw 3, 3
21-
; VSX-NEXT: vaddudm 2, 2, 3
19+
; VSX-NEXT: xxleqv 35, 35, 35
20+
; VSX-NEXT: vsubudm 2, 2, 3
2221
; VSX-NEXT: blr
2322
;
2423
; NOVSX-LABEL: increment_by_one:

0 commit comments

Comments
 (0)