Skip to content

Commit 5d7f324

Browse files
[SLP]Enable Shl as a base opcode in copyables (#156766)
Enables Shl matching for nodes in which a copyable element can be modelled as `shl %v, 0`.
1 parent 5547c0c commit 5d7f324

File tree

5 files changed

+54
-55
lines changed

5 files changed

+54
-55
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10620,7 +10620,8 @@ class InstructionsCompatibilityAnalysis {
1062010620
/// Checks if the opcode is supported as the main opcode for copyable
1062110621
/// elements.
1062210622
static bool isSupportedOpcode(const unsigned Opcode) {
10623-
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10623+
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10624+
Opcode == Instruction::Shl;
1062410625
}
1062510626

1062610627
/// Identifies the best candidate value, which represents main opcode
@@ -10937,6 +10938,7 @@ class InstructionsCompatibilityAnalysis {
1093710938
switch (MainOpcode) {
1093810939
case Instruction::Add:
1093910940
case Instruction::LShr:
10941+
case Instruction::Shl:
1094010942
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
1094110943
break;
1094210944
default:
@@ -22006,6 +22008,8 @@ bool BoUpSLP::collectValuesToDemote(
2200622008
return all_of(E.Scalars, [&](Value *V) {
2200722009
if (isa<PoisonValue>(V))
2200822010
return true;
22011+
if (E.isCopyableElement(V))
22012+
return true;
2200922013
auto *I = cast<Instruction>(V);
2201022014
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
2201122015
return AmtKnownBits.getMaxValue().ult(BitWidth);

llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,10 @@ define void @test() {
1010
; CHECK-NEXT: [[SUB4_I_I65_US:%.*]] = or i64 0, 1
1111
; CHECK-NEXT: br label [[BODY:%.*]]
1212
; CHECK: body:
13-
; CHECK-NEXT: [[ADD_I_I62_US:%.*]] = shl i64 0, 0
14-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> <i64 poison, i64 1>, i64 [[ADD_I_I62_US]], i32 0
15-
; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
16-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> [[TMP1]]
17-
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP2]], i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
18-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
19-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
20-
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]]
13+
; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 1>), i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
14+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
15+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
16+
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
2117
; CHECK-NEXT: [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0
2218
; CHECK-NEXT: br label [[BODY]]
2319
;

llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ define i32 @test() {
88
; CHECK-NEXT: [[ENTRY:.*:]]
99
; CHECK-NEXT: store i32 152, ptr @f, align 4
1010
; CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD_I:%.*]] = load i32, ptr @f, align 4
11-
; CHECK-NEXT: [[ADD_I_I:%.*]] = shl i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], 24
12-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[ADD_I_I]], i32 0
11+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], i32 0
12+
; CHECK-NEXT: [[TMP0:%.*]] = shl <8 x i32> [[TMP3]], <i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
1313
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> <i32 83886080, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, [[TMP0]]
1414
; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[TMP1]], splat (i32 24)
1515
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[TMP2]], <i32 66440127, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>

llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -247,32 +247,12 @@ entry:
247247
}
248248

249249
define void @shl0(ptr noalias %dst, ptr noalias %src) {
250-
; NON-POW2-LABEL: @shl0(
251-
; NON-POW2-NEXT: entry:
252-
; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
253-
; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
254-
; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
255-
; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
256-
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
257-
; NON-POW2-NEXT: [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
258-
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
259-
; NON-POW2-NEXT: ret void
260-
;
261-
; POW2-ONLY-LABEL: @shl0(
262-
; POW2-ONLY-NEXT: entry:
263-
; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
264-
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
265-
; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
266-
; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
267-
; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
268-
; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
269-
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
270-
; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
271-
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
272-
; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
273-
; POW2-ONLY-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
274-
; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
275-
; POW2-ONLY-NEXT: ret void
250+
; CHECK-LABEL: @shl0(
251+
; CHECK-NEXT: entry:
252+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
253+
; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
254+
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
255+
; CHECK-NEXT: ret void
276256
;
277257
entry:
278258
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1

llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,44 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2-
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
3-
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
2+
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
3+
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
44

55

66
define i1 @test(i32 %0, i32 %1, i32 %p) {
7-
; CHECK-LABEL: define i1 @test(
8-
; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
9-
; CHECK-NEXT: entry:
10-
; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
11-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
12-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
13-
; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
14-
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
15-
; CHECK-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
16-
; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
17-
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
18-
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
19-
; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
20-
; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
21-
; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
22-
; CHECK-NEXT: ret i1 [[OP_RDX2]]
7+
; X86-LABEL: define i1 @test(
8+
; X86-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
9+
; X86-NEXT: entry:
10+
; X86-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
11+
; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
12+
; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
13+
; X86-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
14+
; X86-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
15+
; X86-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
16+
; X86-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
17+
; X86-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
18+
; X86-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
19+
; X86-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
20+
; X86-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
21+
; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
22+
; X86-NEXT: ret i1 [[OP_RDX2]]
23+
;
24+
; AARCH64-LABEL: define i1 @test(
25+
; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
26+
; AARCH64-NEXT: entry:
27+
; AARCH64-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
28+
; AARCH64-NEXT: [[SHL4:%.*]] = shl i32 0, [[TMP1]]
29+
; AARCH64-NEXT: [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0
30+
; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP1]], i32 1
31+
; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
32+
; AARCH64-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
33+
; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[P]], i32 0
34+
; AARCH64-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]]
35+
; AARCH64-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]]
36+
; AARCH64-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
37+
; AARCH64-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]]
38+
; AARCH64-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
39+
; AARCH64-NEXT: [[TMP9:%.*]] = freeze i1 [[OP_RDX]]
40+
; AARCH64-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]]
41+
; AARCH64-NEXT: ret i1 [[OP_RDX2]]
2342
;
2443
entry:
2544
%cmp1 = icmp sgt i32 %0, 0

0 commit comments

Comments
 (0)