Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10573,7 +10573,8 @@ class InstructionsCompatibilityAnalysis {
/// Checks if the opcode is supported as the main opcode for copyable
/// elements.
static bool isSupportedOpcode(const unsigned Opcode) {
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
Opcode == Instruction::Shl;
}

/// Identifies the best candidate value, which represents main opcode
Expand Down Expand Up @@ -10890,6 +10891,7 @@ class InstructionsCompatibilityAnalysis {
switch (MainOpcode) {
case Instruction::Add:
case Instruction::LShr:
case Instruction::Shl:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
Expand Down Expand Up @@ -21902,6 +21904,8 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
if (E.isCopyableElement(V))
return true;
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
return AmtKnownBits.getMaxValue().ult(BitWidth);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,10 @@ define void @test() {
; CHECK-NEXT: [[SUB4_I_I65_US:%.*]] = or i64 0, 1
; CHECK-NEXT: br label [[BODY:%.*]]
; CHECK: body:
; CHECK-NEXT: [[ADD_I_I62_US:%.*]] = shl i64 0, 0
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> <i64 poison, i64 1>, i64 [[ADD_I_I62_US]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP2]], i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 1>), i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0
; CHECK-NEXT: br label [[BODY]]
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ define i32 @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 152, ptr @f, align 4
; CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD_I:%.*]] = load i32, ptr @f, align 4
; CHECK-NEXT: [[ADD_I_I:%.*]] = shl i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], 24
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[ADD_I_I]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = shl <8 x i32> [[TMP3]], <i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> <i32 83886080, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[TMP1]], splat (i32 24)
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[TMP2]], <i32 66440127, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
Expand Down
32 changes: 6 additions & 26 deletions llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -247,32 +247,12 @@ entry:
}

define void @shl0(ptr noalias %dst, ptr noalias %src) {
; NON-POW2-LABEL: @shl0(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @shl0(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
; POW2-ONLY-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
; POW2-ONLY-NEXT: ret void
; CHECK-LABEL: @shl0(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}


define i1 @test(i32 %0, i32 %1, i32 %p) {
; CHECK-LABEL: define i1 @test(
; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
; CHECK-NEXT: ret i1 [[OP_RDX2]]
; X86-LABEL: define i1 @test(
; X86-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
; X86-NEXT: entry:
; X86-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; X86-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
; X86-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
; X86-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
; X86-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
; X86-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; X86-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
; X86-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
; X86-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
; X86-NEXT: ret i1 [[OP_RDX2]]
;
; AARCH64-LABEL: define i1 @test(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is surprising given aarch64 has better non-uniform vector shift handling than x86/SSE

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexey-bataev were you able to get to the bottom of this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, this test shows it. X86 fails to vectorize initial reduction

    %cmp6 = icmp slt i32 0, %p
    %cmp2 = icmp slt i32 %shl1, 0
    %cmp3 = icmp slt i32 %shl2, 0
    %cmp4 = icmp slt i32 %shl3, 0

while AArch64 is able to vectorize it.
X86 instead vectorizes next candidate:

    %cmp2 = icmp slt i32 %shl1, 0
    %cmp3 = icmp slt i32 %shl2, 0
    %cmp4 = icmp slt i32 %shl3, 0
    %cmp5 = icmp slt i32 %shl4, 0

AArch64 should do it better with enabled non-power-of-2

; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
; AARCH64-NEXT: entry:
; AARCH64-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; AARCH64-NEXT: [[SHL4:%.*]] = shl i32 0, [[TMP1]]
; AARCH64-NEXT: [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0
; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP1]], i32 1
; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
; AARCH64-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[P]], i32 0
; AARCH64-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]]
; AARCH64-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]]
; AARCH64-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
; AARCH64-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]]
; AARCH64-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
; AARCH64-NEXT: [[TMP9:%.*]] = freeze i1 [[OP_RDX]]
; AARCH64-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]]
; AARCH64-NEXT: ret i1 [[OP_RDX2]]
;
entry:
%cmp1 = icmp sgt i32 %0, 0
Expand Down
Loading