Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14195,6 +14195,61 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
return DAG.getZExtOrTrunc(NewAbs, SDLoc(Extend), VT);
}

// Try to widen the build vector and bitcast it to the type of the zext:
//   zext (build_vector x, y, ...) -> bitcast (build_vector x, 0, y, 0, ...)
// This is a special case for 128-bit vector types. The intention is to remove
// the zext and replace it with a bitcast to the wider type. During lowering
// the bitcast is removed, avoiding the extra computation the zext would
// otherwise require.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DAG format comment of the matched pattern

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arsenm Add a comment such as zext(build_vec) -> bitcast(build_vec), which visualizes the transformation — is that what you mean?

static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
  assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");

  // Result type of the zext; the bitcast we build below must have this type.
  EVT ExtendVT = Extend->getValueType(0);

  SDValue BV = Extend->getOperand(0);
  if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
    return SDValue();

  // The transform places zero lanes *after* each element, which matches the
  // in-memory layout of a zero-extended element only on little-endian
  // targets.
  if (!DAG.getDataLayout().isLittleEndian())
    return SDValue();

  SDLoc DL(BV);
  EVT VT = BV.getValueType();
  EVT EltVT = BV.getOperand(0).getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  unsigned WidenNumElts = WidenVT.getVectorNumElements();
  assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");

  // Only handle the case where the widened type is a 128-bit vector whose
  // bit width matches the zext result, so the bitcast is size-preserving.
  if (!WidenVT.is128BitVector() ||
      Extend->getValueSizeInBits(0) != WidenVT.getSizeInBits())
    return SDValue();

  // Spread the original elements Step lanes apart and fill the gaps with
  // zeroes: lane i of the zext result occupies lanes [i*Step, (i+1)*Step)
  // of the widened vector, with the value in the low lane and zeroes above.
  // (Equivalent to the swap-based shuffle this replaces, but direct.)
  unsigned Step = WidenNumElts / NumElts;
  SmallVector<SDValue, 16> NewOps(WidenNumElts,
                                  DAG.getConstant(0, DL, EltVT));
  for (unsigned I = 0; I != NumElts; ++I)
    NewOps[I * Step] = BV.getOperand(I);

  // Create the widened build vector and bitcast it to the type of the zext.
  SDValue NewBV = DAG.getBuildVector(WidenVT, DL, NewOps);
  SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
  LLVM_DEBUG(dbgs() << DAG.getMachineFunction().getFunction().getName()
                    << " - Widening buildvector and replace zext with bitcast\n";
             BV.dump(); Extend->dump(); dbgs() << " to \n";
             NewBV->dump(); NewBVBitcast->dump(););
  // Return the replacement value (same type as the zext) and let the
  // combiner perform the use replacement. The previous version called
  // ReplaceAllUsesOfValueWith itself and then returned NewBV, whose type
  // differs from the zext result — violating the combiner's invariant that
  // a returned value has the visited node's type.
  return NewBVBitcast;
}

SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
Expand Down Expand Up @@ -14521,6 +14576,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
return SDValue(CSENode, 0);
}

if (SDValue V = widenBuildVec(N, DAG))
return V;

return SDValue();
}

Expand Down
258 changes: 258 additions & 0 deletions llvm/test/CodeGen/X86/WidenBuildVector.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mcpu=znver5 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should be able to remove this WidenBuildVector.ll file now - I've pushed 8b2d269 which contains these (cleaned up) tests for a wider range of x86 targets - please can you merge with trunk to see the codegen diffs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure will do it


; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
; Loads 8 i8 elements from %a at byte stride %a_stride, zero-extends them to
; <8 x i32>, multiplies by the sign-extended <8 x i16> loaded from %b, and
; returns the horizontal add of the products (v8i8 zext case).
define dso_local i32 @foov8i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
; CHECK-LABEL: foov8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
; CHECK-NEXT: leaq (%rsi,%rsi,4), %r8
; CHECK-NEXT: leaq (,%rsi,8), %r9
; CHECK-NEXT: subq %rsi, %r9
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $14, (%rdi,%r9), %xmm0, %xmm0
; CHECK-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: retq
entry:
  ; Strided scalar loads: element k comes from %a + k*%a_stride.
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%mul.2 = shl nsw i64 %a_stride, 1
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
%var2 = load i8, ptr %arrayidx.2, align 1
%mul.3 = mul nsw i64 %a_stride, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
%var3 = load i8, ptr %arrayidx.3, align 1
%mul.4 = shl nsw i64 %a_stride, 2
%arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 %mul.4
%var4 = load i8, ptr %arrayidx.4, align 1
%mul.5 = mul nsw i64 %a_stride, 5
%arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 %mul.5
%var5 = load i8, ptr %arrayidx.5, align 1
%mul.6 = mul nsw i64 %a_stride, 6
%arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 %mul.6
%var6 = load i8, ptr %arrayidx.6, align 1
%mul.7 = mul nsw i64 %a_stride, 7
%arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 %mul.7
%var7 = load i8, ptr %arrayidx.7, align 1
  ; Assemble the <8 x i8>, zext it, then multiply-and-reduce.
%var8 = insertelement <8 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <8 x i8> %var8, i8 %var1, i64 1
%var10 = insertelement <8 x i8> %var9, i8 %var2, i64 2
%var11 = insertelement <8 x i8> %var10, i8 %var3, i64 3
%var12 = insertelement <8 x i8> %var11, i8 %var4, i64 4
%var13 = insertelement <8 x i8> %var12, i8 %var5, i64 5
%var14 = insertelement <8 x i8> %var13, i8 %var6, i64 6
%var15 = insertelement <8 x i8> %var14, i8 %var7, i64 7
%var16 = zext <8 x i8> %var15 to <8 x i32>
%var17 = load <8 x i16>, ptr %b, align 2
%var18 = sext <8 x i16> %var17 to <8 x i32>
%var19 = mul nsw <8 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %var19)
ret i32 %var20
}

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
; Same strided-load, zext-multiply-reduce pattern as foov8i8, but with only
; 4 i8 elements (v4i8 zext to v4i32 case).
define dso_local i32 @foov4i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
; CHECK-LABEL: foov4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
; CHECK-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: retq
entry:
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%mul.2 = shl nsw i64 %a_stride, 1
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
%var2 = load i8, ptr %arrayidx.2, align 1
%mul.3 = mul nsw i64 %a_stride, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
%var3 = load i8, ptr %arrayidx.3, align 1
%var8 = insertelement <4 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <4 x i8> %var8, i8 %var1, i64 1
%var10 = insertelement <4 x i8> %var9, i8 %var2, i64 2
%var11 = insertelement <4 x i8> %var10, i8 %var3, i64 3
%var16 = zext <4 x i8> %var11 to <4 x i32>
%var17 = load <4 x i16>, ptr %b, align 2
%var18 = sext <4 x i16> %var17 to <4 x i32>
%var19 = mul nsw <4 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
ret i32 %var20
}

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
; Two-element variant: zext <2 x i8> to <2 x i32>, multiply by sext'd
; <2 x i16> from %b, and reduce-add to an i32.
define dso_local i32 @foov2i8(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
; CHECK-LABEL: foov2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: retq
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
%var16 = zext <2 x i8> %var9 to <2 x i32>
%var17 = load <2 x i16>, ptr %b, align 2
%var18 = sext <2 x i16> %var17 to <2 x i32>
%var19 = mul nsw <2 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
ret i32 %var20
}

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
; Widest extension ratio: zext <2 x i8> all the way to <2 x i64>, multiply by
; sext'd <2 x i8> from %b, and reduce-add to an i64.
define dso_local i64 @foov2i8_v2i64(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
; CHECK-LABEL: foov2i8_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: vpmovsxbq (%rdx), %xmm1
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: retq
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
%var16 = zext <2 x i8> %var9 to <2 x i64>
%var17 = load <2 x i8>, ptr %b, align 2
%var18 = sext <2 x i8> %var17 to <2 x i64>
%var19 = mul nsw <2 x i64> %var18, %var16
%var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
ret i64 %var20
}


; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
; i16-element variant: 4 strided i16 loads, zext <4 x i16> to <4 x i32>,
; multiply by sext'd <4 x i16> from %b, and reduce-add to an i32.
define dso_local i32 @foov4i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
; CHECK-LABEL: foov4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: leaq (%rsi,%rsi,2), %rcx
; CHECK-NEXT: vpmovsxwd (%rdx), %xmm1
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
; CHECK-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: retq
entry:
%var0 = load i16, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i16, ptr %arrayidx.1, align 1
%mul.2 = shl nsw i64 %a_stride, 1
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
%var2 = load i16, ptr %arrayidx.2, align 1
%mul.3 = mul nsw i64 %a_stride, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
%var3 = load i16, ptr %arrayidx.3, align 1
%var8 = insertelement <4 x i16> poison, i16 %var0, i64 0
%var9 = insertelement <4 x i16> %var8, i16 %var1, i64 1
%var10 = insertelement <4 x i16> %var9, i16 %var2, i64 2
%var11 = insertelement <4 x i16> %var10, i16 %var3, i64 3
%var16 = zext <4 x i16> %var11 to <4 x i32>
%var17 = load <4 x i16>, ptr %b, align 2
%var18 = sext <4 x i16> %var17 to <4 x i32>
%var19 = mul nsw <4 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
ret i32 %var20
}

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
; Two-element i16 variant: zext <2 x i16> to <2 x i32>, multiply by sext'd
; <2 x i16> from %b, and reduce-add to an i32.
define dso_local i32 @foov2i16(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
; CHECK-LABEL: foov2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
; CHECK-NEXT: vpmovsxwd %xmm1, %xmm1
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: retq
%var0 = load i16, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i16, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i16> poison, i16 %var0, i64 0
%var9 = insertelement <2 x i16> %var8, i16 %var1, i64 1
%var16 = zext <2 x i16> %var9 to <2 x i32>
%var17 = load <2 x i16>, ptr %b, align 2
%var18 = sext <2 x i16> %var17 to <2 x i32>
%var19 = mul nsw <2 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
ret i32 %var20
}

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable
; i32-element variant: zext <2 x i32> to <2 x i64>, multiply by sext'd
; <2 x i32> from %b, and reduce-add to an i64.
define dso_local i64 @foov2i32(ptr nocapture noundef readonly %a, i64 noundef %a_stride, ptr nocapture noundef readonly %b) local_unnamed_addr {
; CHECK-LABEL: foov2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; CHECK-NEXT: vpmovsxdq (%rdx), %xmm1
; CHECK-NEXT: vpmullq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: retq
%var0 = load i32, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i32, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i32> poison, i32 %var0, i64 0
%var9 = insertelement <2 x i32> %var8, i32 %var1, i64 1
%var16 = zext <2 x i32> %var9 to <2 x i64>
%var17 = load <2 x i32>, ptr %b, align 2
%var18 = sext <2 x i32> %var17 to <2 x i64>
%var19 = mul nsw <2 x i64> %var18, %var16
%var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
ret i64 %var20
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #1
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #1
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #1
Loading