[X86] combineConcatVectorOps - recursively call combineConcatVectorOps instead of predicting when ops will freely concat #130275
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

The IsConcatFree helper relies on limited estimates of where concatenating the subvector operands is beneficial. This patch replaces the (F)ADD/SUB/MUL concatenation checks with a recursive call to combineConcatVectorOps to see whether it will profitably concatenate. Other opcodes can be moved to using the CombineSubOperand helper in future patches.

Patch is 29.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130275.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 57a4c6f7a4869..4821459d7dba8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41903,7 +41903,8 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget);
+ const X86Subtarget &Subtarget,
+ unsigned Depth = 0);
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
@@ -57791,7 +57792,8 @@ CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ unsigned Depth) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
unsigned EltSizeInBits = VT.getScalarSizeInBits();
@@ -57803,6 +57805,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}))
return getZeroVector(VT, Subtarget, DAG, DL);
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
+
SDValue Op0 = Ops[0];
bool IsSplat = llvm::all_equal(Ops);
unsigned NumOps = Ops.size();
@@ -57933,6 +57938,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
return AllConstants || AllSubs;
};
+ auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
+ bool AllConstants = true;
+ SmallVector<SDValue> Subs;
+ for (SDValue SubOp : SubOps) {
+ SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
+ AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
+ Subs.push_back(SubOp.getOperand(I));
+ }
+ if (AllConstants)
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
+ return combineConcatVectorOps(DL, VT, Subs, DAG, DCI, Subtarget,
+ Depth + 1);
+ };
switch (Op0.getOpcode()) {
case ISD::VECTOR_SHUFFLE: {
@@ -58343,9 +58362,12 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
(EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
- return DAG.getNode(Op0.getOpcode(), DL, VT,
- ConcatSubOperand(VT, Ops, 0),
- ConcatSubOperand(VT, Ops, 1));
+ SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+ SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+ if (Concat0 || Concat1)
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+ Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
}
break;
// Due to VADD, VSUB, VMUL can executed on more ports than VINSERT and
@@ -58354,12 +58376,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
- if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
- (VT.is256BitVector() ||
- (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
- return DAG.getNode(Op0.getOpcode(), DL, VT,
- ConcatSubOperand(VT, Ops, 0),
- ConcatSubOperand(VT, Ops, 1));
+ if (!IsSplat && (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+ SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+ SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+ if (Concat0 || Concat1)
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+ Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
}
break;
case ISD::FDIV:
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 7a5819c2978ae..1ee03c5f1223f 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1OR2,AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX1OR2,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
;
; Basic matrix multiply tests based on the pattern:
@@ -117,22 +117,38 @@ define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwin
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_mul2x2_f64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
-; AVX-NEXT: vmulpd %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,1]
-; AVX-NEXT: vmulpd %xmm4, %xmm2, %xmm4
-; AVX-NEXT: vaddpd %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; AVX-NEXT: vmulpd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,1]
-; AVX-NEXT: vmulpd %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_mul2x2_f64:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mul2x2_f64:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_mul2x2_f64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX512-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX512-NEXT: vmulpd %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
entry:
%split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 0, i32 1>
%split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 2, i32 3>
@@ -958,227 +974,58 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
; SSE-NEXT: movaps %xmm5, %xmm2
; SSE-NEXT: retq
;
-; AVX1-LABEL: test_mul4x4_f32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm7, %xmm5, %xmm7
-; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm7, %xmm4, %xmm7
-; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vaddps %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm3[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm4, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vaddps %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_mul4x4_f32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vbroadcastss %xmm2, %xmm6
-; AVX2-NEXT: vmulps %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm7, %xmm5, %xmm7
-; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm7, %xmm1, %xmm7
-; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm7, %xmm4, %xmm7
-; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX2-NEXT: vbroadcastss %xmm2, %xmm7
-; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm2, %xmm4, %xmm2
-; AVX2-NEXT: vaddps %xmm2, %xmm7, %xmm2
-; AVX2-NEXT: vbroadcastss %xmm3, %xmm7
-; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm4, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vbroadcastss %xmm3, %xmm8
-; AVX2-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm5
-; AVX2-NEXT: vaddps %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: test_mul4x4_f32:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT: vbroadcastss %xmm1, %xmm5
-; AVX512F-NEXT: vmulps %xmm5, %xmm0, %xmm5
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm6, %xmm4, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm6, %xmm3, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX512F-NEXT: vbroadcastss %xmm6, %xmm7
-; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm4, %xmm8, %xmm8
-; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm3, %xmm8, %xmm8
-; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm7, %xmm6
-; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm7
-; AVX512F-NEXT: vbroadcastss %xmm7, %xmm8
-; AVX512F-NEXT: vmulps %xmm0, %xmm8, %xmm8
-; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm4, %xmm9, %xmm9
-; AVX512F-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9
-; AVX512F-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm7, %xmm2, %xmm7
-; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm1
-; AVX512F-NEXT: vbroadcastss %xmm1, %xmm8
-; AVX512F-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm4, %xmm8, %xmm4
-; AVX512F-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vaddps %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm1
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: retq
+; AVX1OR2-LABEL: test_mul4x4_f32:
+; AVX1OR2: # %bb.0: # %entry
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX1OR2-NEXT: vmulps %ymm4, %ymm5, %ymm4
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT: vmulps %ymm0, %ymm6, %ymm0
+; AVX1OR2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT: vmulps %ymm7, %ymm4, %ymm7
+; AVX1OR2-NEXT: vaddps %ymm7, %ymm0, %ymm0
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX1OR2-NEXT: vmulps %ymm2, %ymm1, %ymm2
+; AVX1OR2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT: vmulps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT: vmulps %ymm5, %ymm6, %ymm5
+; AVX1OR2-NEXT: vaddps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT: vmulps %ymm5, %ymm4, %ymm4
+; AVX1OR2-NEXT: vaddps %ymm4, %ymm2, %ymm2
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1OR2-NEXT: retq
;
-; AVX512VL-LABEL: test_mul4x4_f32:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm4
-; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm5
-; AVX512VL-NEXT: vmulps %xmm5, %xmm0, %xmm5
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm2, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm3, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX512VL-NEXT: vbroadcastss %xmm6, %xmm7
-; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm8, %xmm8
-; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm3, %xmm8, %xmm8
-; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm7, %xmm6
-; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm7
-; AVX512VL-NEXT: vbroadcastss %xmm7, %xmm8
-; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm8
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm9, %xmm9
-; AVX512VL-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm3, %xmm9, %xmm9
-; AVX512VL-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512VL-NEXT: vmulps %xmm7, %xmm4, %xmm7
-; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vextractf32x4 $3, %zmm1, %xmm1
-; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm8
-; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm8, %xmm2
-; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT: ...
[truncated]
                if (Concat0 || Concat1)
                  return DAG.getNode(Op0.getOpcode(), DL, VT,
                                     Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
                                     Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
Seems the behavior changed if both Concat0 and Concat1 are false?
Yes - this case was always concatenating the binops even if there wasn't any benefit (there are a few similar ones to this that also need addressing) - I'm happy to remove this handling for now and just start with the FADD/FSUB/FMUL case to simplify the patch?
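For readers following this exchange, here is a minimal, self-contained sketch (plain C++, not LLVM code; all names are illustrative stand-ins) of the control-flow difference being discussed: the old path always materialized the wide binop via the unconditional ConcatSubOperand helper, whereas the new guard gives up when neither recursive CombineSubOperand call finds a profitable concatenation.

// Simplified model of the guard under discussion (hypothetical names, not the real API).
#include <iostream>
#include <optional>
#include <string>

using Node = std::optional<std::string>; // stand-in for SDValue; empty == no combine found

// Models CombineSubOperand: succeeds only when recursion finds a profitable concat.
Node combineSubOperand(bool Profitable, const std::string &Name) {
  return Profitable ? Node(Name + "_concat") : std::nullopt;
}

// Models ConcatSubOperand: unconditionally materializes the concatenation.
std::string concatSubOperand(const std::string &Name) {
  return Name + "_forced_concat";
}

// New behaviour: return "no combine" when neither operand is profitable,
// instead of always emitting the concatenated binop as the old code did.
Node tryConcatBinop(bool Op0Profitable, bool Op1Profitable) {
  Node Concat0 = combineSubOperand(Op0Profitable, "op0");
  Node Concat1 = combineSubOperand(Op1Profitable, "op1");
  if (Concat0 || Concat1)
    return "binop(" + (Concat0 ? *Concat0 : concatSubOperand("op0")) + ", " +
           (Concat1 ? *Concat1 : concatSubOperand("op1")) + ")";
  return std::nullopt;
}

int main() {
  std::cout << tryConcatBinop(true, false).value_or("<no combine>") << '\n';
  std::cout << tryConcatBinop(false, false).value_or("<no combine>") << '\n';
}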
[X86] combineConcatVectorOps - recursively call combineConcatVectorOps instead of predicting when ops will freely concat

The IsConcatFree helper relies on limited estimates of where concatenating the subvector operands is beneficial. This patch replaces the FADD/FSUB/FMUL concatenation checks with a recursive call to combineConcatVectorOps to see whether it will profitably concatenate. Other opcodes can be moved to using the CombineSubOperand helper in future patches.

Force-pushed from 7ecabfc to 3bd273d.
@phoebewang I've restricted this to FADD/FSUB/FMUL instructions for the initial patch (ADD/SUB/MUL will still need test coverage in a later PR).
phoebewang left a comment:
LGTM.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/12660

Here is the relevant piece of the build log for the reference.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/4/builds/5495

Here is the relevant piece of the build log for the reference.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/2/builds/18777

Here is the relevant piece of the build log for the reference.