Skip to content

Conversation

@RKSimon
Copy link
Collaborator

@RKSimon RKSimon commented Mar 7, 2025

The IsConcatFree helper is limited to estimates of where concatenating the subvector operands is beneficial; this patch replaces the FADD/FSUB/FMUL concatenation checks with a recursive call to combineConcatVectorOps to see if it will profitably concatenate.

Other opcodes can be moved to using the CombineSubOperand helper in future patches.

@llvmbot
Copy link
Member

llvmbot commented Mar 7, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

The IsConcatFree helper is limited to estimates of where concatenating the subvector operands is beneficial; this patch replaces the (F)ADD/SUB/MUL concatenation checks with a recursive call to combineConcatVectorOps to see if it will profitably concatenate.

Other opcodes can be moved to using the CombineSubOperand helper in future patches.


Patch is 29.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130275.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+35-11)
  • (modified) llvm/test/CodeGen/X86/matrix-multiply.ll (+123-347)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 57a4c6f7a4869..4821459d7dba8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41903,7 +41903,8 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
-                                      const X86Subtarget &Subtarget);
+                                      const X86Subtarget &Subtarget,
+                                      unsigned Depth = 0);
 
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
@@ -57791,7 +57792,8 @@ CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
-                                      const X86Subtarget &Subtarget) {
+                                      const X86Subtarget &Subtarget,
+                                      unsigned Depth) {
   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
 
@@ -57803,6 +57805,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       }))
     return getZeroVector(VT, Subtarget, DAG, DL);
 
+  if (Depth >= SelectionDAG::MaxRecursionDepth)
+    return SDValue(); // Limit search depth.
+
   SDValue Op0 = Ops[0];
   bool IsSplat = llvm::all_equal(Ops);
   unsigned NumOps = Ops.size();
@@ -57933,6 +57938,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       }
       return AllConstants || AllSubs;
     };
+    auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
+      bool AllConstants = true;
+      SmallVector<SDValue> Subs;
+      for (SDValue SubOp : SubOps) {
+        SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
+        AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
+                        ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
+        Subs.push_back(SubOp.getOperand(I));
+      }
+      if (AllConstants)
+        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
+      return combineConcatVectorOps(DL, VT, Subs, DAG, DCI, Subtarget,
+                                    Depth + 1);
+    };
 
     switch (Op0.getOpcode()) {
     case ISD::VECTOR_SHUFFLE: {
@@ -58343,9 +58362,12 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
                        (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
                         (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
-        return DAG.getNode(Op0.getOpcode(), DL, VT,
-                           ConcatSubOperand(VT, Ops, 0),
-                           ConcatSubOperand(VT, Ops, 1));
+        SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+        SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+        if (Concat0 || Concat1)
+          return DAG.getNode(Op0.getOpcode(), DL, VT,
+                             Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+                             Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
       }
       break;
     // Due to VADD, VSUB, VMUL can executed on more ports than VINSERT and
@@ -58354,12 +58376,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     case ISD::FADD:
     case ISD::FSUB:
     case ISD::FMUL:
-      if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
-          (VT.is256BitVector() ||
-           (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
-        return DAG.getNode(Op0.getOpcode(), DL, VT,
-                           ConcatSubOperand(VT, Ops, 0),
-                           ConcatSubOperand(VT, Ops, 1));
+      if (!IsSplat && (VT.is256BitVector() ||
+                       (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+        SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+        SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+        if (Concat0 || Concat1)
+          return DAG.getNode(Op0.getOpcode(), DL, VT,
+                             Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+                             Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
       }
       break;
     case ISD::FDIV:
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 7a5819c2978ae..1ee03c5f1223f 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1OR2,AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX1OR2,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
 
 ;
 ; Basic matrix multiply tests based on the pattern:
@@ -117,22 +117,38 @@ define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwin
 ; SSE-NEXT:    movapd %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_mul2x2_f64:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
-; AVX-NEXT:    vmulpd %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,1]
-; AVX-NEXT:    vmulpd %xmm4, %xmm2, %xmm4
-; AVX-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; AVX-NEXT:    vmulpd %xmm4, %xmm0, %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,1]
-; AVX-NEXT:    vmulpd %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_mul2x2_f64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vmulpd %ymm2, %ymm3, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_mul2x2_f64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vmulpd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_mul2x2_f64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX512-NEXT:    vmulpd %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    retq
 entry:
   %split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 0, i32 1>
   %split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 2, i32 3>
@@ -958,227 +974,58 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
 ; SSE-NEXT:    movaps %xmm5, %xmm2
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_mul4x4_f32:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm7, %xmm5, %xmm7
-; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm7, %xmm1, %xmm7
-; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm7, %xmm4, %xmm7
-; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vaddps %xmm2, %xmm7, %xmm2
-; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm3[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm4, %xmm8, %xmm8
-; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[0,0,0,0]
-; AVX1-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm5
-; AVX1-NEXT:    vaddps %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX1-NEXT:    vmulps %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vmulps %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_mul4x4_f32:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vbroadcastss %xmm2, %xmm6
-; AVX2-NEXT:    vmulps %xmm6, %xmm0, %xmm6
-; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm7, %xmm5, %xmm7
-; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm7, %xmm1, %xmm7
-; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm7, %xmm4, %xmm7
-; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vbroadcastss %xmm2, %xmm7
-; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm2, %xmm4, %xmm2
-; AVX2-NEXT:    vaddps %xmm2, %xmm7, %xmm2
-; AVX2-NEXT:    vbroadcastss %xmm3, %xmm7
-; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm4, %xmm8, %xmm8
-; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX2-NEXT:    vbroadcastss %xmm3, %xmm8
-; AVX2-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm5
-; AVX2-NEXT:    vaddps %xmm5, %xmm0, %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX2-NEXT:    vmulps %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX2-NEXT:    vmulps %xmm1, %xmm4, %xmm1
-; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test_mul4x4_f32:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm5
-; AVX512F-NEXT:    vmulps %xmm5, %xmm0, %xmm5
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm4, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm3, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX512F-NEXT:    vbroadcastss %xmm6, %xmm7
-; AVX512F-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm8
-; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm3, %xmm8, %xmm8
-; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT:    vaddps %xmm6, %xmm7, %xmm6
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
-; AVX512F-NEXT:    vbroadcastss %xmm7, %xmm8
-; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm8
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm9, %xmm9
-; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm3, %xmm9, %xmm9
-; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm7, %xmm2, %xmm7
-; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
-; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm8
-; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm4
-; AVX512F-NEXT:    vaddps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
-; AVX512F-NEXT:    vmulps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vaddps %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX512F-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX512F-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm1
-; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    retq
+; AVX1OR2-LABEL: test_mul4x4_f32:
+; AVX1OR2:       # %bb.0: # %entry
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX1OR2-NEXT:    vmulps %ymm4, %ymm5, %ymm4
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm6
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT:    vmulps %ymm0, %ymm6, %ymm0
+; AVX1OR2-NEXT:    vaddps %ymm4, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT:    vmulps %ymm7, %ymm4, %ymm7
+; AVX1OR2-NEXT:    vaddps %ymm7, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX1OR2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; AVX1OR2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT:    vmulps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT:    vmulps %ymm5, %ymm6, %ymm5
+; AVX1OR2-NEXT:    vaddps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT:    vmulps %ymm5, %ymm4, %ymm4
+; AVX1OR2-NEXT:    vaddps %ymm4, %ymm2, %ymm2
+; AVX1OR2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; AVX1OR2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1OR2-NEXT:    retq
 ;
-; AVX512VL-LABEL: test_mul4x4_f32:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; AVX512VL-NEXT:    vextractf32x4 $3, %zmm0, %xmm4
-; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm5
-; AVX512VL-NEXT:    vmulps %xmm5, %xmm0, %xmm5
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm2, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm3, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX512VL-NEXT:    vbroadcastss %xmm6, %xmm7
-; AVX512VL-NEXT:    vmulps %xmm7, %xmm0, %xmm7
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm8
-; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm3, %xmm8, %xmm8
-; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT:    vaddps %xmm6, %xmm7, %xmm6
-; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
-; AVX512VL-NEXT:    vbroadcastss %xmm7, %xmm8
-; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm8
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm9, %xmm9
-; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm3, %xmm9, %xmm9
-; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512VL-NEXT:    vmulps %xmm7, %xmm4, %xmm7
-; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
-; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm8
-; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm2
-; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2]
-; AVX512VL-NEXT:    vmulps %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT:  ...
[truncated]

Comment on lines 58367 to 58370
if (Concat0 || Concat1)
return DAG.getNode(Op0.getOpcode(), DL, VT,
Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems the behavior changed if both Concat0 and Concat1 are false?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes - this case was always concatenating the binops even if there wasn't any benefit (there are a few similar ones to this that also need addressing) - I'm happy to remove this handling for now and just start with the FADD/FSUB/FMUL case to simplify the patch?

…s instead of predicting when ops will freely concat

The IsConcatFree helper is limited to estimates of where concatenating the subvector operands is beneficial; this patch replaces the FADD/FSUB/FMUL concatenation checks with a recursive call to combineConcatVectorOps to see if it will profitably concatenate.

Other opcodes can be moved to using the CombineSubOperand helper in future patches.
@RKSimon RKSimon force-pushed the x86-concat-ops-recurse branch from 7ecabfc to 3bd273d Compare March 7, 2025 15:24
@RKSimon
Copy link
Collaborator Author

RKSimon commented Mar 7, 2025

@phoebewang I've restricted this to FADD/FSUB/FMUL instructions for the initial patch (ADD/SUB/MUL will still need test coverage in a later PR).

Copy link
Contributor

@phoebewang phoebewang left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@RKSimon RKSimon merged commit 73e14de into llvm:main Mar 8, 2025
8 of 11 checks passed
@RKSimon RKSimon deleted the x86-concat-ops-recurse branch March 8, 2025 11:50
@llvm-ci
Copy link
Collaborator

llvm-ci commented Mar 8, 2025

LLVM Buildbot has detected a new failure on builder lld-x86_64-ubuntu-fast running on as-builder-4 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/12660

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM-Unit :: Support/./SupportTests/48/88' FAILED ********************
Script(shard):
--
GTEST_OUTPUT=json:/home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/build/unittests/Support/./SupportTests-LLVM-Unit-1932613-48-88.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=88 GTEST_SHARD_INDEX=48 /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/build/unittests/Support/./SupportTests
--

Note: This is test shard 49 of 88.
[==========] Running 16 tests from 16 test suites.
[----------] Global test environment set-up.
[----------] 1 test from CPUAlignBuildAttr
[ RUN      ] CPUAlignBuildAttr.testBuildAttr
[       OK ] CPUAlignBuildAttr.testBuildAttr (0 ms)
[----------] 1 test from CPUAlignBuildAttr (0 ms total)

[----------] 1 test from Caching
[ RUN      ] Caching.NoCommit
/home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/llvm-project/llvm/unittests/Support/Caching.cpp:145: Failure
Value of: bool(FileOrErr)
  Actual: false
Expected: true

Expected<T> must be checked before access or destruction.
Unchecked Expected<T> contained error:
No such file or directory: LLVMTestCache: Can't get a temporary fileStack dump without symbol names (ensure you have llvm-symbolizer in your PATH or set the environment var `LLVM_SYMBOLIZER_PATH` to point to it):
0  SupportTests 0x00005597b4336050
1  SupportTests 0x00005597b433349f
2  SupportTests 0x00005597b43335ea
3  libc.so.6    0x00007f64f003d520
4  libc.so.6    0x00007f64f00919fc pthread_kill + 300
5  libc.so.6    0x00007f64f003d476 raise + 22
6  libc.so.6    0x00007f64f00237f3 abort + 211
7  SupportTests 0x00005597b3dd2c1e
8  SupportTests 0x00005597b3dd6740
9  SupportTests 0x00005597b4357633
10 SupportTests 0x00005597b4362472
11 SupportTests 0x00005597b4362a11
12 SupportTests 0x00005597b436e24a
13 SupportTests 0x00005597b435672f
14 SupportTests 0x00005597b3d3a2ca
15 libc.so.6    0x00007f64f0024d90
16 libc.so.6    0x00007f64f0024e40 __libc_start_main + 128
17 SupportTests 0x00005597b3d3a925

--
exit: -6
--
shard JSON output does not exist: /home/buildbot/worker/as-builder-4/ramdisk/lld-x86_64/build/unittests/Support/./SupportTests-LLVM-Unit-1932613-48-88.json
********************


@llvm-ci
Copy link
Collaborator

llvm-ci commented Mar 8, 2025

LLVM Buildbot has detected a new failure on builder clang-aarch64-sve-vls-2stage running on linaro-g3-03 while building llvm at step 12 "ninja check 2".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/4/builds/5495

Here is the relevant piece of the build log for the reference
Step 12 (ninja check 2) failure: stage 2 checked (failure)
******************** TEST 'LLVM-Unit :: Support/./SupportTests/48/88' FAILED ********************
Script(shard):
--
GTEST_OUTPUT=json:/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/unittests/Support/./SupportTests-LLVM-Unit-1372265-48-88.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=88 GTEST_SHARD_INDEX=48 /home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/unittests/Support/./SupportTests
--

Script:
--
/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/stage2/unittests/Support/./SupportTests --gtest_filter=Caching.NoCommit
--
/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/llvm/unittests/Support/Caching.cpp:149: Failure
Value of: llvm::detail::TakeError(CFS->commit())
Expected: succeeded
  Actual: failed  (Failed to rename temporary file /tmp/lit-tmp-0egekfpk/llvm_test_cache/LLVMTest-742b3d.tmp.o to /tmp/lit-tmp-0egekfpk/llvm_test_cache/llvmcache-foo: No such file or directory
)


/home/tcwg-buildbot/worker/clang-aarch64-sve-vls-2stage/llvm/llvm/unittests/Support/Caching.cpp:149
Value of: llvm::detail::TakeError(CFS->commit())
Expected: succeeded
  Actual: failed  (Failed to rename temporary file /tmp/lit-tmp-0egekfpk/llvm_test_cache/LLVMTest-742b3d.tmp.o to /tmp/lit-tmp-0egekfpk/llvm_test_cache/llvmcache-foo: No such file or directory
)



********************


@llvm-ci
Copy link
Collaborator

llvm-ci commented Mar 30, 2025

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-win-fast running on as-builder-3 while building llvm at step 7 "test-build-unified-tree-check-llvm-unit".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/2/builds/18777

Here is the relevant piece of the build log for the reference
Step 7 (test-build-unified-tree-check-llvm-unit) failure: test (failure)
******************** TEST 'LLVM-Unit :: Support/./SupportTests.exe/49/86' FAILED ********************
Script(shard):
--
GTEST_OUTPUT=json:C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\build\unittests\Support\.\SupportTests.exe-LLVM-Unit-1468-49-86.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=86 GTEST_SHARD_INDEX=49 C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\build\unittests\Support\.\SupportTests.exe
--

Script:
--
C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\build\unittests\Support\.\SupportTests.exe --gtest_filter=Caching.NoCommit
--
C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\llvm-project\llvm\unittests\Support\Caching.cpp(142): error: Value of: AddStream
  Actual: false
Expected: true


C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\llvm-project\llvm\unittests\Support\Caching.cpp:142
Value of: AddStream
  Actual: false
Expected: true



********************


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants