Skip to content

Commit 2e913a8

Browse files
committed
[X86] Remove single-use checks when combining xor and vfmulc/vcfmulc.
The current implementation of the combine that folds xor-based conjugation patterns into complex multiplies will not perform the transformation when either the conjugating xor's result or the other multiplicand has additional uses. This change eliminates both single-use checks. The check on the xor result is not required: even if the conjugated value is needed elsewhere, the transformation removes this multiply's dependence on it, so the xor simply remains for its other users. The check on the other multiplicand is not required for correctness and has no apparent performance implications.
1 parent 1d583ed commit 2e913a8

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53621,9 +53621,9 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
 53621 53621    int CombineOpcode =
 53622 53622        N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
 53623 53623    auto combineConjugation = [&](SDValue &r) {
 53624      -     if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
       53624+     if (LHS->getOpcode() == ISD::BITCAST) {
 53625 53625        SDValue XOR = LHS.getOperand(0);
 53626      -       if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
       53626+       if (XOR->getOpcode() == ISD::XOR) {
 53627 53627          KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
 53628 53628          if (XORRHS.isConstant()) {
 53629 53629            APInt ConjugationInt32 = APInt(32, 0x80000000);

llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,26 @@ entry:
 83   83     ret <32 x half> %3
 84   84   }
 85   85
      86+  define dso_local <32 x half> @test6(<16 x i32> %a) local_unnamed_addr #0 {
      87+  ; CHECK-LABEL: test6:
      88+  ; CHECK:       # %bb.0: # %entry
      89+  ; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
      90+  ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
      91+  ; CHECK-NEXT:    vfcmulcph %zmm0, %zmm3, %zmm1
      92+  ; CHECK-NEXT:    vfcmaddcph %zmm0, %zmm2, %zmm1
      93+  ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
      94+  ; CHECK-NEXT:    retq
      95+  entry:
      96+    %0 = xor <16 x i32> %a, splat (i32 -2147483648)
      97+    %1 = bitcast <16 x i32> %0 to <16 x float>
      98+    %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
      99+    %3 = bitcast <16 x float> %2 to <32 x half>
     100+    %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> zeroinitializer, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
     101+    %5 = bitcast <16 x float> %4 to <32 x half>
     102+    %6 = fadd <32 x half> %3, %5
     103+    ret <32 x half> %6
     104+  }
     105+
 86  106   declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 87  107   declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 88  108   declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

0 commit comments

Comments
 (0)