Skip to content

Commit 2e913a8

Browse files
committed
[X86] Remove single-use checks when combining xor and vfmulc/vcfmulc.
The current implementation of the combine that folds xor-based conjugation patterns into complex multiplies will not perform the transformation when either the conjugating xor's result or the other multiplicand has additional uses. This change eliminates both single-use checks. The check on the xor result is not required: even if the conjugated value is needed elsewhere, the transformation removes this multiply's dependence on it, so the xor simply remains for its other users. The check on the other multiplicand is not required for correctness and has no apparent performance implications.
1 parent 1d583ed commit 2e913a8

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53621,9 +53621,9 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
 53621 53621    int CombineOpcode =
 53622 53622        N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
 53623 53623    auto combineConjugation = [&](SDValue &r) {
 53624      -     if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
       53624+     if (LHS->getOpcode() == ISD::BITCAST) {
 53625 53625        SDValue XOR = LHS.getOperand(0);
 53626      -       if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
       53626+       if (XOR->getOpcode() == ISD::XOR) {
 53627 53627          KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
 53628 53628          if (XORRHS.isConstant()) {
 53629 53629            APInt ConjugationInt32 = APInt(32, 0x80000000);

llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,26 @@ entry:
 83   83     ret <32 x half> %3
 84   84   }
 85   85
      86+  define dso_local <32 x half> @test6(<16 x i32> %a) local_unnamed_addr #0 {
      87+  ; CHECK-LABEL: test6:
      88+  ; CHECK:       # %bb.0: # %entry
      89+  ; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
      90+  ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
      91+  ; CHECK-NEXT:    vfcmulcph %zmm0, %zmm3, %zmm1
      92+  ; CHECK-NEXT:    vfcmaddcph %zmm0, %zmm2, %zmm1
      93+  ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
      94+  ; CHECK-NEXT:    retq
      95+  entry:
      96+    %0 = xor <16 x i32> %a, splat (i32 -2147483648)
      97+    %1 = bitcast <16 x i32> %0 to <16 x float>
      98+    %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
      99+    %3 = bitcast <16 x float> %2 to <32 x half>
     100+    %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> zeroinitializer, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
     101+    %5 = bitcast <16 x float> %4 to <32 x half>
     102+    %6 = fadd <32 x half> %3, %5
     103+    ret <32 x half> %6
     104+  }
     105+
 86  106   declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 87  107   declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 88  108   declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

0 commit comments

Comments
 (0)