Skip to content

Commit 5981a65

Browse files
committed
[X86] Remove single-use checks when combining xor and vfmulc/vfcmulc.
The current implementation to combine xor patterns for conjugation with complex multiplies will not perform the transformation when either the conjugate xor result or other multiplicand have other uses. This change eliminates both single-use checks. The xor result check isn't required as even if the conjugate result is needed elsewhere, the transformation eliminates the dependence. The check of the other multiplicand isn't required for correctness and has no apparent performance implications.
1 parent 4396237 commit 5981a65

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53633,9 +53633,9 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
   int CombineOpcode =
       N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
   auto combineConjugation = [&](SDValue &r) {
-    if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
+    if (LHS->getOpcode() == ISD::BITCAST) {
       SDValue XOR = LHS.getOperand(0);
-      if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
+      if (XOR->getOpcode() == ISD::XOR) {
         KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
         if (XORRHS.isConstant()) {
           APInt ConjugationInt32 = APInt(32, 0x80000000);

llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,26 @@ entry:
   ret <32 x half> %3
 }
 
+define dso_local <32 x half> @test6(<16 x i32> %a) local_unnamed_addr #0 {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vfcmulcph %zmm0, %zmm3, %zmm1
+; CHECK-NEXT:    vfcmaddcph %zmm0, %zmm2, %zmm1
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = xor <16 x i32> %a, splat (i32 -2147483648)
+  %1 = bitcast <16 x i32> %0 to <16 x float>
+  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> splat (float 1.000000e+00), <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %3 = bitcast <16 x float> %2 to <32 x half>
+  %4 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> zeroinitializer, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  %5 = bitcast <16 x float> %4 to <32 x half>
+  %6 = fadd <32 x half> %3, %5
+  ret <32 x half> %6
+}
+
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

0 commit comments

Comments
 (0)