Skip to content

Commit bfb34a2

Browse files
committed
Apply review feedback, simplify, add more tests
1 parent d3581fa commit bfb34a2

File tree

2 files changed

+126
-48
lines changed

2 files changed

+126
-48
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
#include "llvm/ADT/StringSwitch.h"
2828
#include "llvm/Analysis/BlockFrequencyInfo.h"
2929
#include "llvm/Analysis/ProfileSummaryInfo.h"
30+
#include "llvm/Analysis/ValueTracking.h"
3031
#include "llvm/Analysis/VectorUtils.h"
3132
#include "llvm/CodeGen/IntrinsicLowering.h"
3233
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -57966,47 +57967,28 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
5796657967
Cmov.getOperand(3));
5796757968
}
5796857969

57969-
static SDValue matchIntegerMultiplyAdd(SDNode *N, SelectionDAG &DAG,
57970-
SDValue Op0, SDValue Op1,
57971-
const SDLoc &DL, EVT VT,
57972-
const X86Subtarget &Subtarget) {
57970+
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
57971+
EVT VT, const X86Subtarget &Subtarget) {
5797357972
using namespace SDPatternMatch;
57974-
if (!VT.isVector() || VT.getScalarType() != MVT::i64 ||
57975-
!Subtarget.hasAVX512() ||
57976-
(!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()) ||
57977-
!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(X86ISD::VPMADD52L,
57978-
VT) ||
57979-
Op0.getValueType() != VT || Op1.getValueType() != VT)
57973+
if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
57974+
(!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
57975+
return SDValue();
57976+
57977+
// Need AVX-512VL vector length extensions if operating on XMM/YMM registers
57978+
if (!Subtarget.hasVLX() && VT.getSizeInBits() < 512)
5798057979
return SDValue();
5798157980

5798257981
SDValue X, Y, Acc;
5798357982
if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
5798457983
return SDValue();
5798557984

57986-
auto CheckMulOperand = [&DAG, &VT](const SDValue &M, SDValue &Xval,
57987-
SDValue &Yval) -> bool {
57988-
if (M.getOpcode() != ISD::MUL)
57989-
return false;
57990-
const SDValue A = M.getOperand(0);
57991-
const SDValue B = M.getOperand(1);
57992-
const APInt Top12Set = APInt::getHighBitsSet(64, 12);
57993-
if (A.getValueType() != VT || B.getValueType() != VT ||
57994-
!DAG.MaskedValueIsZero(A, Top12Set) ||
57995-
!DAG.MaskedValueIsZero(B, Top12Set) ||
57996-
!DAG.MaskedValueIsZero(M, Top12Set))
57997-
return false;
57998-
Xval = A;
57999-
Yval = B;
58000-
return true;
58001-
};
58002-
58003-
if (CheckMulOperand(Op0, X, Y)) {
58004-
Acc = Op1;
58005-
} else if (CheckMulOperand(Op1, X, Y)) {
58006-
Acc = Op0;
58007-
} else {
57985+
KnownBits KnownX = DAG.computeKnownBits(X);
57986+
KnownBits KnownY = DAG.computeKnownBits(Y);
57987+
KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
57988+
if (KnownX.countMinLeadingZeros() < 12 ||
57989+
KnownY.countMinLeadingZeros() < 12 ||
57990+
KnownMul.countMinLeadingZeros() < 12)
5800857991
return SDValue();
58009-
}
5801057992

5801157993
return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Acc, X, Y);
5801257994
}
@@ -58114,10 +58096,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
5811458096
Op0.getOperand(0), Op0.getOperand(2));
5811558097
}
5811658098

58117-
if (SDValue node =
58118-
matchIntegerMultiplyAdd(N, DAG, Op0, Op1, DL, VT, Subtarget)) {
58119-
return node;
58120-
}
58099+
if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58100+
return IFMA52;
5812158101

5812258102
return combineAddOrSubToADCOrSBB(N, DL, DAG);
5812358103
}

llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll

Lines changed: 109 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=X64
2+
; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
33

44
; 67108863 == (1 << 26) - 1
5+
; 4503599627370496 == (1 << 52)
6+
; 4503599627370495 == (1 << 52) - 1
57

68
define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
79
; X64-LABEL: test_512_combine_evex:
@@ -22,14 +24,16 @@ define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64
2224
ret <8 x i64> %8
2325
}
2426

25-
define dso_local <8 x i64> @fff(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
26-
%4 = and <8 x i64> %0, splat (i64 67108863)
27-
%5 = and <8 x i64> %1, splat (i64 67108863)
28-
%6 = and <8 x i64> %2, splat (i64 67108863)
27+
define dso_local <8 x i64> @test_512_no_combine_evex_v2(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
28+
; X64-LABEL: test_512_no_combine_evex_v2:
29+
; X64-NOT: vpmadd52luq
30+
; X64: retq
31+
%4 = and <8 x i64> %0, splat (i64 4503599627370495)
32+
%5 = and <8 x i64> %1, splat (i64 4503599627370495)
33+
%6 = and <8 x i64> %2, splat (i64 4503599627370495)
2934
%7 = mul nuw nsw <8 x i64> %5, %4
30-
%8 = mul nuw nsw <8 x i64> %7, %6
31-
%9 = add nuw nsw <8 x i64> %8, %7
32-
ret <8 x i64> %9
35+
%8 = add nuw nsw <8 x i64> %7, %6
36+
ret <8 x i64> %8
3337
}
3438

3539
define dso_local noundef <8 x i64> @test_512_no_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
@@ -106,6 +110,100 @@ define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0
106110
ret <4 x i64> %5
107111
}
108112

109-
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="512" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
110-
attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
111-
attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
113+
define i64 @scalar_no_ifma(i64 %a, i64 %b, i64 %acc) #0 {
114+
; X64-LABEL: scalar_no_ifma:
115+
; X64-NOT: vpmadd52
116+
; X64-NOT: vpmullq
117+
; X64: imulq
118+
; X64: ret
119+
entry:
120+
%mul = mul i64 %a, %b
121+
%res = add i64 %acc, %mul
122+
ret i64 %res
123+
}
124+
125+
define <8 x i64> @mixed_width_too_wide(<8 x i64> %a, <8 x i64> %b, <8 x i64> %acc) #0 {
126+
; X64-LABEL: mixed_width_too_wide:
127+
; X64-NOT: vpmadd52luq
128+
; X64: vpmullq
129+
; X64: ret
130+
entry:
131+
; 40-bit and 13-bit, product fits < 2^53 (NOT < 2^52)
132+
%a40 = and <8 x i64> %a, splat (i64 1099511627775)
133+
%b13 = and <8 x i64> %b, splat (i64 8191)
134+
%mul = mul <8 x i64> %a40, %b13
135+
%res = add <8 x i64> %acc, %mul
136+
ret <8 x i64> %res
137+
}
138+
139+
define <8 x i64> @zext32_inputs_not_safe(<8 x i32> %ai32, <8 x i32> %bi32, <8 x i64> %acc) #0 {
140+
; X64-LABEL: zext32_inputs_not_safe:
141+
; X64: vpmul
142+
; X64-NOT: vpmadd52luq
143+
; X64: ret
144+
entry:
145+
%a = zext <8 x i32> %ai32 to <8 x i64>
146+
%b = zext <8 x i32> %bi32 to <8 x i64>
147+
%mul = mul <8 x i64> %a, %b
148+
%res = add <8 x i64> %acc, %mul
149+
ret <8 x i64> %res
150+
}
151+
152+
define <8 x i64> @const_2pow51_times_2(<8 x i64> %acc) #0 {
153+
; X64-LABEL: const_2pow51_times_2:
154+
; X64-NOT: vpmadd52luq
155+
; X64: vpaddq
156+
; X64: ret
157+
entry:
158+
%a = insertelement <8 x i64> undef, i64 2251799813685248, i32 0 ; 2^51
159+
%a.s = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> splat (i32 0)
160+
%b = insertelement <8 x i64> undef, i64 2, i32 0
161+
%b.s = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> splat (i32 0)
162+
%mul = mul <8 x i64> %a.s, %b.s ; product = 2^52
163+
%res = add <8 x i64> %acc, %mul ; needs full low-64 add
164+
ret <8 x i64> %res
165+
}
166+
167+
define <4 x i64> @safe_ifma_v4(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #1 {
168+
; X64-LABEL: safe_ifma_v4:
169+
; X64: vpmadd52luq
170+
; X64-NOT: vpmullq
171+
; X64: ret
172+
entry:
173+
%a26 = and <4 x i64> %a, splat (i64 67108863)
174+
%b26 = and <4 x i64> %b, splat (i64 67108863)
175+
%mul = mul <4 x i64> %a26, %b26
176+
%res = add <4 x i64> %acc, %mul
177+
ret <4 x i64> %res
178+
}
179+
180+
define <2 x i64> @safe_ifma_v2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %acc) #1 {
181+
; X64-LABEL: safe_ifma_v2:
182+
; X64: vpmadd52luq
183+
; X64-NOT: vpmullq
184+
; X64: ret
185+
entry:
186+
%a26 = and <2 x i64> %a, splat (i64 67108863)
187+
%b26 = and <2 x i64> %b, splat (i64 67108863)
188+
%mul = mul <2 x i64> %a26, %b26
189+
%res = add <2 x i64> %acc, %mul
190+
ret <2 x i64> %res
191+
}
192+
193+
define <4 x i64> @v4_no_vl_fallback(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #0 {
194+
; X64-LABEL: v4_no_vl_fallback:
195+
; X64-NOT: vpmadd52luq
196+
; X64: pmul
197+
; X64: ret
198+
entry:
199+
%a26 = and <4 x i64> %a, splat (i64 67108863)
200+
%b26 = and <4 x i64> %b, splat (i64 67108863)
201+
%mul = mul <4 x i64> %a26, %b26
202+
%res = add <4 x i64> %acc, %mul
203+
ret <4 x i64> %res
204+
}
205+
206+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
207+
attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
208+
attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
209+
attributes #3 = { "target-features"="+avx512dq,+avx512f,+avx512ifma,+avx512vl,-evex512" }

0 commit comments

Comments
 (0)