Commit 89bae92

Refactor X86ISelLowering.cpp and add test checks
Address feedback:
- Removed the ValueTracking.h include (sorry, I think CLion auto-added it and I missed it)
- Removed the leftover "// if (0) {" comment
- Added a !Subtarget.hasAVXIFMA() guard so this combine is not missed for 128/256-bit vectors on AVX-IFMA targets
- Added a message to the assert
- Changed the X86ISD::VPMADD52L call to reflect the different X86 operand order
- Added test CHECK lines generated by utils/update_llc_test_checks.py
1 parent 1798672 commit 89bae92
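For context, a minimal IR sketch of the pattern the combine described above targets (this is not part of the patch; the function name and constants are illustrative and mirror the test_512_combine case added below). Both multiplicands are masked to well under 52 bits, so the mul+add can be selected as VPMADD52L (vpmadd52luq); with the new !Subtarget.hasAVXIFMA() guard this should also apply to the 128/256-bit {vex}-encoded forms on AVX-IFMA-only targets.

; Illustrative only: operands masked to 26 bits, so the product fits in 52 bits
; and the add-of-mul can be matched to X86ISD::VPMADD52L.
define <8 x i64> @sketch(<8 x i64> %x, <8 x i64> %y, <8 x i64> %acc) {
  %xm = and <8 x i64> %x, splat (i64 67108863)   ; 67108863 == (1 << 26) - 1
  %ym = and <8 x i64> %y, splat (i64 67108863)
  %mul = mul nuw nsw <8 x i64> %xm, %ym          ; product needs at most 52 bits
  %res = add <8 x i64> %mul, %acc                ; folds into the accumulator operand
  ret <8 x i64> %res
}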

2 files changed: +272 -6 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 6 additions & 6 deletions
@@ -27,7 +27,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -4462,7 +4461,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
   unsigned NumSubs = 1;
   if ((CheckBWI && Subtarget.useBWIRegs()) ||
       (!CheckBWI && Subtarget.useAVX512Regs())) {
-  // if (0) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -57978,7 +57976,8 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
     return SDValue();

   // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
-  if (!Subtarget.hasVLX() && VT.getSizeInBits() < 512)
+  if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
+      VT.getSizeInBits() < 512)
     return SDValue();

   SDValue X, Y, Acc;
@@ -57996,9 +57995,10 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
   auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
                             ArrayRef<SDValue> SubOps) {
     EVT SubVT = SubOps[0].getValueType();
-    assert(SubVT.getScalarSizeInBits() == 64);
-    return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[0] /*Acc*/,
-                     SubOps[1] /*X*/, SubOps[2] /*Y*/);
+    assert(SubVT.getScalarSizeInBits() == 64 &&
+           "Unexpected element size, only supports 64bit size");
+    return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
+                     SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
   };

   return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,

llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll

Lines changed: 266 additions & 0 deletions
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL
@@ -7,6 +8,27 @@
 ; 4503599627370495 == (1 << 52) - 1

 define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_combine:
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0
+; AVX-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVX-NEXT: vmovdqa %ymm4, %ymm0
+; AVX-NEXT: vmovdqa %ymm5, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_512_combine:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT: retq
 %x_masked = and <8 x i64> %x, splat (i64 67108863)
 %y_masked = and <8 x i64> %y, splat (i64 67108863)
 %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -15,6 +37,27 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 }

 define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_combine_v2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3]
+; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623]
+; AVX-NEXT: vpand %ymm7, %ymm0, %ymm0
+; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0
+; AVX-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVX-NEXT: vmovdqa %ymm4, %ymm0
+; AVX-NEXT: vmovdqa %ymm5, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_512_combine_v2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT: retq
 %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1
 %y_masked = and <8 x i64> %y, splat (i64 3)
 %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -23,6 +66,48 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 }

 define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_no_combine:
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVX-NEXT: vpand %ymm6, %ymm0, %ymm7
+; AVX-NEXT: vpand %ymm6, %ymm1, %ymm8
+; AVX-NEXT: vpand %ymm6, %ymm2, %ymm9
+; AVX-NEXT: vpand %ymm6, %ymm3, %ymm6
+; AVX-NEXT: vpsrlq $32, %ymm8, %ymm8
+; AVX-NEXT: vpmuludq %ymm3, %ymm8, %ymm8
+; AVX-NEXT: vpsrlq $32, %ymm6, %ymm6
+; AVX-NEXT: vpmuludq %ymm6, %ymm1, %ymm6
+; AVX-NEXT: vpaddq %ymm6, %ymm8, %ymm6
+; AVX-NEXT: vpsllq $32, %ymm6, %ymm6
+; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vpsrlq $32, %ymm7, %ymm3
+; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
+; AVX-NEXT: vpsrlq $32, %ymm9, %ymm7
+; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7
+; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3
+; AVX-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1
+; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_512_no_combine:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm4
+; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm4, %zmm4
+; AVX512-NEXT: vpmuludq %zmm1, %zmm4, %zmm4
+; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm3
+; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT: retq
 %x_masked = and <8 x i64> %x, splat (i64 4503599627370495)
 %y_masked = and <8 x i64> %y, splat (i64 4503599627370495)
 %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -31,12 +116,72 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 }

 define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_no_combine_v2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm6
+; AVX-NEXT: vpmuludq %ymm3, %ymm6, %ymm6
+; AVX-NEXT: vpsrlq $32, %ymm3, %ymm7
+; AVX-NEXT: vpmuludq %ymm7, %ymm1, %ymm7
+; AVX-NEXT: vpaddq %ymm6, %ymm7, %ymm6
+; AVX-NEXT: vpsllq $32, %ymm6, %ymm6
+; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
+; AVX-NEXT: vpsrlq $32, %ymm2, %ymm7
+; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7
+; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3
+; AVX-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1
+; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_512_no_combine_v2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm3
+; AVX512-NEXT: vpmuludq %zmm1, %zmm3, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm4
+; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
+; AVX512-NEXT: vpaddq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT: retq
 %mul = mul <8 x i64> %x, %y
 %res = add <8 x i64> %mul, %z
 ret <8 x i64> %res
 }

 define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; AVX-LABEL: test_256_combine:
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vmovdqa %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-NOVL-LABEL: test_256_combine:
+; AVX512-NOVL: # %bb.0:
+; AVX512-NOVL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX512-NOVL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512-NOVL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512-NOVL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; AVX512-NOVL-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NOVL-NEXT: retq
+;
+; AVX512VL-LABEL: test_256_combine:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VL-NEXT: retq
 %x_masked = and <4 x i64> %x, splat(i64 67108863)
 %y_masked = and <4 x i64> %y, splat(i64 67108863)
 %mul = mul nuw nsw <4 x i64> %x_masked, %y_masked
@@ -45,12 +190,50 @@ define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
 }

 define <4 x i64> @test_256_no_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; X64-LABEL: test_256_no_combine:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
+; X64-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
+; X64-NEXT: vpsrlq $32, %ymm1, %ymm4
+; X64-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
+; X64-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; X64-NEXT: vpsllq $32, %ymm3, %ymm3
+; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; X64-NEXT: retq
 %mul = mul <4 x i64> %x, %y
 %res = add <4 x i64> %mul, %z
 ret <4 x i64> %res
 }

 define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
+; AVX-LABEL: test_128_combine:
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX-NEXT: {vex} vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-NOVL-LABEL: test_128_combine:
+; AVX512-NOVL: # %bb.0:
+; AVX512-NOVL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX512-NOVL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512-NOVL-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512-NOVL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX512-NOVL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512-NOVL-NEXT: retq
+;
+; AVX512VL-LABEL: test_128_combine:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512VL-NEXT: retq
 %x_masked = and <2 x i64> %x, splat (i64 67108863)
 %y_masked = and <2 x i64> %y, splat (i64 67108863)
 %mul = mul <2 x i64> %x_masked, %y_masked
@@ -60,13 +243,28 @@ define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {

 ; Sanity check we're not applying this here
 define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
+; X64-LABEL: test_scalar_no_ifma:
+; X64: # %bb.0:
+; X64-NEXT: imulq %rsi, %rdi
+; X64-NEXT: leaq (%rdi,%rdx), %rax
+; X64-NEXT: retq
 %mul = mul <1 x i64> %x, %y
 %res = add <1 x i64> %mul, %z
 ret <1 x i64> %res
 }

 define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; 40-bit and 13-bit, too wide
+; AVX-LABEL: test_mixed_width_too_wide:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %ymm5, %ymm1
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_mixed_width_too_wide:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: retq
 %x40 = and <8 x i64> %x, splat (i64 1099511627775)
 %y13 = and <8 x i64> %y, splat (i64 8191)
 %mul = mul <8 x i64> %x40, %y13
@@ -75,6 +273,27 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64
 }

 define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) {
+; AVX-LABEL: test_zext32_inputs_not_safe:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT: vpmuludq %ymm5, %ymm4, %ymm4
+; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vpaddq %ymm4, %ymm2, %ymm0
+; AVX-NEXT: vpaddq %ymm1, %ymm3, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zext32_inputs_not_safe:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: retq
 %x = zext <8 x i32> %xi32 to <8 x i64>
 %y = zext <8 x i32> %yi32 to <8 x i64>
 %mul = mul <8 x i64> %x, %y
@@ -83,6 +302,53 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32,
 }

 define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) {
+; AVX-LABEL: test_1024_combine_split:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: .cfi_offset %rbp, -16
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: .cfi_def_cfa_register %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $32, %rsp
+; AVX-NEXT: vmovdqa 112(%rbp), %ymm8
+; AVX-NEXT: vmovdqa 80(%rbp), %ymm9
+; AVX-NEXT: vmovdqa 48(%rbp), %ymm10
+; AVX-NEXT: vmovdqa 16(%rbp), %ymm11
+; AVX-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT: vpand %ymm3, %ymm12, %ymm3
+; AVX-NEXT: vpand %ymm2, %ymm12, %ymm2
+; AVX-NEXT: vpand %ymm1, %ymm12, %ymm1
+; AVX-NEXT: vpand %ymm0, %ymm12, %ymm0
+; AVX-NEXT: vpand %ymm7, %ymm12, %ymm7
+; AVX-NEXT: {vex} vpmadd52luq %ymm7, %ymm3, %ymm8
+; AVX-NEXT: vpand %ymm6, %ymm12, %ymm3
+; AVX-NEXT: {vex} vpmadd52luq %ymm3, %ymm2, %ymm9
+; AVX-NEXT: vpand %ymm5, %ymm12, %ymm2
+; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10
+; AVX-NEXT: vpand %ymm4, %ymm12, %ymm1
+; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11
+; AVX-NEXT: vmovdqa %ymm11, %ymm0
+; AVX-NEXT: vmovdqa %ymm10, %ymm1
+; AVX-NEXT: vmovdqa %ymm9, %ymm2
+; AVX-NEXT: vmovdqa %ymm8, %ymm3
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: .cfi_def_cfa %rsp, 8
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_1024_combine_split:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NEXT: vpandq %zmm6, %zmm2, %zmm2
+; AVX512-NEXT: vpandq %zmm6, %zmm0, %zmm0
+; AVX512-NEXT: vpmadd52luq %zmm2, %zmm0, %zmm4
+; AVX512-NEXT: vpandq %zmm6, %zmm3, %zmm0
+; AVX512-NEXT: vpandq %zmm6, %zmm1, %zmm1
+; AVX512-NEXT: vpmadd52luq %zmm0, %zmm1, %zmm5
+; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
+; AVX512-NEXT: retq
 %x_masked = and <16 x i64> %x, splat (i64 67108863)
 %y_masked = and <16 x i64> %y, splat (i64 67108863)
 %mul = mul <16 x i64> %x_masked, %y_masked
