1 | | -; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 |
| 1 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX |
| 2 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL |
| 3 | +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL |
2 | 4 |
3 | 5 | ; 67108863 == (1 << 26) - 1 |
4 | 6 | ; 4503599627370496 == (1 << 52) |
5 | 7 | ; 4503599627370495 == (1 << 52) - 1 |
6 | 8 |
7 | | -define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 { |
8 | | -; X64-LABEL: test_512_combine_evex: |
9 | | -; X64: # %bb.0: |
10 | | -; X64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] |
11 | | -; X64-NEXT: vpandq %zmm3, %zmm0, %zmm0 |
12 | | -; X64-NEXT: vpandq %zmm3, %zmm1, %zmm1 |
13 | | -; X64-NEXT: vpandq %zmm3, %zmm2, %zmm2 |
14 | | -; X64-NOT: vpmul |
15 | | -; X64-NOT: vpadd |
16 | | -; X64-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0 |
17 | | -; X64-NEXT: retq |
18 | | - %4 = and <8 x i64> %0, splat (i64 67108863) |
19 | | - %5 = and <8 x i64> %1, splat (i64 67108863) |
20 | | - %6 = and <8 x i64> %2, splat (i64 67108863) |
21 | | - %7 = mul nuw nsw <8 x i64> %5, %4 |
22 | | - %8 = add nuw nsw <8 x i64> %7, %6 |
23 | | - ret <8 x i64> %8 |
24 | | -} |
25 | | - |
26 | | -define dso_local <8 x i64> @test_512_no_combine_evex_v2(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 { |
27 | | -; X64-LABEL: test_512_no_combine_evex_v2: |
28 | | -; X64-NOT: vpmadd52luq |
29 | | -; X64: retq |
30 | | - %4 = and <8 x i64> %0, splat (i64 4503599627370495) |
31 | | - %5 = and <8 x i64> %1, splat (i64 4503599627370495) |
32 | | - %6 = and <8 x i64> %2, splat (i64 4503599627370495) |
33 | | - %7 = mul nuw nsw <8 x i64> %5, %4 |
34 | | - %8 = add nuw nsw <8 x i64> %7, %6 |
35 | | - ret <8 x i64> %8 |
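| | +; Both operands are masked to 26 bits, so the product fits in 52 bits and should fold to VPMADD52LUQ.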
| 9 | +define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { |
| 10 | + %x_masked = and <8 x i64> %x, splat (i64 67108863) |
| 11 | + %y_masked = and <8 x i64> %y, splat (i64 67108863) |
| 12 | + %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked |
| 13 | + %res = add nuw nsw <8 x i64> %mul, %z |
| 14 | + ret <8 x i64> %res |
36 | 15 | } |
37 | 16 |
38 | | -define dso_local noundef <8 x i64> @test_512_no_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 { |
39 | | -; X64-LABEL: test_512_no_combine_evex: |
40 | | -; X64: # %bb.0: |
41 | | -; X64-NOT: vpmadd52 |
42 | | -; X64-NEXT: vpmullq %zmm0, %zmm1, %zmm0 |
43 | | -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 |
44 | | -; X64-NEXT: retq |
45 | | - %4 = mul <8 x i64> %1, %0 |
46 | | - %5 = add <8 x i64> %4, %2 |
47 | | - ret <8 x i64> %5 |
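| | +; 50-bit times 2-bit still fits in 52 bits, so the combine should apply.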
| 17 | +define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { |
| 18 | + %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1 |
| 19 | + %y_masked = and <8 x i64> %y, splat (i64 3) |
| 20 | + %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked |
| 21 | + %res = add nuw nsw <8 x i64> %mul, %z |
| 22 | + ret <8 x i64> %res |
48 | 23 | } |
49 | 24 |
50 | | -define dso_local <4 x i64> @test_256_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 { |
51 | | -; X64-LABEL: test_256_combine_evex: |
52 | | -; X64: # %bb.0: |
53 | | -; X64-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863] |
54 | | -; X64-NEXT: vpand %ymm3, %ymm0, %ymm0 |
55 | | -; X64-NEXT: vpand %ymm3, %ymm1, %ymm1 |
56 | | -; X64-NEXT: vpand %ymm3, %ymm2, %ymm2 |
57 | | -; X64-NOT: vpmul |
58 | | -; X64-NOT: vpadd |
59 | | -; X64-NEXT: vpmadd52luq %ymm1, %ymm2, %ymm0 |
60 | | -; X64-NEXT: retq |
61 | | - %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863> |
62 | | - %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863> |
63 | | - %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863> |
64 | | - %7 = mul nuw nsw <4 x i64> %5, %4 |
65 | | - %8 = add nuw nsw <4 x i64> %7, %6 |
66 | | - ret <4 x i64> %8 |
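| | +; 52-bit times 52-bit can need up to 104 bits, so no combine.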
| 25 | +define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { |
| 26 | + %x_masked = and <8 x i64> %x, splat (i64 4503599627370495) |
| 27 | + %y_masked = and <8 x i64> %y, splat (i64 4503599627370495) |
| 28 | + %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked |
| 29 | + %res = add nuw nsw <8 x i64> %mul, %z |
| 30 | + ret <8 x i64> %res |
67 | 31 | } |
68 | 32 |
69 | | -define dso_local noundef <4 x i64> @test_256_no_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 { |
70 | | -; X64-LABEL: test_256_no_combine_evex: |
71 | | -; X64: # %bb.0: |
72 | | -; X64-NOT: vpmadd52 |
73 | | -; X64-NEXT: vpmullq %ymm0, %ymm1, %ymm0 |
74 | | -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 |
75 | | -; X64-NEXT: retq |
76 | | - %4 = mul <4 x i64> %1, %0 |
77 | | - %5 = add <4 x i64> %4, %2 |
78 | | - ret <4 x i64> %5 |
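| | +; No range information on the operands, so no combine.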
| 33 | +define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { |
| 34 | + %mul = mul <8 x i64> %x, %y |
| 35 | + %res = add <8 x i64> %mul, %z |
| 36 | + ret <8 x i64> %res |
79 | 37 | } |
80 | 38 |
81 | | -define dso_local <4 x i64> @test_256_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 { |
82 | | -; X64-LABEL: test_256_combine_vex: |
83 | | -; X64: # %bb.0: |
84 | | -; X64-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863] |
85 | | -; X64-NEXT: vpand %ymm3, %ymm0, %ymm0 |
86 | | -; X64-NEXT: vpand %ymm3, %ymm1, %ymm1 |
87 | | -; X64-NEXT: vpand %ymm3, %ymm2, %ymm2 |
88 | | -; X64-NOT: vpmul |
89 | | -; X64-NOT: vpadd |
90 | | -; X64-NEXT: {vex} vpmadd52luq %ymm1, %ymm2, %ymm0 |
91 | | -; X64-NEXT: retq |
92 | | - %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863> |
93 | | - %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863> |
94 | | - %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863> |
95 | | - %7 = mul nuw nsw <4 x i64> %5, %4 |
96 | | - %8 = add nuw nsw <4 x i64> %7, %6 |
97 | | - ret <4 x i64> %8 |
| 39 | +define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) { |
| 40 | + %x_masked = and <4 x i64> %x, splat (i64 67108863)
| 41 | + %y_masked = and <4 x i64> %y, splat (i64 67108863)
| 42 | + %mul = mul nuw nsw <4 x i64> %x_masked, %y_masked |
| 43 | + %res = add nuw nsw <4 x i64> %z, %mul |
| 44 | + ret <4 x i64> %res |
98 | 45 | } |
99 | 46 |
100 | | -define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 { |
101 | | -; X64-LABEL: test_256_no_combine_vex: |
102 | | -; X64: # %bb.0: |
103 | | -; X64-NOT: vpmadd52 |
104 | | -; X64-NEXT: vpmullq %ymm0, %ymm1, %ymm0 |
105 | | -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 |
106 | | -; X64-NEXT: retq |
107 | | - %4 = mul <4 x i64> %1, %0 |
108 | | - %5 = add <4 x i64> %4, %2 |
109 | | - ret <4 x i64> %5 |
| 47 | +define <4 x i64> @test_256_no_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) { |
| 48 | + %mul = mul <4 x i64> %x, %y |
| 49 | + %res = add <4 x i64> %mul, %z |
| 50 | + ret <4 x i64> %res |
110 | 51 | } |
111 | 52 |
112 | | -define i64 @scalar_no_ifma(i64 %a, i64 %b, i64 %acc) #0 { |
113 | | -; X64-LABEL: scalar_no_ifma: |
114 | | -; X64-NOT: vpmadd52 |
115 | | -; X64-NOT: vpmullq |
116 | | -; X64: imulq |
117 | | -; X64: ret |
118 | | -entry: |
119 | | - %mul = mul i64 %a, %b |
120 | | - %res = add i64 %acc, %mul |
121 | | - ret i64 %res |
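| | +; 128-bit variant of the 26-bit mask case.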
| 53 | +define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { |
| 54 | + %x_masked = and <2 x i64> %x, splat (i64 67108863) |
| 55 | + %y_masked = and <2 x i64> %y, splat (i64 67108863) |
| 56 | + %mul = mul <2 x i64> %x_masked, %y_masked |
| 57 | + %res = add <2 x i64> %z, %mul |
| 58 | + ret <2 x i64> %res |
122 | 59 | } |
123 | 60 |
124 | | -define <8 x i64> @mixed_width_too_wide(<8 x i64> %a, <8 x i64> %b, <8 x i64> %acc) #0 { |
125 | | -; X64-LABEL: mixed_width_too_wide: |
126 | | -; X64-NOT: vpmadd52luq |
127 | | -; X64: vpmullq |
128 | | -; X64: ret |
129 | | -entry: |
130 | | - ; 40-bit and 13-bit, product fits < 2^53 (NOT < 2^52) |
131 | | - %a40 = and <8 x i64> %a, splat (i64 1099511627775) |
132 | | - %b13 = and <8 x i64> %b, splat (i64 8191) |
133 | | - %mul = mul <8 x i64> %a40, %b13 |
134 | | - %res = add <8 x i64> %acc, %mul |
135 | | - ret <8 x i64> %res |
| 61 | +; Sanity check that the combine does not apply to the scalar-like <1 x i64> case
| 62 | +define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) { |
| 63 | + %mul = mul <1 x i64> %x, %y |
| 64 | + %res = add <1 x i64> %mul, %z |
| 65 | + ret <1 x i64> %res |
136 | 66 | } |
137 | 67 |
138 | | -define <8 x i64> @zext32_inputs_not_safe(<8 x i32> %ai32, <8 x i32> %bi32, <8 x i64> %acc) #0 { |
139 | | -; X64-LABEL: zext32_inputs_not_safe: |
140 | | -; X64: vpmul |
141 | | -; X64-NOT: vpmadd52luq |
142 | | -; X64: ret |
143 | | -entry: |
144 | | - %a = zext <8 x i32> %ai32 to <8 x i64> |
145 | | - %b = zext <8 x i32> %bi32 to <8 x i64> |
146 | | - %mul = mul <8 x i64> %a, %b |
147 | | - %res = add <8 x i64> %acc, %mul |
148 | | - ret <8 x i64> %res |
| 68 | +define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { |
| 69 | + ; 40-bit times 13-bit: the product fits in 53 bits but not 52, so it is too wide to combine
| 70 | + %x40 = and <8 x i64> %x, splat (i64 1099511627775) |
| 71 | + %y13 = and <8 x i64> %y, splat (i64 8191) |
| 72 | + %mul = mul <8 x i64> %x40, %y13 |
| 73 | + %res = add <8 x i64> %z, %mul |
| 74 | + ret <8 x i64> %res
149 | 75 | } |
150 | 76 |
151 | | -define <8 x i64> @const_2pow51_times_2(<8 x i64> %acc) #0 { |
152 | | -; X64-LABEL: const_2pow51_times_2: |
153 | | -; X64-NOT: vpmadd52luq |
154 | | -; X64: vpaddq |
155 | | -; X64: ret |
156 | | -entry: |
157 | | - ; product = 2^52 |
158 | | - %mul = mul <8 x i64> splat(i64 2251799813685248), splat(i64 2) |
159 | | - %res = add <8 x i64> %acc, %mul ; needs full low-64 add |
| 77 | +define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) { |
| 78 | + %x = zext <8 x i32> %xi32 to <8 x i64> |
| 79 | + %y = zext <8 x i32> %yi32 to <8 x i64> |
| 80 | + %mul = mul <8 x i64> %x, %y |
| 81 | + %res = add <8 x i64> %z, %mul |
160 | 82 | ret <8 x i64> %res |
161 | 83 | } |
162 | 84 |
163 | | -define <4 x i64> @safe_ifma_v4(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #1 { |
164 | | -; X64-LABEL: safe_ifma_v4: |
165 | | -; X64: vpmadd52luq |
166 | | -; X64-NOT: vpmullq |
167 | | -; X64: ret |
168 | | -entry: |
169 | | - %a26 = and <4 x i64> %a, splat (i64 67108863) |
170 | | - %b26 = and <4 x i64> %b, splat (i64 67108863) |
171 | | - %mul = mul <4 x i64> %a26, %b26 |
172 | | - %res = add <4 x i64> %acc, %mul |
173 | | - ret <4 x i64> %res |
174 | | -} |
175 | | - |
176 | | -define <2 x i64> @safe_ifma_v2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %acc) #1 { |
177 | | -; X64-LABEL: safe_ifma_v2: |
178 | | -; X64: vpmadd52luq |
179 | | -; X64-NOT: vpmullq |
180 | | -; X64: ret |
181 | | -entry: |
182 | | - %a26 = and <2 x i64> %a, splat (i64 67108863) |
183 | | - %b26 = and <2 x i64> %b, splat (i64 67108863) |
184 | | - %mul = mul <2 x i64> %a26, %b26 |
185 | | - %res = add <2 x i64> %acc, %mul |
186 | | - ret <2 x i64> %res |
187 | | -} |
188 | | - |
189 | | -define <4 x i64> @v4_no_vl_fallback(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #0 { |
190 | | -; X64-LABEL: v4_no_vl_fallback: |
191 | | -; X64-NOT: vpmadd52luq |
192 | | -; X64: pmul |
193 | | -; X64: ret |
194 | | -entry: |
195 | | - %a26 = and <4 x i64> %a, splat (i64 67108863) |
196 | | - %b26 = and <4 x i64> %b, splat (i64 67108863) |
197 | | - %mul = mul <4 x i64> %a26, %b26 |
198 | | - %res = add <4 x i64> %acc, %mul |
199 | | - ret <4 x i64> %res |
200 | | -} |
201 | | - |
202 | | -define <16 x i64> @v16_test_split(<16 x i64> %a, <16 x i64> %b, <16 x i64> %acc) #1 { |
203 | | -; X64-LABEL: v16_test_split: |
204 | | -; X64: vpmadd52luq |
205 | | -; X64: vpmadd52luq |
206 | | -; X64: ret |
207 | | -entry: |
208 | | - %a26 = and <16 x i64> %a, splat (i64 67108863) |
209 | | - %b26 = and <16 x i64> %b, splat (i64 67108863) |
210 | | - %mul = mul <16 x i64> %a26, %b26 |
211 | | - %res = add <16 x i64> %acc, %mul |
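| | +; <16 x i64> is wider than a ZMM register; the combine should still apply after the type is split.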
| 85 | +define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) { |
| 86 | + %x_masked = and <16 x i64> %x, splat (i64 67108863) |
| 87 | + %y_masked = and <16 x i64> %y, splat (i64 67108863) |
| 88 | + %mul = mul <16 x i64> %x_masked, %y_masked |
| 89 | + %res = add <16 x i64> %z, %mul |
212 | 90 | ret <16 x i64> %res |
213 | 91 | } |
214 | | - |
215 | | -attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } |
216 | | -attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } |
217 | | -attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } |