11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2- ; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=X64
2+ ; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
33
44; 67108863 == (1 << 26) - 1
5+ ; 4503599627370496 == (1 << 52)
6+ ; 4503599627370495 == (1 << 52) - 1
57
68define dso_local <8 x i64 > @test_512_combine_evex (<8 x i64 > noundef %0 , <8 x i64 > noundef %1 , <8 x i64 > noundef %2 ) local_unnamed_addr #0 {
79; X64-LABEL: test_512_combine_evex:
@@ -22,14 +24,16 @@ define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64
2224 ret <8 x i64 > %8
2325}
2426
25- define dso_local <8 x i64 > @fff (<8 x i64 > noundef %0 , <8 x i64 > noundef %1 , <8 x i64 > noundef %2 ) local_unnamed_addr #0 {
26- %4 = and <8 x i64 > %0 , splat (i64 67108863 )
27- %5 = and <8 x i64 > %1 , splat (i64 67108863 )
28- %6 = and <8 x i64 > %2 , splat (i64 67108863 )
27+ define dso_local <8 x i64 > @test_512_no_combine_evex_v2 (<8 x i64 > noundef %0 , <8 x i64 > noundef %1 , <8 x i64 > noundef %2 ) local_unnamed_addr #0 { ; negative test: the checks below forbid vpmadd52luq for this multiply-add
28+ ; X64-LABEL: test_512_no_combine_evex_v2:
29+ ; X64-NOT: vpmadd52luq
30+ ; X64: retq
31+ %4 = and <8 x i64 > %0 , splat (i64 4503599627370495 ) ; keep the low 52 bits: mask is (1 << 52) - 1
32+ %5 = and <8 x i64 > %1 , splat (i64 4503599627370495 ) ; same 52-bit mask on the second multiplicand
33+ %6 = and <8 x i64 > %2 , splat (i64 4503599627370495 ) ; same 52-bit mask on the addend
2934 %7 = mul nuw nsw <8 x i64 > %5 , %4 ; each operand fits in 52 bits, but the masks alone do not bound the product below 2^52
30- %8 = mul nuw nsw <8 x i64 > %7 , %6
31- %9 = add nuw nsw <8 x i64 > %8 , %7
32- ret <8 x i64 > %9
35+ %8 = add nuw nsw <8 x i64 > %7 , %6 ; accumulate the masked third argument onto the product
36+ ret <8 x i64 > %8
3337}
3438
3539define dso_local noundef <8 x i64 > @test_512_no_combine_evex (<8 x i64 > noundef %0 , <8 x i64 > noundef %1 , <8 x i64 > noundef %2 ) local_unnamed_addr #0 {
@@ -106,6 +110,100 @@ define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0
106110 ret <4 x i64 > %5
107111}
108112
109- attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width" ="512" "no-trapping-math" ="true" "stack-protector-buffer-size" ="8" "target-cpu" ="x86-64" "target-features" ="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu" ="generic" }
110- attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width" ="256" "no-trapping-math" ="true" "stack-protector-buffer-size" ="8" "target-cpu" ="x86-64" "target-features" ="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu" ="generic" }
111- attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width" ="256" "no-trapping-math" ="true" "stack-protector-buffer-size" ="8" "target-cpu" ="x86-64" "target-features" ="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu" ="generic" }
113+ define i64 @scalar_no_ifma (i64 %a , i64 %b , i64 %acc ) #0 { ; negative test: scalar i64 multiply-add; the checks require imulq and forbid vpmadd52*/vpmullq
114+ ; X64-LABEL: scalar_no_ifma:
115+ ; X64-NOT: vpmadd52
116+ ; X64-NOT: vpmullq
117+ ; X64: imulq
118+ ; X64: ret
119+ entry:
120+ %mul = mul i64 %a , %b ; plain scalar multiply, operands are not masked
121+ %res = add i64 %acc , %mul ; accumulate the product onto %acc
122+ ret i64 %res
123+ }
124+
125+ define <8 x i64 > @mixed_width_too_wide (<8 x i64 > %a , <8 x i64 > %b , <8 x i64 > %acc ) #0 { ; negative test: the checks require vpmullq and forbid vpmadd52luq
126+ ; X64-LABEL: mixed_width_too_wide:
127+ ; X64-NOT: vpmadd52luq
128+ ; X64: vpmullq
129+ ; X64: ret
130+ entry:
131+ ; Operands are masked to 40 and 13 bits, so the product can need up to 53 bits: it is < 2^53 but NOT guaranteed < 2^52.
132+ %a40 = and <8 x i64 > %a , splat (i64 1099511627775 ) ; 1099511627775 == (1 << 40) - 1
133+ %b13 = and <8 x i64 > %b , splat (i64 8191 ) ; 8191 == (1 << 13) - 1
134+ %mul = mul <8 x i64 > %a40 , %b13 ; maximum product is (2^40 - 1) * (2^13 - 1), which exceeds 2^52
135+ %res = add <8 x i64 > %acc , %mul ; accumulate the product onto %acc
136+ ret <8 x i64 > %res
137+ }
138+
139+ define <8 x i64 > @zext32_inputs_not_safe (<8 x i32 > %ai32 , <8 x i32 > %bi32 , <8 x i64 > %acc ) #0 { ; negative test: the checks require a vpmul* multiply and forbid vpmadd52luq
140+ ; X64-LABEL: zext32_inputs_not_safe:
141+ ; X64: vpmul
142+ ; X64-NOT: vpmadd52luq
143+ ; X64: ret
144+ entry:
145+ %a = zext <8 x i32 > %ai32 to <8 x i64 > ; each lane is at most 2^32 - 1 after zero extension
146+ %b = zext <8 x i32 > %bi32 to <8 x i64 > ; same 32-bit bound for the second operand
147+ %mul = mul <8 x i64 > %a , %b ; a 32-bit by 32-bit product can need up to 64 bits, i.e. more than 52
148+ %res = add <8 x i64 > %acc , %mul ; accumulate the product onto %acc
149+ ret <8 x i64 > %res
150+ }
151+
152+ define <8 x i64 > @const_2pow51_times_2 (<8 x i64 > %acc ) #0 { ; negative test: constant splats whose product is exactly 2^52; the checks require vpaddq and forbid vpmadd52luq
153+ ; X64-LABEL: const_2pow51_times_2:
154+ ; X64-NOT: vpmadd52luq
155+ ; X64: vpaddq
156+ ; X64: ret
157+ entry:
158+ %a = insertelement <8 x i64 > poison , i64 2251799813685248 , i32 0 ; 2^51 in lane 0; poison (not undef) placeholder, matching the shuffle operands below
159+ %a.s = shufflevector <8 x i64 > %a , <8 x i64 > poison, <8 x i32 > splat (i32 0 ) ; broadcast lane 0 to all eight lanes
160+ %b = insertelement <8 x i64 > poison , i64 2 , i32 0 ; constant 2 in lane 0
161+ %b.s = shufflevector <8 x i64 > %b , <8 x i64 > poison, <8 x i32 > splat (i32 0 ) ; broadcast lane 0 to all eight lanes
162+ %mul = mul <8 x i64 > %a.s , %b.s ; product = 2^52
163+ %res = add <8 x i64 > %acc , %mul ; needs full low-64 add
164+ ret <8 x i64 > %res
165+ }
166+
167+ define <4 x i64 > @safe_ifma_v4 (<4 x i64 > %a , <4 x i64 > %b , <4 x i64 > %acc ) #1 { ; positive test on 256-bit vectors: the checks require vpmadd52luq and forbid vpmullq
168+ ; X64-LABEL: safe_ifma_v4:
169+ ; X64: vpmadd52luq
170+ ; X64-NOT: vpmullq
171+ ; X64: ret
172+ entry:
173+ %a26 = and <4 x i64 > %a , splat (i64 67108863 ) ; keep the low 26 bits: mask is (1 << 26) - 1
174+ %b26 = and <4 x i64 > %b , splat (i64 67108863 ) ; same 26-bit mask on the second operand
175+ %mul = mul <4 x i64 > %a26 , %b26 ; product of two 26-bit values is always < 2^52
176+ %res = add <4 x i64 > %acc , %mul ; accumulate the product onto %acc
177+ ret <4 x i64 > %res
178+ }
179+
180+ define <2 x i64 > @safe_ifma_v2 (<2 x i64 > %a , <2 x i64 > %b , <2 x i64 > %acc ) #1 { ; positive test on 128-bit vectors: the checks require vpmadd52luq and forbid vpmullq
181+ ; X64-LABEL: safe_ifma_v2:
182+ ; X64: vpmadd52luq
183+ ; X64-NOT: vpmullq
184+ ; X64: ret
185+ entry:
186+ %a26 = and <2 x i64 > %a , splat (i64 67108863 ) ; keep the low 26 bits: mask is (1 << 26) - 1
187+ %b26 = and <2 x i64 > %b , splat (i64 67108863 ) ; same 26-bit mask on the second operand
188+ %mul = mul <2 x i64 > %a26 , %b26 ; product of two 26-bit values is always < 2^52
189+ %res = add <2 x i64 > %acc , %mul ; accumulate the product onto %acc
190+ ret <2 x i64 > %res
191+ }
192+
193+ define <4 x i64 > @v4_no_vl_fallback (<4 x i64 > %a , <4 x i64 > %b , <4 x i64 > %acc ) #0 { ; negative test: same 26-bit pattern as safe_ifma_v4 but under attribute group #0; the checks forbid vpmadd52luq and require a *pmul* multiply
194+ ; X64-LABEL: v4_no_vl_fallback:
195+ ; X64-NOT: vpmadd52luq
196+ ; X64: pmul
197+ ; X64: ret
198+ entry:
199+ %a26 = and <4 x i64 > %a , splat (i64 67108863 ) ; keep the low 26 bits: mask is (1 << 26) - 1
200+ %b26 = and <4 x i64 > %b , splat (i64 67108863 ) ; same 26-bit mask on the second operand
201+ %mul = mul <4 x i64 > %a26 , %b26 ; product of two 26-bit values is always < 2^52
202+ %res = add <4 x i64 > %acc , %mul ; accumulate the product onto %acc
203+ ret <4 x i64 > %res
204+ }
205+
206+ attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features" ="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu" ="generic" } ; #0: AVX512-IFMA with evex512 enabled and AVX512VL explicitly disabled
207+ attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features" ="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu" ="generic" } ; #1: AVX512-IFMA together with AVX512VL
208+ attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features" ="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu" ="generic" } ; #2: AVX-IFMA (+avxifma) with AVX512VL; +avx512ifma is not listed
209+ attributes #3 = { "target-features" ="+avx512dq,+avx512f,+avx512ifma,+avx512vl,-evex512" } ; #3: AVX512-IFMA + VL with evex512 disabled; NOTE(review): no function in this excerpt references #3 - confirm it is used elsewhere in the file
0 commit comments