; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2

; Only bottom 16 bits are set - upper 48 bits are zero.
define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) nounwind {
; SSE-LABEL: combine_psadbw_shift:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: combine_psadbw_shift:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %3 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, <16 x i8> %1)
  %4 = lshr <2 x i64> %3, <i64 48, i64 48>
  ret <2 x i64> %4
}

; Propagate the demanded result elements to the 8 aliasing source elements.
1723define i64 @combine_psadbw_demandedelt (<16 x i8 > %0 , <16 x i8 > %1 ) nounwind {
18- ; X86-LABEL: combine_psadbw_demandedelt:
19- ; X86: # %bb.0:
20- ; X86-NEXT: psadbw %xmm1, %xmm0
21- ; X86-NEXT: movd %xmm0, %eax
22- ; X86-NEXT: xorl %edx, %edx
23- ; X86-NEXT: retl
24+ ; X86-SSE-LABEL: combine_psadbw_demandedelt:
25+ ; X86-SSE: # %bb.0:
26+ ; X86-SSE-NEXT: psadbw %xmm1, %xmm0
27+ ; X86-SSE-NEXT: movd %xmm0, %eax
28+ ; X86-SSE-NEXT: xorl %edx, %edx
29+ ; X86-SSE-NEXT: retl
30+ ;
31+ ; X64-SSE-LABEL: combine_psadbw_demandedelt:
32+ ; X64-SSE: # %bb.0:
33+ ; X64-SSE-NEXT: psadbw %xmm1, %xmm0
34+ ; X64-SSE-NEXT: movq %xmm0, %rax
35+ ; X64-SSE-NEXT: retq
2436;
25- ; X64 -LABEL: combine_psadbw_demandedelt:
26- ; X64 : # %bb.0:
27- ; X64 -NEXT: psadbw %xmm1, %xmm0
28- ; X64 -NEXT: movq %xmm0, %rax
29- ; X64 -NEXT: retq
37+ ; AVX2 -LABEL: combine_psadbw_demandedelt:
38+ ; AVX2 : # %bb.0:
39+ ; AVX2 -NEXT: vpsadbw %xmm1, %xmm0 , %xmm0
40+ ; AVX2 -NEXT: vmovq %xmm0, %rax
41+ ; AVX2 -NEXT: retq
3042 %3 = shufflevector <16 x i8 > %0 , <16 x i8 > %0 , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 12 , i32 13 , i32 14 , i32 15 , i32 8 , i32 9 , i32 10 , i32 11 >
3143 %4 = shufflevector <16 x i8 > %1 , <16 x i8 > %1 , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 12 , i32 13 , i32 14 , i32 15 , i32 8 , i32 9 , i32 10 , i32 11 >
3244 %5 = tail call <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 > %3 , <16 x i8 > %4 )
@@ -36,25 +48,33 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {

; TODO: Each PSADBW source element has a maximum value of 3 - so max sum-of-diffs for each <8 x i8> should be 24.
3850define <2 x i64 > @combine_psadbw_cmp_knownbits (<16 x i8 > %a0 ) nounwind {
39- ; X86-LABEL: combine_psadbw_cmp_knownbits:
40- ; X86: # %bb.0:
41- ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
42- ; X86-NEXT: pxor %xmm1, %xmm1
43- ; X86-NEXT: psadbw %xmm0, %xmm1
44- ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
45- ; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
46- ; X86-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
47- ; X86-NEXT: retl
51+ ; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
52+ ; X86-SSE: # %bb.0:
53+ ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
54+ ; X86-SSE-NEXT: pxor %xmm1, %xmm1
55+ ; X86-SSE-NEXT: psadbw %xmm0, %xmm1
56+ ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
57+ ; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
58+ ; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
59+ ; X86-SSE-NEXT: retl
60+ ;
61+ ; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
62+ ; X64-SSE: # %bb.0:
63+ ; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
64+ ; X64-SSE-NEXT: pxor %xmm1, %xmm1
65+ ; X64-SSE-NEXT: psadbw %xmm0, %xmm1
66+ ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
67+ ; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
68+ ; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
69+ ; X64-SSE-NEXT: retq
4870;
49- ; X64-LABEL: combine_psadbw_cmp_knownbits:
50- ; X64: # %bb.0:
51- ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
52- ; X64-NEXT: pxor %xmm1, %xmm1
53- ; X64-NEXT: psadbw %xmm0, %xmm1
54- ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
55- ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
56- ; X64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
57- ; X64-NEXT: retq
71+ ; AVX2-LABEL: combine_psadbw_cmp_knownbits:
72+ ; AVX2: # %bb.0:
73+ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
74+ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
75+ ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
76+ ; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
77+ ; AVX2-NEXT: retq
5878 %mask = and <16 x i8 > %a0 , <i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 , i8 3 >
5979 %sad = tail call <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 > %mask , <16 x i8 > zeroinitializer )
6080 %cmp = icmp sgt <2 x i64 > %sad , <i64 32 , i64 32 >
@@ -64,42 +84,53 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {

; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
6686define <2 x double > @combine_psadbw_sitofp_knownbits (<16 x i8 > %a0 ) nounwind {
67- ; X86-LABEL: combine_psadbw_sitofp_knownbits:
68- ; X86: # %bb.0:
69- ; X86-NEXT: pushl %ebp
70- ; X86-NEXT: movl %esp, %ebp
71- ; X86-NEXT: andl $-8, %esp
72- ; X86-NEXT: subl $32, %esp
73- ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
74- ; X86-NEXT: pxor %xmm1, %xmm1
75- ; X86-NEXT: psadbw %xmm0, %xmm1
76- ; X86-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
77- ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
78- ; X86-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
79- ; X86-NEXT: fildll {{[0-9]+}}(%esp)
80- ; X86-NEXT: fstpl {{[0-9]+}}(%esp)
81- ; X86-NEXT: fildll {{[0-9]+}}(%esp)
82- ; X86-NEXT: fstpl (%esp)
83- ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
84- ; X86-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
85- ; X86-NEXT: movl %ebp, %esp
86- ; X86-NEXT: popl %ebp
87- ; X86-NEXT: retl
87+ ; X86-SSE- LABEL: combine_psadbw_sitofp_knownbits:
88+ ; X86-SSE : # %bb.0:
89+ ; X86-SSE- NEXT: pushl %ebp
90+ ; X86-SSE- NEXT: movl %esp, %ebp
91+ ; X86-SSE- NEXT: andl $-8, %esp
92+ ; X86-SSE- NEXT: subl $32, %esp
93+ ; X86-SSE- NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
94+ ; X86-SSE- NEXT: pxor %xmm1, %xmm1
95+ ; X86-SSE- NEXT: psadbw %xmm0, %xmm1
96+ ; X86-SSE- NEXT: movq %xmm1, {{[0-9]+}}(%esp)
97+ ; X86-SSE- NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
98+ ; X86-SSE- NEXT: movq %xmm0, {{[0-9]+}}(%esp)
99+ ; X86-SSE- NEXT: fildll {{[0-9]+}}(%esp)
100+ ; X86-SSE- NEXT: fstpl {{[0-9]+}}(%esp)
101+ ; X86-SSE- NEXT: fildll {{[0-9]+}}(%esp)
102+ ; X86-SSE- NEXT: fstpl (%esp)
103+ ; X86-SSE- NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
104+ ; X86-SSE- NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
105+ ; X86-SSE- NEXT: movl %ebp, %esp
106+ ; X86-SSE- NEXT: popl %ebp
107+ ; X86-SSE- NEXT: retl
88108;
89- ; X64-LABEL: combine_psadbw_sitofp_knownbits:
90- ; X64: # %bb.0:
91- ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
92- ; X64-NEXT: pxor %xmm1, %xmm1
93- ; X64-NEXT: psadbw %xmm0, %xmm1
94- ; X64-NEXT: movd %xmm1, %eax
95- ; X64-NEXT: xorps %xmm0, %xmm0
96- ; X64-NEXT: cvtsi2sd %eax, %xmm0
97- ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
98- ; X64-NEXT: movd %xmm1, %eax
99- ; X64-NEXT: xorps %xmm1, %xmm1
100- ; X64-NEXT: cvtsi2sd %eax, %xmm1
101- ; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
102- ; X64-NEXT: retq
109+ ; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
110+ ; X64-SSE: # %bb.0:
111+ ; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
112+ ; X64-SSE-NEXT: pxor %xmm1, %xmm1
113+ ; X64-SSE-NEXT: psadbw %xmm0, %xmm1
114+ ; X64-SSE-NEXT: movd %xmm1, %eax
115+ ; X64-SSE-NEXT: xorps %xmm0, %xmm0
116+ ; X64-SSE-NEXT: cvtsi2sd %eax, %xmm0
117+ ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
118+ ; X64-SSE-NEXT: movd %xmm1, %eax
119+ ; X64-SSE-NEXT: xorps %xmm1, %xmm1
120+ ; X64-SSE-NEXT: cvtsi2sd %eax, %xmm1
121+ ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
122+ ; X64-SSE-NEXT: retq
123+ ;
124+ ; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
125+ ; AVX2: # %bb.0:
126+ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
127+ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
128+ ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
129+ ; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm1
130+ ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
131+ ; AVX2-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0
132+ ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
133+ ; AVX2-NEXT: retq
103134 %mask = and <16 x i8 > %a0 , <i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 >
104135 %sad = tail call <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 > %mask , <16 x i8 > zeroinitializer )
105136 %cvt = sitofp <2 x i64 > %sad to <2 x double >
@@ -108,27 +139,40 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {

; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
110141define <2 x double > @combine_psadbw_uitofp_knownbits (<16 x i8 > %a0 ) nounwind {
111- ; X86-LABEL: combine_psadbw_uitofp_knownbits:
112- ; X86: # %bb.0:
113- ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
114- ; X86-NEXT: pxor %xmm1, %xmm1
115- ; X86-NEXT: psadbw %xmm1, %xmm0
116- ; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
117- ; X86-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
118- ; X86-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
119- ; X86-NEXT: addpd %xmm1, %xmm0
120- ; X86-NEXT: retl
142+ ; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
143+ ; X86-SSE: # %bb.0:
144+ ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
145+ ; X86-SSE-NEXT: pxor %xmm1, %xmm1
146+ ; X86-SSE-NEXT: psadbw %xmm1, %xmm0
147+ ; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
148+ ; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
149+ ; X86-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
150+ ; X86-SSE-NEXT: addpd %xmm1, %xmm0
151+ ; X86-SSE-NEXT: retl
152+ ;
153+ ; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
154+ ; X64-SSE: # %bb.0:
155+ ; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
156+ ; X64-SSE-NEXT: pxor %xmm1, %xmm1
157+ ; X64-SSE-NEXT: psadbw %xmm1, %xmm0
158+ ; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
159+ ; X64-SSE-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
160+ ; X64-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
161+ ; X64-SSE-NEXT: addpd %xmm1, %xmm0
162+ ; X64-SSE-NEXT: retq
121163;
122- ; X64-LABEL: combine_psadbw_uitofp_knownbits:
123- ; X64: # %bb.0:
124- ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
125- ; X64-NEXT: pxor %xmm1, %xmm1
126- ; X64-NEXT: psadbw %xmm1, %xmm0
127- ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
128- ; X64-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
129- ; X64-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
130- ; X64-NEXT: addpd %xmm1, %xmm0
131- ; X64-NEXT: retq
164+ ; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
165+ ; AVX2: # %bb.0:
166+ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
167+ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
168+ ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
169+ ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
170+ ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
171+ ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
172+ ; AVX2-NEXT: # xmm1 = mem[0,0]
173+ ; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
174+ ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
175+ ; AVX2-NEXT: retq
132176 %mask = and <16 x i8 > %a0 , <i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 , i8 1 >
133177 %sad = tail call <2 x i64 > @llvm.x86.sse2.psad.bw (<16 x i8 > %mask , <16 x i8 > zeroinitializer )
134178 %cvt = uitofp <2 x i64 > %sad to <2 x double >
0 commit comments