Skip to content

Commit 26deae4

Browse files
RKSimon authored and kcloudy0717 committed
[X86] Add tests showing failure to concat sqrt intrinsics together. (llvm#170096)
Similar to fdiv, we should be trying to concat these high-latency instructions together.
1 parent 89f3657 commit 26deae4

File tree

1 file changed

+91
-0
lines changed

1 file changed

+91
-0
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
6+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
7+
8+
; concat_sqrt_v8f32_v4f32: apply llvm.sqrt to two <4 x float> inputs and
; concatenate the two results into a single <8 x float>.
; The expected output below performs one 128-bit sqrt per input; the AVX
; checks then join the two halves with vinsertf128.
; NOTE(review): presumably the desired AVX lowering is one 256-bit vsqrtps on
; the concatenated inputs - confirm before regenerating these assertions.
define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
9+
; SSE-LABEL: concat_sqrt_v8f32_v4f32:
10+
; SSE: # %bb.0:
11+
; SSE-NEXT: sqrtps %xmm0, %xmm0
12+
; SSE-NEXT: sqrtps %xmm1, %xmm1
13+
; SSE-NEXT: retq
14+
;
15+
; AVX-LABEL: concat_sqrt_v8f32_v4f32:
16+
; AVX: # %bb.0:
17+
; AVX-NEXT: vsqrtps %xmm0, %xmm0
18+
; AVX-NEXT: vsqrtps %xmm1, %xmm1
19+
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
20+
; AVX-NEXT: retq
21+
; One independent sqrt call per 128-bit input.
%v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
22+
%v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
23+
; Mask 0..7 takes every lane of %v0 followed by every lane of %v1, i.e. a
; plain concatenation with no lane reordering.
%res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
24+
ret <8 x float> %res
25+
}
26+
27+
; concat_sqrt_v16f32_v4f32: apply llvm.sqrt to four <4 x float> inputs and
; concatenate the four results, in argument order, into one <16 x float>.
; The expected output below performs four separate 128-bit sqrt operations on
; every configuration. The AVX1OR2 checks then build two ymm values with
; vinsertf128; the AVX512 checks additionally merge those into zmm0 with
; vinsertf64x4.
; NOTE(review): presumably the desired lowering performs the sqrt on the
; already-concatenated wider vector - confirm before regenerating.
define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
28+
; SSE-LABEL: concat_sqrt_v16f32_v4f32:
29+
; SSE: # %bb.0:
30+
; SSE-NEXT: sqrtps %xmm0, %xmm0
31+
; SSE-NEXT: sqrtps %xmm1, %xmm1
32+
; SSE-NEXT: sqrtps %xmm2, %xmm2
33+
; SSE-NEXT: sqrtps %xmm3, %xmm3
34+
; SSE-NEXT: retq
35+
;
36+
; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32:
37+
; AVX1OR2: # %bb.0:
38+
; AVX1OR2-NEXT: vsqrtps %xmm0, %xmm0
39+
; AVX1OR2-NEXT: vsqrtps %xmm1, %xmm1
40+
; AVX1OR2-NEXT: vsqrtps %xmm2, %xmm2
41+
; AVX1OR2-NEXT: vsqrtps %xmm3, %xmm3
42+
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
43+
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
44+
; AVX1OR2-NEXT: retq
45+
;
46+
; AVX512-LABEL: concat_sqrt_v16f32_v4f32:
47+
; AVX512: # %bb.0:
48+
; AVX512-NEXT: vsqrtps %xmm0, %xmm0
49+
; AVX512-NEXT: vsqrtps %xmm1, %xmm1
50+
; AVX512-NEXT: vsqrtps %xmm2, %xmm2
51+
; AVX512-NEXT: vsqrtps %xmm3, %xmm3
52+
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
53+
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
54+
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
55+
; AVX512-NEXT: retq
56+
; One independent sqrt call per 128-bit input.
%v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
57+
%v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
58+
%v2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a2)
59+
%v3 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a3)
60+
; First concatenation level: pair the results up into two <8 x float> values
; (%v0 then %v1, and %v2 then %v3) using identity masks 0..7.
%r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
61+
%r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
62+
; Second concatenation level: mask 0..15 places %r01 in the low eight lanes
; and %r23 in the high eight lanes.
%res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
63+
ret <16 x float> %res
64+
}
65+
66+
; concat_sqrt_v16f32_v8f32: apply llvm.sqrt to two <8 x float> inputs and
; concatenate the two results into one <16 x float>.
; In the expected output below, the SSE checks use four xmm sqrtps and the
; AVX1OR2 checks use two ymm vsqrtps, neither followed by a merge step. The
; AVX512 checks use two ymm vsqrtps and then combine the results into zmm0
; with vinsertf64x4.
; NOTE(review): presumably the desired AVX512 lowering is a single 512-bit
; vsqrtps on the concatenated inputs - confirm before regenerating.
define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
67+
; SSE-LABEL: concat_sqrt_v16f32_v8f32:
68+
; SSE: # %bb.0:
69+
; SSE-NEXT: sqrtps %xmm0, %xmm0
70+
; SSE-NEXT: sqrtps %xmm1, %xmm1
71+
; SSE-NEXT: sqrtps %xmm2, %xmm2
72+
; SSE-NEXT: sqrtps %xmm3, %xmm3
73+
; SSE-NEXT: retq
74+
;
75+
; AVX1OR2-LABEL: concat_sqrt_v16f32_v8f32:
76+
; AVX1OR2: # %bb.0:
77+
; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0
78+
; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1
79+
; AVX1OR2-NEXT: retq
80+
;
81+
; AVX512-LABEL: concat_sqrt_v16f32_v8f32:
82+
; AVX512: # %bb.0:
83+
; AVX512-NEXT: vsqrtps %ymm0, %ymm0
84+
; AVX512-NEXT: vsqrtps %ymm1, %ymm1
85+
; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
86+
; AVX512-NEXT: retq
87+
; One independent sqrt call per 256-bit input.
%v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
88+
%v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1)
89+
; Mask 0..15 takes every lane of %v0 followed by every lane of %v1, i.e. a
; plain concatenation with no lane reordering.
%res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
90+
ret <16 x float> %res
91+
}

0 commit comments

Comments
 (0)