Skip to content

Commit e726dbc

Browse files
RKSimon authored and augusto2112 committed
[X86] Add tests showing failure to concat RCPPS + RSQRTPS intrinsics together. (llvm#170098)
Concatenation can only be done for the 128->256-bit cases, as we can't safely convert to the RCP14/RSQRT14 variants that wider vectors would require.
1 parent 71221e7 commit e726dbc

File tree

2 files changed

+130
-0
lines changed

2 files changed

+130
-0
lines changed
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
6+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
7+
8+
; Applies the llvm.x86.sse.rcp.ps intrinsic to two <4 x float> inputs and joins
; the two results into a single <8 x float>.
; The expected output below keeps one 128-bit (v)rcpps per input; on the AVX
; run lines the two halves are merged afterwards with vinsertf128.
define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
9+
; SSE-LABEL: concat_rcp_v8f32_v4f32:
10+
; SSE: # %bb.0:
11+
; SSE-NEXT: rcpps %xmm0, %xmm0
12+
; SSE-NEXT: rcpps %xmm1, %xmm1
13+
; SSE-NEXT: retq
14+
;
15+
; AVX-LABEL: concat_rcp_v8f32_v4f32:
16+
; AVX: # %bb.0:
17+
; AVX-NEXT: vrcpps %xmm0, %xmm0
18+
; AVX-NEXT: vrcpps %xmm1, %xmm1
19+
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
20+
; AVX-NEXT: retq
; Each 128-bit input goes through its own intrinsic call.
21+
%v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
22+
%v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1)
; Identity-order mask 0..7: %v0 supplies lanes 0-3, %v1 supplies lanes 4-7.
23+
%res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
24+
ret <8 x float> %res
25+
}
26+
27+
; Ensure we don't convert rcpps to rcp14ps
28+
; Applies the llvm.x86.sse.rcp.ps intrinsic to four <4 x float> inputs and joins
; the results into a single <16 x float> through two levels of shufflevector.
; The expected output keeps four 128-bit (v)rcpps instructions for every run
; line; only the vinsert* merges differ between the AVX1OR2 and AVX512 checks
; (two ymm results versus one zmm result).
define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
29+
; SSE-LABEL: concat_rcp_v16f32_v4f32:
30+
; SSE: # %bb.0:
31+
; SSE-NEXT: rcpps %xmm0, %xmm0
32+
; SSE-NEXT: rcpps %xmm1, %xmm1
33+
; SSE-NEXT: rcpps %xmm2, %xmm2
34+
; SSE-NEXT: rcpps %xmm3, %xmm3
35+
; SSE-NEXT: retq
36+
;
37+
; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32:
38+
; AVX1OR2: # %bb.0:
39+
; AVX1OR2-NEXT: vrcpps %xmm0, %xmm0
40+
; AVX1OR2-NEXT: vrcpps %xmm1, %xmm1
41+
; AVX1OR2-NEXT: vrcpps %xmm2, %xmm2
42+
; AVX1OR2-NEXT: vrcpps %xmm3, %xmm3
43+
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
44+
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
45+
; AVX1OR2-NEXT: retq
46+
;
47+
; AVX512-LABEL: concat_rcp_v16f32_v4f32:
48+
; AVX512: # %bb.0:
49+
; AVX512-NEXT: vrcpps %xmm0, %xmm0
50+
; AVX512-NEXT: vrcpps %xmm1, %xmm1
51+
; AVX512-NEXT: vrcpps %xmm2, %xmm2
52+
; AVX512-NEXT: vrcpps %xmm3, %xmm3
53+
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
54+
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
55+
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
56+
; AVX512-NEXT: retq
; Each 128-bit input goes through its own intrinsic call.
57+
%v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
58+
%v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1)
59+
%v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2)
60+
%v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3)
; First level: pair the results into two <8 x float> halves (identity-order masks).
61+
%r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
62+
%r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Second level: %r01 supplies lanes 0-7, %r23 supplies lanes 8-15.
63+
%res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
64+
ret <16 x float> %res
65+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
6+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
7+
8+
; Applies the llvm.x86.sse.rsqrt.ps intrinsic to two <4 x float> inputs and
; joins the two results into a single <8 x float>.
; The expected output below keeps one 128-bit (v)rsqrtps per input; on the AVX
; run lines the two halves are merged afterwards with vinsertf128.
define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
9+
; SSE-LABEL: concat_rsqrt_v8f32_v4f32:
10+
; SSE: # %bb.0:
11+
; SSE-NEXT: rsqrtps %xmm0, %xmm0
12+
; SSE-NEXT: rsqrtps %xmm1, %xmm1
13+
; SSE-NEXT: retq
14+
;
15+
; AVX-LABEL: concat_rsqrt_v8f32_v4f32:
16+
; AVX: # %bb.0:
17+
; AVX-NEXT: vrsqrtps %xmm0, %xmm0
18+
; AVX-NEXT: vrsqrtps %xmm1, %xmm1
19+
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
20+
; AVX-NEXT: retq
; Each 128-bit input goes through its own intrinsic call.
21+
%v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
22+
%v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1)
; Identity-order mask 0..7: %v0 supplies lanes 0-3, %v1 supplies lanes 4-7.
23+
%res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
24+
ret <8 x float> %res
25+
}
26+
27+
; Ensure we don't convert rsqrtps to rsqrt14ps
28+
; Applies the llvm.x86.sse.rsqrt.ps intrinsic to four <4 x float> inputs and
; joins the results into a single <16 x float> through two levels of
; shufflevector.
; The expected output keeps four 128-bit (v)rsqrtps instructions for every run
; line; only the vinsert* merges differ between the AVX1OR2 and AVX512 checks
; (two ymm results versus one zmm result).
define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
29+
; SSE-LABEL: concat_rsqrt_v16f32_v4f32:
30+
; SSE: # %bb.0:
31+
; SSE-NEXT: rsqrtps %xmm0, %xmm0
32+
; SSE-NEXT: rsqrtps %xmm1, %xmm1
33+
; SSE-NEXT: rsqrtps %xmm2, %xmm2
34+
; SSE-NEXT: rsqrtps %xmm3, %xmm3
35+
; SSE-NEXT: retq
36+
;
37+
; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32:
38+
; AVX1OR2: # %bb.0:
39+
; AVX1OR2-NEXT: vrsqrtps %xmm0, %xmm0
40+
; AVX1OR2-NEXT: vrsqrtps %xmm1, %xmm1
41+
; AVX1OR2-NEXT: vrsqrtps %xmm2, %xmm2
42+
; AVX1OR2-NEXT: vrsqrtps %xmm3, %xmm3
43+
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
44+
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
45+
; AVX1OR2-NEXT: retq
46+
;
47+
; AVX512-LABEL: concat_rsqrt_v16f32_v4f32:
48+
; AVX512: # %bb.0:
49+
; AVX512-NEXT: vrsqrtps %xmm0, %xmm0
50+
; AVX512-NEXT: vrsqrtps %xmm1, %xmm1
51+
; AVX512-NEXT: vrsqrtps %xmm2, %xmm2
52+
; AVX512-NEXT: vrsqrtps %xmm3, %xmm3
53+
; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
54+
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
55+
; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
56+
; AVX512-NEXT: retq
; Each 128-bit input goes through its own intrinsic call.
57+
%v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
58+
%v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1)
59+
%v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2)
60+
%v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3)
; First level: pair the results into two <8 x float> halves (identity-order masks).
61+
%r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
62+
%r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Second level: %r01 supplies lanes 0-7, %r23 supplies lanes 8-15.
63+
%res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
64+
ret <16 x float> %res
65+
}

0 commit comments

Comments
 (0)