Skip to content

Commit 5b37d89

Browse files
committed
Merging r359899:
------------------------------------------------------------------------
r359899 | arsenm | 2019-05-03 08:37:07 -0700 (Fri, 03 May 2019) | 7 lines

AMDGPU: Select VOP3 form of sub

The VOP3 form should always be the preferred selection form to be shrunk later.

The r600 sub test needs to be split out because it asserts on the arguments in the new test during the calling convention lowering.

------------------------------------------------------------------------
llvm-svn: 362654
1 parent c8af241 commit 5b37d89

File tree

3 files changed

+197
-53
lines changed

3 files changed

+197
-53
lines changed

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -516,15 +516,13 @@ let AddedComplexity = 1 in {
516516

517517
let SubtargetPredicate = HasAddNoCarryInsts in {
518518
def : DivergentBinOp<add, V_ADD_U32_e32>;
519-
def : DivergentBinOp<sub, V_SUB_U32_e32>;
520-
def : DivergentBinOp<sub, V_SUBREV_U32_e32>;
519+
520+
def : DivergentBinOp<sub, V_SUB_U32_e64>;
521521
}
522522

523523

524524
def : DivergentBinOp<add, V_ADD_I32_e32>;
525-
526-
def : DivergentBinOp<add, V_ADD_I32_e64>;
527-
def : DivergentBinOp<sub, V_SUB_I32_e32>;
525+
def : DivergentBinOp<sub, V_SUB_I32_e64>;
528526

529527
def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
530528

llvm/test/CodeGen/AMDGPU/r600.sub.ll

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
2+
3+
declare i32 @llvm.r600.read.tidig.x() readnone
4+
5+
; FUNC-LABEL: {{^}}s_sub_i32:
6+
define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
7+
%result = sub i32 %a, %b
8+
store i32 %result, i32 addrspace(1)* %out
9+
ret void
10+
}
11+
12+
; FUNC-LABEL: {{^}}s_sub_imm_i32:
13+
define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
14+
%result = sub i32 1234, %a
15+
store i32 %result, i32 addrspace(1)* %out
16+
ret void
17+
}
18+
19+
; FUNC-LABEL: {{^}}test_sub_i32:
20+
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
21+
define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
22+
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
23+
%a = load i32, i32 addrspace(1)* %in
24+
%b = load i32, i32 addrspace(1)* %b_ptr
25+
%result = sub i32 %a, %b
26+
store i32 %result, i32 addrspace(1)* %out
27+
ret void
28+
}
29+
30+
; FUNC-LABEL: {{^}}test_sub_imm_i32:
31+
; EG: SUB_INT
32+
define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
33+
%a = load i32, i32 addrspace(1)* %in
34+
%result = sub i32 123, %a
35+
store i32 %result, i32 addrspace(1)* %out
36+
ret void
37+
}
38+
39+
; FUNC-LABEL: {{^}}test_sub_v2i32:
40+
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
41+
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
42+
define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
43+
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
44+
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
45+
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
46+
%result = sub <2 x i32> %a, %b
47+
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
48+
ret void
49+
}
50+
51+
; FUNC-LABEL: {{^}}test_sub_v4i32:
52+
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
53+
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
54+
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
55+
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
56+
define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
57+
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
58+
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
59+
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
60+
%result = sub <4 x i32> %a, %b
61+
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
62+
ret void
63+
}
64+
65+
; FUNC-LABEL: {{^}}test_sub_i16:
66+
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
67+
%tid = call i32 @llvm.r600.read.tidig.x()
68+
%gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
69+
%b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i32 1
70+
%a = load volatile i16, i16 addrspace(1)* %gep
71+
%b = load volatile i16, i16 addrspace(1)* %b_ptr
72+
%result = sub i16 %a, %b
73+
store i16 %result, i16 addrspace(1)* %out
74+
ret void
75+
}
76+
77+
; FUNC-LABEL: {{^}}test_sub_v2i16:
78+
define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
79+
%tid = call i32 @llvm.r600.read.tidig.x()
80+
%gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
81+
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
82+
%a = load <2 x i16>, <2 x i16> addrspace(1)* %gep
83+
%b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
84+
%result = sub <2 x i16> %a, %b
85+
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
86+
ret void
87+
}
88+
89+
; FUNC-LABEL: {{^}}test_sub_v4i16:
90+
define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
91+
%tid = call i32 @llvm.r600.read.tidig.x()
92+
%gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
93+
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
94+
%a = load <4 x i16>, <4 x i16> addrspace(1) * %gep
95+
%b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
96+
%result = sub <4 x i16> %a, %b
97+
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
98+
ret void
99+
}
100+
101+
; FUNC-LABEL: {{^}}s_sub_i64:
102+
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
103+
; EG-DAG: SUB_INT {{[* ]*}}
104+
; EG-DAG: SUBB_UINT
105+
; EG-DAG: SUB_INT
106+
; EG-DAG: SUB_INT {{[* ]*}}
107+
define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
108+
%result = sub i64 %a, %b
109+
store i64 %result, i64 addrspace(1)* %out, align 8
110+
ret void
111+
}
112+
113+
; FUNC-LABEL: {{^}}v_sub_i64:
114+
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
115+
; EG-DAG: SUB_INT {{[* ]*}}
116+
; EG-DAG: SUBB_UINT
117+
; EG-DAG: SUB_INT
118+
; EG-DAG: SUB_INT {{[* ]*}}
119+
define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
120+
%tid = call i32 @llvm.r600.read.tidig.x() readnone
121+
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
122+
%b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
123+
%a = load i64, i64 addrspace(1)* %a_ptr
124+
%b = load i64, i64 addrspace(1)* %b_ptr
125+
%result = sub i64 %a, %b
126+
store i64 %result, i64 addrspace(1)* %out, align 8
127+
ret void
128+
}
129+
130+
; FUNC-LABEL: {{^}}v_test_sub_v2i64:
131+
define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
132+
%tid = call i32 @llvm.r600.read.tidig.x() readnone
133+
%a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
134+
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
135+
%a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
136+
%b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
137+
%result = sub <2 x i64> %a, %b
138+
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
139+
ret void
140+
}
141+
142+
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
143+
define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
144+
%tid = call i32 @llvm.r600.read.tidig.x() readnone
145+
%a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
146+
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
147+
%a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
148+
%b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
149+
%result = sub <4 x i64> %a, %b
150+
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
151+
ret void
152+
}

llvm/test/CodeGen/AMDGPU/sub.ll

Lines changed: 42 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
1-
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,FUNC %s
2-
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,FUNC %s
3-
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89,FUNC %s
4-
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
1+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
2+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
3+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
54

6-
declare i32 @llvm.r600.read.tidig.x() readnone
5+
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
76

8-
; FUNC-LABEL: {{^}}s_sub_i32:
7+
; GCN-LABEL: {{^}}s_sub_i32:
98
; GCN: s_load_dwordx2
109
; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
1110
; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]]
@@ -15,7 +14,7 @@ define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
1514
ret void
1615
}
1716

18-
; FUNC-LABEL: {{^}}s_sub_imm_i32:
17+
; GCN-LABEL: {{^}}s_sub_imm_i32:
1918
; GCN: s_load_dword [[A:s[0-9]+]]
2019
; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]]
2120
define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
@@ -24,9 +23,7 @@ define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
2423
ret void
2524
}
2625

27-
; FUNC-LABEL: {{^}}test_sub_i32:
28-
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
29-
26+
; GCN-LABEL: {{^}}test_sub_i32:
3027
; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
3128
; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
3229
define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -38,9 +35,7 @@ define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)
3835
ret void
3936
}
4037

41-
; FUNC-LABEL: {{^}}test_sub_imm_i32:
42-
; EG: SUB_INT
43-
38+
; GCN-LABEL: {{^}}test_sub_imm_i32:
4439
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}}
4540
; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
4641
define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -50,10 +45,7 @@ define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspac
5045
ret void
5146
}
5247

53-
; FUNC-LABEL: {{^}}test_sub_v2i32:
54-
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
55-
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
56-
48+
; GCN-LABEL: {{^}}test_sub_v2i32:
5749
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
5850
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
5951

@@ -68,12 +60,7 @@ define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
6860
ret void
6961
}
7062

71-
; FUNC-LABEL: {{^}}test_sub_v4i32:
72-
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
73-
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
74-
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
75-
; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
76-
63+
; GCN-LABEL: {{^}}test_sub_v4i32:
7764
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
7865
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
7966
; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
@@ -92,11 +79,11 @@ define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32
9279
ret void
9380
}
9481

95-
; FUNC-LABEL: {{^}}test_sub_i16:
82+
; GCN-LABEL: {{^}}test_sub_i16:
9683
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
9784
; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
9885
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
99-
%tid = call i32 @llvm.r600.read.tidig.x()
86+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
10087
%gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
10188
%b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i32 1
10289
%a = load volatile i16, i16 addrspace(1)* %gep
@@ -106,13 +93,13 @@ define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)
10693
ret void
10794
}
10895

109-
; FUNC-LABEL: {{^}}test_sub_v2i16:
96+
; GCN-LABEL: {{^}}test_sub_v2i16:
11097
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
11198
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
11299

113100
; GFX9: v_pk_sub_i16
114101
define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
115-
%tid = call i32 @llvm.r600.read.tidig.x()
102+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
116103
%gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
117104
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
118105
%a = load <2 x i16>, <2 x i16> addrspace(1)* %gep
@@ -122,7 +109,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
122109
ret void
123110
}
124111

125-
; FUNC-LABEL: {{^}}test_sub_v4i16:
112+
; GCN-LABEL: {{^}}test_sub_v4i16:
126113
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
127114
; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
128115
; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -131,7 +118,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
131118
; GFX9: v_pk_sub_i16
132119
; GFX9: v_pk_sub_i16
133120
define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
134-
%tid = call i32 @llvm.r600.read.tidig.x()
121+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
135122
%gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
136123
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
137124
%a = load <4 x i16>, <4 x i16> addrspace(1) * %gep
@@ -141,22 +128,16 @@ define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16
141128
ret void
142129
}
143130

144-
; FUNC-LABEL: {{^}}s_sub_i64:
131+
; GCN-LABEL: {{^}}s_sub_i64:
145132
; GCN: s_sub_u32
146133
; GCN: s_subb_u32
147-
148-
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
149-
; EG-DAG: SUB_INT {{[* ]*}}
150-
; EG-DAG: SUBB_UINT
151-
; EG-DAG: SUB_INT
152-
; EG-DAG: SUB_INT {{[* ]*}}
153134
define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
154135
%result = sub i64 %a, %b
155136
store i64 %result, i64 addrspace(1)* %out, align 8
156137
ret void
157138
}
158139

159-
; FUNC-LABEL: {{^}}v_sub_i64:
140+
; GCN-LABEL: {{^}}v_sub_i64:
160141
; SI: v_sub_i32_e32
161142
; SI: v_subb_u32_e32
162143

@@ -165,14 +146,8 @@ define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64
165146

166147
; GFX9: v_sub_co_u32_e32
167148
; GFX9: v_subb_co_u32_e32
168-
169-
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
170-
; EG-DAG: SUB_INT {{[* ]*}}
171-
; EG-DAG: SUBB_UINT
172-
; EG-DAG: SUB_INT
173-
; EG-DAG: SUB_INT {{[* ]*}}
174149
define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
175-
%tid = call i32 @llvm.r600.read.tidig.x() readnone
150+
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
176151
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
177152
%b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
178153
%a = load i64, i64 addrspace(1)* %a_ptr
@@ -182,7 +157,7 @@ define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspa
182157
ret void
183158
}
184159

185-
; FUNC-LABEL: {{^}}v_test_sub_v2i64:
160+
; GCN-LABEL: {{^}}v_test_sub_v2i64:
186161
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
187162
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
188163
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
@@ -198,7 +173,7 @@ define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspa
198173
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
199174
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
200175
define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
201-
%tid = call i32 @llvm.r600.read.tidig.x() readnone
176+
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
202177
%a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
203178
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
204179
%a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
@@ -208,7 +183,7 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
208183
ret void
209184
}
210185

211-
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
186+
; GCN-LABEL: {{^}}v_test_sub_v4i64:
212187
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
213188
; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
214189
; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
@@ -236,7 +211,7 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
236211
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
237212
; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
238213
define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
239-
%tid = call i32 @llvm.r600.read.tidig.x() readnone
214+
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
240215
%a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
241216
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
242217
%a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
@@ -245,3 +220,22 @@ define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
245220
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
246221
ret void
247222
}
223+
224+
; Make sure the VOP3 form of sub is initially selected. Otherwise a pair
225+
; of copies from/to VCC would be necessary.
226+
227+
; GCN-LABEL: {{^}}sub_select_vop3:
228+
; SI: v_subrev_i32_e64 v0, s[0:1], s0, v0
229+
; VI: v_subrev_u32_e64 v0, s[0:1], s0, v0
230+
; GFX9: v_subrev_u32_e32 v0, s0, v0
231+
232+
; GCN: ; def vcc
233+
; GCN: ds_write_b32
234+
; GCN: ; use vcc
235+
define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) {
236+
%vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
237+
%sub = sub i32 %v, %s
238+
store i32 %sub, i32 addrspace(3)* undef
239+
call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
240+
ret void
241+
}

0 commit comments

Comments
 (0)