Commit d95b14d

Merging r360293:
------------------------------------------------------------------------
r360293 | arsenm | 2019-05-08 15:09:57 -0700 (Wed, 08 May 2019) | 21 lines

AMDGPU: Select VOP3 form of add

The VOP3 form should always be the preferred selection, to be shrunk
later. This should only be an optimization issue, but this partially
works around a problem from clobbering VCC when SIFixSGPRCopies
rewrites an SCC-defining operation directly to VCC.

3 of the testcases are regressions from failing to fold the immediate
in cases where it should. These can be avoided by improving the VCC
liveness handling in SIFoldOperands. Simply increasing the threshold
passed to computeRegisterLiveness works, although this is common enough
that VCC liveness should probably be tracked throughout the pass. The
hack of leaving behind an implicit_def instruction to avoid breaking
the iterator wastes instruction count, which inhibits finding the VCC
def in long chains of adds. Doing this, however, exposes different,
worse-looking regressions from poor scheduling behavior. These could
probably be worked around by forcing the shrink of the addc here, but
the scheduler should probably be fixed.

The r600 add test needs to be split out because it asserts on the
arguments in the new test during calling convention lowering.
------------------------------------------------------------------------
llvm-svn: 362658
1 parent 5b37d89 commit d95b14d
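At the ISA level, the difference between the two forms is where the carry-out goes. A minimal contrast, written in the same FileCheck/assembly idiom as the tests below (the operands are taken from the add_select_vop3 test added by this commit; register numbers are otherwise illustrative):

    ; VOP2 (_e32) form: the carry-out is implicitly VCC, so every such
    ; add clobbers VCC.
    v_add_i32_e32 v0, vcc, s0, v0

    ; VOP3 (_e64) form: the carry-out is written to an explicit SGPR
    ; pair, leaving VCC untouched. A later pass (SIShrinkInstructions)
    ; can rewrite this back to the _e32 form when VCC is actually free.
    v_add_i32_e64 v0, s[0:1], s0, v0

Selecting the _e64 form first and shrinking later is strictly more flexible than selecting _e32 and having to insert copies whenever VCC is live.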

File tree

8 files changed: +202 -74 lines changed

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 2 additions & 6 deletions

@@ -515,17 +515,13 @@ let AddedComplexity = 1 in {
 }
 
 let SubtargetPredicate = HasAddNoCarryInsts in {
-  def : DivergentBinOp<add, V_ADD_U32_e32>;
-
+  def : DivergentBinOp<add, V_ADD_U32_e64>;
   def : DivergentBinOp<sub, V_SUB_U32_e64>;
 }
 
-
-def : DivergentBinOp<add, V_ADD_I32_e32>;
+def : DivergentBinOp<add, V_ADD_I32_e64>;
 def : DivergentBinOp<sub, V_SUB_I32_e64>;
 
-def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
-
 def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
 def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
 def : DivergentBinOp<shl, V_LSHLREV_B32_e32>;
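To illustrate what the retargeted patterns select, here is a sketch of a divergent add in the same llc/FileCheck style as the tests below (the RUN line and function name are illustrative, not part of this commit):

    ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s

    ; The add is divergent because one operand depends on the workitem id,
    ; so it must be selected to a VALU instruction. With this change the
    ; selector now picks the _e64 (VOP3) form first; the shrinking pass may
    ; still produce the _e32 form in the final assembly when VCC is free.
    ; CHECK-LABEL: {{^}}divergent_add:
    define amdgpu_kernel void @divergent_add(i32 addrspace(1)* %out, i32 %s) {
      %tid = call i32 @llvm.amdgcn.workitem.id.x()
      %add = add i32 %tid, %s
      store i32 %add, i32 addrspace(1)* %out
      ret void
    }

    declare i32 @llvm.amdgcn.workitem.id.x()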

llvm/test/CodeGen/AMDGPU/add.ll

Lines changed: 22 additions & 61 deletions

@@ -1,11 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}s_add_i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
 ; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]]
 ; GCN: buffer_store_dword v[[V_REG]],
@@ -19,9 +16,6 @@ define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 }
 
 ; FUNC-LABEL: {{^}}s_add_v2i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
@@ -34,11 +28,6 @@ define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> a
 }
 
 ; FUNC-LABEL: {{^}}s_add_v4i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
@@ -53,15 +42,6 @@ define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
 }
 
 ; FUNC-LABEL: {{^}}s_add_v8i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ -78,23 +58,6 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}s_add_v16i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ -124,7 +87,7 @@ entry:
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ -139,7 +102,7 @@ define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]]
 define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ -151,13 +114,6 @@ define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1
 ; FUNC-LABEL: {{^}}add64:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %add = add i64 %a, %b
@@ -172,13 +128,6 @@ entry:
 
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; GCN-NOT: v_addc_u32_e32 s
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64, i64 addrspace(1)* %in
@@ -191,13 +140,6 @@ entry:
 ; FUNC-LABEL: {{^}}add64_in_branch:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
@@ -217,7 +159,26 @@ endif:
   ret void
 }
 
-declare i32 @llvm.r600.read.tidig.x() #1
+; Make sure the VOP3 form of add is initially selected. Otherwise a pair
+; of copies from/to VCC would be necessary.
+
+; GCN-LABEL: {{^}}add_select_vop3:
+; SI: v_add_i32_e64 v0, s[0:1], s0, v0
+; VI: v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX9: v_add_u32_e32 v0, s0, v0
+
+; GCN: ; def vcc
+; GCN: ds_write_b32
+; GCN: ; use vcc
+define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
+  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
+  %sub = add i32 %v, %s
+  store i32 %sub, i32 addrspace(3)* undef
+  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
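The inline asm in the new add_select_vop3 test pins a live value in VCC across the add. If the _e32 form were selected, preserving that value would require a pair of copies around the add, roughly like this (a hand-written sketch of the avoided sequence, not actual compiler output; the scratch registers are arbitrary):

    ; def vcc                       (inline asm defines a value in VCC)
    s_mov_b64 s[2:3], vcc           ; save VCC so the VOP2 add can clobber it
    v_add_i32_e32 v0, vcc, s0, v0   ; _e32 add: carry-out forced into VCC
    s_mov_b64 vcc, s[2:3]           ; restore the value for the use below
    ; use vcc

With the _e64 form, the carry-out lands in a scratch SGPR pair instead and both copies disappear, which is what the SI and VI CHECK lines assert.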

llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll

Lines changed: 4 additions & 2 deletions

@@ -7,16 +7,18 @@ declare void @llvm.amdgcn.s.barrier() #1
 
 ; Function Attrs: nounwind
 ; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
+; SI: s_movk_i32 [[K_0X88:s[0-9]+]], 0x88
+; SI: s_movk_i32 [[K_0X100:s[0-9]+]], 0x100
 ; CHECK: BB0_1:
 ; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
 ; SI-DAG: v_add_i32_e32 [[VADDR8:v[0-9]+]], vcc, 8, [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
 ; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, [[K_0X88]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, [[K_0X100]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
 
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2
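This is one of the immediate-folding regressions the commit message mentions. Folding a 32-bit literal like 0x88 into the add requires first shrinking the selected _e64 form to _e32, since VOP3 encodings on SI cannot carry a literal constant, and the shrink in turn requires proving VCC is dead at that point; computeRegisterLiveness only scans a bounded window of instructions before giving up. The effect on the emitted code, reconstructed from the CHECK lines above (not verbatim llc output; s2 is an arbitrary register choice):

    ; Before: the literal was folded directly into the VOP2 add.
    v_add_i32_e32 v1, vcc, 0x88, v0

    ; After: the fold is missed, so the constant is materialized in an
    ; SGPR and fed to the add as a register operand instead.
    s_movk_i32 s2, 0x88
    v_add_i32_e32 v1, vcc, s2, v0

The fence-barrier.ll change below is the same phenomenon with the constant 0x888.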

llvm/test/CodeGen/AMDGPU/fence-barrier.ll

Lines changed: 2 additions & 1 deletion

@@ -54,7 +54,8 @@ define amdgpu_kernel void @test_local(i32 addrspace(1)*) {
 }
 
 ; GCN-LABEL: {{^}}test_global
-; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, 0x888, v{{[0-9]+}}
+; GCN: s_movk_i32 [[K:s[0-9]+]], 0x888
+; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
 ; GCN: flat_store_dword
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

Lines changed: 2 additions & 1 deletion

@@ -15,7 +15,8 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
 }
 
 ; VI-LABEL: {{^}}dpp_test1:
-; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: s_nop 0
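The VI-OPT/VI-NOOPT split reflects that this file is run at two optimization levels, presumably via RUN lines along these lines (reconstructed for illustration, not shown in the hunk):

    ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-OPT %s
    ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-NOOPT %s

At -O0 the shrinking optimization does not run, so the newly selected _e64 form, with its explicit SGPR-pair carry-out, survives to the final assembly; with optimization enabled it is shrunk back to _e32.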

llvm/test/CodeGen/AMDGPU/r600.add.ll

Lines changed: 167 additions & 0 deletions

@@ -0,0 +1,167 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}s_add_i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v2i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+  %result = add <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v4i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+  %result = add <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v8i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+entry:
+  %0 = add <8 x i32> %a, %b
+  store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v16i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+entry:
+  %0 = add <16 x i32> %a, %b
+  store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_i32:
+define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %b = load volatile i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_imm_i32:
+define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %result = add i32 %a, 123
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}add64:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %add = add i64 %a, %b
+  store i64 %add, i64 addrspace(1)* %out
+  ret void
+}
+
+; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they
+; use VCC. The test is designed so that %a will be stored in an SGPR and
+; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
+; to a VGPR before doing the add.
+
+; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+entry:
+  %0 = load i64, i64 addrspace(1)* %in
+  %1 = add i64 %a, %0
+  store i64 %1, i64 addrspace(1)* %out
+  ret void
+}
+
+; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }

llvm/test/CodeGen/AMDGPU/salu-to-valu.ll

Lines changed: 1 addition & 1 deletion

@@ -458,7 +458,7 @@ bb7: ; preds = %bb3
 }
 
 ; GCN-LABEL: {{^}}phi_visit_order:
-; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
+; GCN: v_add_i32_e64 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 1, v{{[0-9]+}}
 define amdgpu_kernel void @phi_visit_order() {
 bb:
   br label %bb1
