Commit 5485f24
[AMDGPU] Illegal VGPR to SGPR copy
This patch resolves an instance of an illegal VGPR to SGPR copy: when the source register is a VGPR and the destination register is an SGPR, data cannot be copied directly, so `copyPhysReg()` now emits `AMDGPU::V_READFIRSTLANE_B32` via `BuildMI()` instead of reporting the copy as illegal. Fixes SWDEV-530052.
1 parent 74a102f commit 5485f24
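
For context, the error this patch removes typically appeared when a divergent (per-lane) value was passed to an `inreg` argument. A minimal reproducer in the spirit of the `tail_call_i32_inreg_divergent` test updated below (the function names come from that test, not a new example):

; %vgpr arrives in a VGPR, but the inreg parameter must be materialized in an
; SGPR; copyPhysReg() now lowers that copy with v_readfirstlane_b32 instead of
; emitting the "illegal VGPR to SGPR copy" diagnostic.
declare hidden void @void_func_i32_inreg(i32 inreg)

define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
  tail call void @void_func_i32_inreg(i32 inreg %vgpr)
  ret void
}

As the updated CHECK lines show, the copy becomes `v_readfirstlane_b32 s0, v0`, which reads the value from the first active lane and is only correct when the value is actually uniform across the wave; the FIXME about executing such calls in a waterfall loop is kept in the tests.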

7 files changed: 218 additions & 79 deletions

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 8 additions & 1 deletion
@@ -875,7 +875,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   }
 
   if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
-    reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+    // We invoke BuildMI() only when we have verified that the source register
+    // is a VGPR and the destination register is a SGPR, and since we cannot
+    // transfer data directly from VGPR to SGPR, we use
+    // AMDGPU::V_READFIRSTLANE_B32
+    assert(AMDGPU::SReg_32RegClass.contains(DestReg));
+    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, this->get(AMDGPU::V_READFIRSTLANE_B32), DestReg)
+        .addReg(SrcReg);
     return;
   }

llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll renamed to llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill.ll

Lines changed: 11 additions & 3 deletions
@@ -1,6 +1,4 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
-
-; CHECK: illegal VGPR to SGPR copy
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -enable-var-scope %s
 
 declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
@@ -25,3 +23,13 @@ attributes #0 = { nounwind }
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
+; CHECK: v_readlane_b32
+; CHECK: s_mov_b32
+; CHECK: v_writelane_b32
+; CHECK: s_swappc_b64
+; CHECK: s_or_saveexec_b64
+; CHECK: buffer_load_dword
+; CHECK: s_waitcnt
+; CHECK: s_addk_i32
+; CHECK: v_readfirstlane_b32
+; CHECK: s_mov_b64

llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll

Lines changed: 3 additions & 7 deletions
@@ -1,8 +1,6 @@
 ; RUN: not llc -mtriple=amdgcn -verify-machineinstrs=0 < %s 2>&1 | FileCheck -check-prefix=ERR %s
 ; RUN: not llc -mtriple=amdgcn -verify-machineinstrs=0 < %s 2>&1 | FileCheck -check-prefix=GCN %s
 
-; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy
-; GCN: ; illegal copy v1 to s9
 
 define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_i32() #0 {
   %vgpr = call i32 asm sideeffect "; def $0", "=${v1}"()
@@ -42,9 +40,7 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 {
   ret void
 }
 
-; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy
-; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1
-; GCN: ; illegal copy [[COPY1]] to s9
+
 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
   %agpr = call i32 asm sideeffect "; def $0", "=${a1}"()
   call void asm sideeffect "; use $0", "${s9}"(i32 %agpr)
@@ -54,12 +50,12 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
 ; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal VGPR to SGPR copy
 ; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0
 ; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1
-; GCN: ; illegal copy v[[[COPY1L]]:[[COPY1H]]] to s[10:11]
+; GCN: ; illegal copy v[0:1] to s[10:11]
 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 {
   %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"()
   call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr)
   ret void
 }
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind "target-cpu"="gfx908" }
+attributes #1 = { nounwind "target-cpu"="gfx908" }

llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll

Lines changed: 98 additions & 5 deletions
@@ -1,14 +1,107 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s
-; RUN: FileCheck -check-prefix=ERR %s < %t.err
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s
 
 ; FIXME: This error will be fixed by supporting arbitrary divergent
 ; dynamic allocas by performing a wave umax of the size.
 
-; ERR: error: <unknown>:0:0: in function move_to_valu_assert_srd_is_physreg_swdev503538 i32 (ptr addrspace(1)): illegal VGPR to SGPR copy
-
-; CHECK: ; illegal copy v0 to s32
 
 define i32 @move_to_valu_assert_srd_is_physreg_swdev503538(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: move_to_valu_assert_srd_is_physreg_swdev503538:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s7, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: s_add_i32 s32, s32, 0x400
+; CHECK-NEXT: v_mov_b32_e32 v2, v1
+; CHECK-NEXT: ; implicit-def: $sgpr4
+; CHECK-NEXT: ; implicit-def: $sgpr4
+; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr2 killed $exec
+; CHECK-NEXT: ; implicit-def: $sgpr4_sgpr5
+; CHECK-NEXT: v_mov_b32_e32 v0, s32
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse
+; CHECK-NEXT: v_readfirstlane_b32 s32, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v0 ; Reload Reuse
+; CHECK-NEXT: ; implicit-def: $sgpr4
+; CHECK-NEXT: s_mov_b64 s[4:5], exec
+; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
+; CHECK-NEXT: v_writelane_b32 v3, s4, 0
+; CHECK-NEXT: v_writelane_b32 v3, s5, 1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse
+; CHECK-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-NEXT: v_writelane_b32 v3, s4, 2
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: v_writelane_b32 v3, s4, 3
+; CHECK-NEXT: v_writelane_b32 v3, s5, 4
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: v_readlane_b32 s4, v3, 3
+; CHECK-NEXT: v_readlane_b32 s5, v3, 4
+; CHECK-NEXT: v_readlane_b32 s6, v3, 2
+; CHECK-NEXT: s_nop 4
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s6
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v0 ; Reload Reuse
+; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB0_1
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: v_readlane_b32 s4, v3, 0
+; CHECK-NEXT: v_readlane_b32 s5, v3, 1
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_writelane_b32 v3, s4, 5
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: .LBB0_4: ; %loadstoreloop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: v_readlane_b32 s4, v3, 5
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a1 ; Reload Reuse
+; CHECK-NEXT: v_add_u32_e64 v1, v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_store_byte v0, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_mov_b32 s5, 1
+; CHECK-NEXT: s_add_i32 s4, s4, s5
+; CHECK-NEXT: s_mov_b32 s5, 0x800
+; CHECK-NEXT: s_cmp_lt_u32 s4, s5
+; CHECK-NEXT: v_writelane_b32 v3, s4, 5
+; CHECK-NEXT: s_mov_b64 s[10:11], exec
+; CHECK-NEXT: s_mov_b64 exec, -1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v3 ; Reload Reuse
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_4
+; CHECK-NEXT: ; %bb.5: ; %Flow
+; CHECK-NEXT: ; %bb.6: ; %split
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a3 ; Reload Reuse
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: s_mov_b32 s33, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
 entry:
   %idx = load i32, ptr addrspace(1) %ptr, align 4
   %zero = extractelement <4 x i32> zeroinitializer, i32 %idx
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+declare void @test_buffer_load_sgpr_plus_imm_offset_noflags(i32 inreg)
+
+define void @test_load_zext(i32 %foo) {
+; CHECK-LABEL: test_load_zext:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s0, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
+; CHECK-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[2:3]
+; CHECK-NEXT: s_add_i32 s32, s32, 16
+; CHECK-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-NEXT: s_getpc_b64 s[0:1]
+; CHECK-NEXT: s_add_u32 s0, s0, test_buffer_load_sgpr_plus_imm_offset_noflags@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s1, s1, test_buffer_load_sgpr_plus_imm_offset_noflags@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s0, v40, 2
+; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
+; CHECK-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[2:3]
+; CHECK-NEXT: s_mov_b32 s33, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  call void @test_buffer_load_sgpr_plus_imm_offset_noflags(i32 %foo)
+  ret void
+}
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define void @test_load_zext(<4 x i32> %LGV) {
+; CHECK-LABEL: test_load_zext:
+; CHECK: ; %bb.0: ; %.entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s0, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
+; CHECK-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[2:3]
+; CHECK-NEXT: s_add_i32 s32, s32, 16
+; CHECK-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-NEXT: s_getpc_b64 s[0:1]
+; CHECK-NEXT: s_add_u32 s0, s0, test_buffer_load_sgpr_plus_imm_offset_noflags@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s1, s1, test_buffer_load_sgpr_plus_imm_offset_noflags@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: v_readfirstlane_b32 s2, v2
+; CHECK-NEXT: v_readfirstlane_b32 s3, v3
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s0, v40, 2
+; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
+; CHECK-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[2:3]
+; CHECK-NEXT: s_mov_b32 s33, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+.entry:
+  call void @test_buffer_load_sgpr_plus_imm_offset_noflags(<4 x i32> %LGV)
+  ret void
+}
+
+declare void @test_buffer_load_sgpr_plus_imm_offset_noflags(<4 x i32> inreg)
Lines changed: 18 additions & 63 deletions
@@ -1,78 +1,33 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 2> %t.err < %s | FileCheck %s
-; RUN: FileCheck -check-prefix=ERR %s < %t.err
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck %s
 ; FIXME: These tests cannot be tail called, and should be executed in a waterfall loop.
 
 declare hidden void @void_func_i32_inreg(i32 inreg)
-
-; ERR: error: <unknown>:0:0: in function tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy
-; ERR: error: <unknown>:0:0: in function indirect_tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy
-
 define void @tail_call_i32_inreg_divergent(i32 %vgpr) {
-; CHECK-LABEL: tail_call_i32_inreg_divergent:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s16, s33
-; CHECK-NEXT: s_mov_b32 s33, s32
-; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: s_mov_b64 exec, s[18:19]
-; CHECK-NEXT: v_writelane_b32 v40, s16, 2
-; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v40, s30, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12
-; CHECK-NEXT: ; illegal copy v0 to s0
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
-; CHECK-NEXT: v_readlane_b32 s30, v40, 0
-; CHECK-NEXT: s_mov_b32 s32, s33
-; CHECK-NEXT: v_readlane_b32 s4, v40, 2
-; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT: s_mov_b64 exec, s[6:7]
-; CHECK-NEXT: s_mov_b32 s33, s4
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
   tail call void @void_func_i32_inreg(i32 inreg %vgpr)
   ret void
 }
 
 @constant = external hidden addrspace(4) constant ptr
 
 define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) {
-; CHECK-LABEL: indirect_tail_call_i32_inreg_divergent:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s16, s33
-; CHECK-NEXT: s_mov_b32 s33, s32
-; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT: s_mov_b64 exec, s[18:19]
-; CHECK-NEXT: s_addk_i32 s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v40, s16, 2
-; CHECK-NEXT: s_getpc_b64 s[16:17]
-; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4
-; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12
-; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; CHECK-NEXT: v_writelane_b32 v40, s30, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
-; CHECK-NEXT: ; illegal copy v0 to s0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_readlane_b32 s31, v40, 1
-; CHECK-NEXT: v_readlane_b32 s30, v40, 0
-; CHECK-NEXT: s_mov_b32 s32, s33
-; CHECK-NEXT: v_readlane_b32 s4, v40, 2
-; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT: s_mov_b64 exec, s[6:7]
-; CHECK-NEXT: s_mov_b32 s33, s4
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
   %fptr = load ptr, ptr addrspace(4) @constant, align 8
   tail call void %fptr(i32 inreg %vgpr)
   ret void
 }
+;CHECK: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+;CHECK: s_mov_b64 exec, s[18:19]
+;CHECK: v_writelane_b32 v40, s16, 2
+;CHECK: s_addk_i32 s32, 0x400
+;CHECK: v_writelane_b32 v40, s30, 0
+;CHECK: s_getpc_b64 s[16:17]
+;CHECK: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4
+;CHECK: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12
+;CHECK: v_readfirstlane_b32 s0, v0
+;CHECK: v_writelane_b32 v40, s31, 1
+;CHECK: s_swappc_b64 s[30:31], s[16:17]
+;CHECK: v_readlane_b32 s31, v40, 1
+;CHECK: v_readlane_b32 s30, v40, 0
+;CHECK: s_mov_b32 s32, s33
+;CHECK: v_readlane_b32 s4, v40, 2
+;CHECK: s_or_saveexec_b64 s[6:7], -1
+;CHECK: buffer_load_dword v40, off, s[0:3], s33
