Skip to content

Commit 3f8c9b2

Browse files
committed
[AMDGPU] Fix sgpr to vreg_1 copy
1 parent ade2f10 commit 3f8c9b2

File tree

3 files changed

+147
-7
lines changed

3 files changed

+147
-7
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -946,13 +946,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
946 946

947 947
// Copies and REG_SEQUENCE do not contribute to the final assembly
948 948
// So, skip them but take care of the SGPR to VGPR copies bookkeeping.
949-
if (Inst->isCopy() || Inst->isRegSequence()) {
950-
if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
951-
if (!Inst->isCopy() ||
952-
!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
953-
Info.NumSVCopies++;
954-
continue;
955-
}
949+
if (Inst->isRegSequence() &&
950+
TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
951+
Info.NumSVCopies++;
952+
continue;
953+
}
954+
if (Inst->isCopy()) {
955+
const TargetRegisterClass *SrcRC, *DstRC;
956+
std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
957+
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) &&
958+
!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
959+
Info.NumSVCopies++;
960+
continue;
956 961
}
957 962
}
958 963

Lines changed: 104 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,104 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s
3+
4+
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
5+
target triple = "amdgcn-amd-amdhsa"
6+
7+
define amdgpu_kernel void @copy_to_vreg_1(i32 %0) {
8+
; GCN-LABEL: copy_to_vreg_1:
9+
; GCN: ; %bb.0: ; %._crit_edge
10+
; GCN-NEXT: s_load_dword s4, s[4:5], 0x24
11+
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
12+
; GCN-NEXT: v_mov_b64_e32 v[2:3], 0
13+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
14+
; GCN-NEXT: s_sub_i32 s5, 1, s4
15+
; GCN-NEXT: s_cmp_lt_u32 s4, 2
16+
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
17+
; GCN-NEXT: s_and_b64 s[2:3], s[0:1], exec
18+
; GCN-NEXT: s_cselect_b32 s3, s5, 1
19+
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
20+
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
21+
; GCN-NEXT: s_addc_u32 s0, 1, 0
22+
; GCN-NEXT: v_readfirstlane_b32 s2, v1
23+
; GCN-NEXT: s_cmp_ge_u32 s3, s4
24+
; GCN-NEXT: s_cselect_b32 s4, s0, s2
25+
; GCN-NEXT: v_mov_b32_e32 v1, 0
26+
; GCN-NEXT: s_cmp_lg_u64 0, 0
27+
; GCN-NEXT: s_mov_b64 s[0:1], 0
28+
; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
29+
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
30+
; GCN-NEXT: s_branch .LBB0_3
31+
; GCN-NEXT: .LBB0_1: ; %Flow
32+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
33+
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
34+
; GCN-NEXT: s_xor_b64 s[8:9], exec, -1
35+
; GCN-NEXT: .LBB0_2: ; %Flow2
36+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
37+
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
38+
; GCN-NEXT: s_and_b64 s[4:5], exec, s[8:9]
39+
; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
40+
; GCN-NEXT: s_mov_b32 s4, 0
41+
; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
42+
; GCN-NEXT: s_cbranch_execz .LBB0_8
43+
; GCN-NEXT: .LBB0_3: ; %.lr.ph27
44+
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
45+
; GCN-NEXT: s_cmp_lg_u32 s4, 0
46+
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
47+
; GCN-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
48+
; GCN-NEXT: s_xor_b64 s[6:7], s[8:9], -1
49+
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[8:9]
50+
; GCN-NEXT: s_cbranch_execz .LBB0_5
51+
; GCN-NEXT: ; %bb.4: ; %pred.store.if
52+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
53+
; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
54+
; GCN-NEXT: global_store_byte v[2:3], v1, off
55+
; GCN-NEXT: .LBB0_5: ; %Flow1
56+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
57+
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
58+
; GCN-NEXT: s_mov_b64 s[8:9], -1
59+
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
60+
; GCN-NEXT: s_cbranch_execz .LBB0_2
61+
; GCN-NEXT: ; %bb.6: ; %pred.store.continue
62+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
63+
; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[2:3]
64+
; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
65+
; GCN-NEXT: s_cbranch_execz .LBB0_1
66+
; GCN-NEXT: ; %bb.7: ; %pred.store.if41
67+
; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1
68+
; GCN-NEXT: global_store_byte v[2:3], v1, off
69+
; GCN-NEXT: s_branch .LBB0_1
70+
; GCN-NEXT: .LBB0_8: ; %DummyReturnBlock
71+
; GCN-NEXT: s_endpgm
72+
._crit_edge:
73+
%1 = tail call i32 @llvm.amdgcn.workitem.id.x()
74+
%2 = udiv i32 1, %0
75+
br label %.lr.ph27
76+
77+
.lr.ph27: ; preds = %pred.store.if41, %pred.store.continue, %._crit_edge
78+
%3 = phi i32 [ %2, %._crit_edge ], [ 0, %pred.store.if41 ], [ 0, %pred.store.continue ]
79+
%4 = icmp ugt i32 %3, 0
80+
%broadcast.splatinsert37 = insertelement <4 x i1> zeroinitializer, i1 %4, i64 0
81+
%.zext = zext i32 %1 to i64
82+
%broadcast.splatinsert39 = insertelement <4 x i64> zeroinitializer, i64 %.zext, i64 0
83+
%5 = icmp uge <4 x i64> %broadcast.splatinsert39, splat (i64 1)
84+
%6 = or <4 x i1> %5, %broadcast.splatinsert37
85+
%7 = extractelement <4 x i1> %6, i64 0
86+
br i1 %7, label %pred.store.if, label %pred.store.continue
87+
88+
pred.store.if: ; preds = %.lr.ph27
89+
store i8 0, ptr addrspace(1) null, align 64
90+
br label %pred.store.continue
91+
92+
pred.store.continue: ; preds = %pred.store.if, %.lr.ph27
93+
%8 = extractelement <4 x i1> %6, i64 1
94+
br i1 %8, label %pred.store.if41, label %.lr.ph27
95+
96+
pred.store.if41: ; preds = %pred.store.continue
97+
store i8 0, ptr addrspace(1) null, align 64
98+
br label %.lr.ph27
99+
}
100+
101+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
102+
declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #0
103+
104+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
2+
3+
---
4+
name: copy_to_vreg_1
5+
tracksRegLiveness: true
6+
body: |
7+
; GCN-LABEL: name: copy_to_vreg_1
8+
; GCN: bb.0:
9+
; GCN-NEXT: successors: %bb.1(0x80000000)
10+
; GCN-NEXT: liveins: $vgpr0, $vgpr1
11+
; GCN-NEXT: {{ $}}
12+
; GCN-NEXT: [[V_CVT_U32_F32_e64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
13+
; GCN-NEXT: [[IMPLICIT_DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
14+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
15+
; GCN-NEXT: [[V_CMP_GT_U32_e64:%[0-9]+]]:sreg_64_xexec = samesign V_CMP_GT_U32_e64 [[V_CVT_U32_F32_e64]], killed [[COPY1]], implicit $exec
16+
; GCN-NEXT: [[VREG1:%[0-9]+]]:vreg_1 = COPY [[V_CMP_GT_U32_e64]]
17+
; GCN-NEXT: {{ $}}
18+
; GCN-NEXT: bb.1:
19+
; GCN-NEXT: S_ENDPGM 0
20+
bb.0:
21+
liveins: $vgpr0, $vgpr1
22+
%0:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
23+
%1:sreg_32 = COPY %0:vgpr_32
24+
%2:sreg_32 = COPY $vgpr1
25+
samesign S_CMP_GT_U32 %1:sreg_32, killed %2:sreg_32, implicit-def $scc
26+
%3:sreg_64 = COPY $scc
27+
%4:vreg_1 = COPY %3:sreg_64
28+
29+
bb.1:
30+
S_ENDPGM 0
31+
...

0 commit comments

Comments
 (0)