Skip to content

Commit ba5cde7

Browse files
authored
[AMDGPU][GlobalISel] Fix issue with copy_scc_vcc on gfx7 (#165355)
When selecting G_AMDGPU_COPY_SCC_VCC, we use S_CMP_LG_U64 or S_CMP_LG_U32 for wave64 and wave32, respectively. However, gfx7 does not have the S_CMP_LG_U64 instruction. Work around this by using S_OR_B64 of the source register with itself instead, which sets SCC as a side effect.
1 parent 0030fac commit ba5cde7

File tree

3 files changed

+118
-5
lines changed

3 files changed

+118
-5
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,12 +221,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
221221
bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
222222
const DebugLoc &DL = I.getDebugLoc();
223223
MachineBasicBlock *BB = I.getParent();
224+
Register VCCReg = I.getOperand(1).getReg();
225+
MachineInstr *Cmp;
226+
227+
if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
228+
unsigned CmpOpc =
229+
STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
230+
Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
231+
} else {
232+
// For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64
233+
// which sets SCC as a side effect.
234+
Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
235+
Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
236+
.addReg(VCCReg)
237+
.addReg(VCCReg);
238+
}
224239

225-
unsigned CmpOpc =
226-
STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
227-
MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
228-
.addReg(I.getOperand(1).getReg())
229-
.addImm(0);
230240
if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
231241
return false;
232242

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
5+
6+
define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) {
7+
; GFX7-LABEL: fcmp_uniform_select:
8+
; GFX7: ; %bb.0:
9+
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9
10+
; GFX7-NEXT: s_load_dword s3, s[4:5], 0xb
11+
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
12+
; GFX7-NEXT: s_mov_b32 s2, -1
13+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
14+
; GFX7-NEXT: v_cmp_eq_f32_e64 s[4:5], s6, 0
15+
; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
16+
; GFX7-NEXT: s_cselect_b32 s4, 1, 0
17+
; GFX7-NEXT: s_and_b32 s4, s4, 1
18+
; GFX7-NEXT: s_cmp_lg_u32 s4, 0
19+
; GFX7-NEXT: s_cselect_b32 s3, s7, s3
20+
; GFX7-NEXT: v_mov_b32_e32 v0, s3
21+
; GFX7-NEXT: s_mov_b32 s3, 0xf000
22+
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
23+
; GFX7-NEXT: s_endpgm
24+
;
25+
; GFX8-LABEL: fcmp_uniform_select:
26+
; GFX8: ; %bb.0:
27+
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
28+
; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
29+
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
30+
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
31+
; GFX8-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0
32+
; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
33+
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
34+
; GFX8-NEXT: s_and_b32 s0, s0, 1
35+
; GFX8-NEXT: s_cmp_lg_u32 s0, 0
36+
; GFX8-NEXT: s_cselect_b32 s0, s1, s6
37+
; GFX8-NEXT: v_mov_b32_e32 v0, s2
38+
; GFX8-NEXT: v_mov_b32_e32 v2, s0
39+
; GFX8-NEXT: v_mov_b32_e32 v1, s3
40+
; GFX8-NEXT: flat_store_dword v[0:1], v2
41+
; GFX8-NEXT: s_endpgm
42+
;
43+
; GFX11-LABEL: fcmp_uniform_select:
44+
; GFX11: ; %bb.0:
45+
; GFX11-NEXT: s_clause 0x2
46+
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
47+
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
48+
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
49+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
50+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
51+
; GFX11-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
52+
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
53+
; GFX11-NEXT: s_cselect_b32 s0, 1, 0
54+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
55+
; GFX11-NEXT: s_and_b32 s0, s0, 1
56+
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
57+
; GFX11-NEXT: s_cselect_b32 s0, s1, s6
58+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
59+
; GFX11-NEXT: v_mov_b32_e32 v0, s0
60+
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
61+
; GFX11-NEXT: s_endpgm
62+
%cmp = fcmp oeq float %a, 0.0
63+
%sel = select i1 %cmp, i32 %b, i32 %c
64+
store i32 %sel, ptr addrspace(1) %out
65+
ret void
66+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX8 %s
4+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s
5+
6+
---
7+
name: test_copy_scc_vcc
8+
legalized: true
9+
regBankSelected: true
10+
tracksRegLiveness: true
11+
body: |
12+
bb.0:
13+
; GFX7-LABEL: name: test_copy_scc_vcc
14+
; GFX7: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
15+
; GFX7-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[DEF]], [[DEF]], implicit-def $scc
16+
; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc
17+
; GFX7-NEXT: $sgpr0 = COPY [[COPY]]
18+
; GFX7-NEXT: S_ENDPGM 0, implicit $sgpr0
19+
;
20+
; GFX8-LABEL: name: test_copy_scc_vcc
21+
; GFX8: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
22+
; GFX8-NEXT: S_CMP_LG_U64 [[DEF]], 0, implicit-def $scc
23+
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc
24+
; GFX8-NEXT: $sgpr0 = COPY [[COPY]]
25+
; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr0
26+
;
27+
; GFX11-LABEL: name: test_copy_scc_vcc
28+
; GFX11: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
29+
; GFX11-NEXT: S_CMP_LG_U32 [[DEF]], 0, implicit-def $scc
30+
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $scc
31+
; GFX11-NEXT: $sgpr0 = COPY [[COPY]]
32+
; GFX11-NEXT: S_ENDPGM 0, implicit $sgpr0
33+
%0:vcc(s1) = G_IMPLICIT_DEF
34+
%1:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC %0
35+
$sgpr0 = COPY %1
36+
S_ENDPGM 0, implicit $sgpr0
37+
...

0 commit comments

Comments
 (0)