Skip to content

Commit 40270e8

Browse files
AMDGPU/GlobalISel: Add regbanklegalize rules for load and store (#153176)
Cover all the missing cases and add very detailed tests for each rule. In summary:
- Flat and Scratch, addrspace(0) and addrspace(5): loads are always divergent.
- Global and Constant, addrspace(1) and addrspace(4): these have real uniform loads (s_load), but require additional checks on the alignment and flags in the MMO. If the load is not naturally aligned or the MMO is not uniform, do uniform-in-vgpr lowering.
- Local (LDS), addrspace(3): only has instructions for divergent loads; for uniform loads do uniform-in-vgpr lowering.
- Store rules are simplified using Ptr32 and Ptr64. All operands need to be vgpr.
Some tests have code size regressions since they use more sgpr instructions; these are marked with a FixMe comment to get back to later.
1 parent 30f9fb7 commit 40270e8

File tree

62 files changed

+6164
-786
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+6164
-786
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,32 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
352352
MI.eraseFromParent();
353353
}
354354

355+
// Replaces the load MI with a load that uses a memory operand widened to S32,
// re-creating the zero/sign extension of the original opcode on the loaded
// value where needed, and then erases MI.
//
// MI is a G_LOAD, G_ZEXTLOAD or G_SEXTLOAD (the last case is asserted below).
// NOTE(review): presumably only used when the original memory size is smaller
// than 32 bits and Dst is a 32-bit sgpr value -- confirm against the rules
// that select the WidenMMOToS32 lowering.
void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
356+
Register Dst = MI.getDstReg();
357+
Register Ptr = MI.getPointerReg();
358+
MachineMemOperand &MMO = MI.getMMO();
359+
// 8 * the MMO size; used below as a bit count for the mask and for
// sign-extension-in-register.
unsigned MemSize = 8 * MMO.getSize().getValue();
360+
361+
// New memory operand derived from the original one, at offset 0, with memory
// type S32.
MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
362+
363+
if (MI.getOpcode() == G_LOAD) {
364+
// Plain load: the wide load defines Dst directly; no masking or extension
// of the extra loaded bits is emitted on this path.
B.buildLoad(Dst, Ptr, *WideMMO);
365+
} else {
366+
// Extending load: load 32 bits into a fresh SgprRB_S32 value first, then
// rebuild the extension from the low MemSize bits.
auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
367+
368+
if (MI.getOpcode() == G_ZEXTLOAD) {
369+
// Zero extension: AND with a constant that has only the low MemSize bits
// set.
APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
370+
auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
371+
B.buildAnd(Dst, Load, MaskCst);
372+
} else {
373+
// Sign extension: sign-extend in register from MemSize bits.
assert(MI.getOpcode() == G_SEXTLOAD);
374+
B.buildSExtInReg(Dst, Load, MemSize);
375+
}
376+
}
377+
378+
// The original load has been fully replaced by the instructions built above.
MI.eraseFromParent();
379+
}
380+
355381
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
356382
Register Dst = MI.getOperand(0).getReg();
357383
LLT Ty = MRI.getType(Dst);
@@ -744,6 +770,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
744770
}
745771
break;
746772
}
773+
case WidenMMOToS32:
774+
return widenMMOToS32(cast<GAnyLoad>(MI));
747775
}
748776

749777
if (!WaterfallSgprs.empty()) {
@@ -759,6 +787,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
759787
return LLT::scalar(1);
760788
case Sgpr16:
761789
case Vgpr16:
790+
case UniInVgprS16:
762791
return LLT::scalar(16);
763792
case Sgpr32:
764793
case Sgpr32_WF:
@@ -895,6 +924,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
895924
case SgprB256:
896925
case SgprB512:
897926
case UniInVcc:
927+
case UniInVgprS16:
898928
case UniInVgprS32:
899929
case UniInVgprV2S16:
900930
case UniInVgprV4S32:
@@ -1015,6 +1045,18 @@ void RegBankLegalizeHelper::applyMappingDst(
10151045
B.buildTrunc(Reg, CopyS32_Vcc);
10161046
break;
10171047
}
1048+
case UniInVgprS16: {
1049+
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1050+
assert(RB == SgprRB);
1051+
Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1052+
Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1053+
Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1054+
Op.setReg(NewVgprDstS16);
1055+
B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1056+
buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1057+
B.buildTrunc(Reg, NewSgprDstS32);
1058+
break;
1059+
}
10181060
case UniInVgprS32:
10191061
case UniInVgprV2S16:
10201062
case UniInVgprV4S32: {

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "AMDGPURegBankLegalizeRules.h"
1313
#include "llvm/ADT/SmallSet.h"
14+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
1415
#include "llvm/CodeGen/MachineRegisterInfo.h"
1516

1617
namespace llvm {
@@ -107,6 +108,7 @@ class RegBankLegalizeHelper {
107108
void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
108109
LLT MergeTy = LLT());
109110
void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
111+
void widenMMOToS32(GAnyLoad &MI) const;
110112

111113
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
112114
SmallSet<Register, 4> &SgprWaterfallOperandRegs);

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 176 additions & 46 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ enum RegBankLLTMappingApplyID {
176176

177177
// Dst only modifiers: read-any-lane and truncs
178178
UniInVcc,
179+
UniInVgprS16,
179180
UniInVgprS32,
180181
UniInVgprV2S16,
181182
UniInVgprV4S32,
@@ -221,6 +222,7 @@ enum LoweringMethodID {
221222
UniCstExt,
222223
SplitLoad,
223224
WidenLoad,
225+
WidenMMOToS32
224226
};
225227

226228
enum FastRulesTypes {

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) {
77
; GCN-LABEL: atomic_load_flat_monotonic_i8:

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
5-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
5+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
66

77
define i8 @atomic_load_global_monotonic_i8(ptr addrspace(1) %ptr) {
88
; GFX6-LABEL: atomic_load_global_monotonic_i8:

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
2-
; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
1+
; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
2+
; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
33

44
; GCN-LABEL: {{^}}atomic_load_monotonic_i8:
55
; GCN: s_waitcnt

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
55

66
; TODO: Merge with atomic_load_local.ll
77

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_store_local.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
1+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,CI %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
33

44
; GCN-LABEL: {{^}}atomic_store_monotonic_i8:
55
; GCN: s_waitcnt

llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
44

55
; End to end tests for scalar vs. vector boolean legalization strategies.
66

77
define amdgpu_ps float @select_vgpr_sgpr_trunc_cond(i32 inreg %a, i32 %b, i32 %c) {
88
; WAVE64-LABEL: select_vgpr_sgpr_trunc_cond:
99
; WAVE64: ; %bb.0:
10-
; WAVE64-NEXT: s_and_b32 s0, 1, s0
11-
; WAVE64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
10+
; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
11+
; WAVE64-NEXT: s_cselect_b64 vcc, exec, 0
1212
; WAVE64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1313
; WAVE64-NEXT: ; return to shader part epilog
1414
;
1515
; WAVE32-LABEL: select_vgpr_sgpr_trunc_cond:
1616
; WAVE32: ; %bb.0:
17-
; WAVE32-NEXT: s_and_b32 s0, 1, s0
18-
; WAVE32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
17+
; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
18+
; WAVE32-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0
1919
; WAVE32-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
2020
; WAVE32-NEXT: ; return to shader part epilog
2121
%cc = trunc i32 %a to i1
@@ -28,16 +28,16 @@ define amdgpu_ps float @select_vgpr_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inre
2828
; WAVE64-LABEL: select_vgpr_sgpr_trunc_and_cond:
2929
; WAVE64: ; %bb.0:
3030
; WAVE64-NEXT: s_and_b32 s0, s0, s1
31-
; WAVE64-NEXT: s_and_b32 s0, 1, s0
32-
; WAVE64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
31+
; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
32+
; WAVE64-NEXT: s_cselect_b64 vcc, exec, 0
3333
; WAVE64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
3434
; WAVE64-NEXT: ; return to shader part epilog
3535
;
3636
; WAVE32-LABEL: select_vgpr_sgpr_trunc_and_cond:
3737
; WAVE32: ; %bb.0:
3838
; WAVE32-NEXT: s_and_b32 s0, s0, s1
39-
; WAVE32-NEXT: s_and_b32 s0, 1, s0
40-
; WAVE32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
39+
; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
40+
; WAVE32-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0
4141
; WAVE32-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
4242
; WAVE32-NEXT: ; return to shader part epilog
4343
%cc.0 = trunc i32 %a.0 to i1

0 commit comments

Comments
 (0)