Skip to content

Commit d085348

Browse files
committed
[AMDGPU] Add option to preinflate to AVGPR
Change-Id: Ia488b12f06bdc3e462f1cd90baf64a3375f15c4c
1 parent 5708851 commit d085348

File tree

4 files changed

+219
-0
lines changed

4 files changed

+219
-0
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253253
if (!LIS->hasInterval(Reg))
254254
continue;
255255
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
256+
256257
if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
257258
(ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
258259
continue;

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323
#define DEBUG_TYPE "si-fold-operands"
2424
using namespace llvm;
2525

26+
// Hidden command-line flag "amdgpu-avgpr-inflation"; defaults to false.
// SIFoldOperandsImpl::run consults it together with ST->hasGFX90AInsts():
// when both hold, MRI->recomputeRegClass() is invoked on every virtual
// register whose class is an AGPR or VGPR class, and a successful
// recomputation is reported through the pass's Changed result.
static cl::opt<bool>
27+
InflateToAVGPR("amdgpu-avgpr-inflation", cl::Hidden, cl::init(false),
28+
cl::desc("Enable register inflation to avgpr register class "
29+
"(which can be assigned to either AGPR or VGPR)."));
30+
2631
namespace {
2732

2833
struct FoldCandidate {
@@ -2627,6 +2632,9 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
26272632
bool Changed = false;
26282633
for (MachineBasicBlock *MBB : depth_first(&MF)) {
26292634
MachineOperand *CurrentKnownM0Val = nullptr;
2635+
2636+
2637+
26302638
for (auto &MI : make_early_inc_range(*MBB)) {
26312639
Changed |= tryFoldCndMask(MI);
26322640

@@ -2666,6 +2674,22 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
26662674
Changed |= tryFoldClamp(MI);
26672675
}
26682676

2677+
2678+
if (MFI->getMinWavesPerEU() > 1)
2679+
return Changed;
2680+
2681+
2682+
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
2683+
Register Reg = Register::index2VirtReg(I);
2684+
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
2685+
2686+
if (InflateToAVGPR && ST->hasGFX90AInsts() &&
2687+
(TRI->isAGPRClass(RC) || TRI->isVGPRClass(RC))) {
2688+
bool Inflated = MRI->recomputeRegClass(Reg);
2689+
Changed |= Inflated;
2690+
}
2691+
}
2692+
26692693
Changed |= tryOptimizeAGPRPhis(*MBB);
26702694
}
26712695

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs %s 2>&1 | FileCheck %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-avgpr-inflation=1 -verify-machineinstrs %s 2>&1 | FileCheck -check-prefix=INFLATE %s
4+
5+
6+
; Kernel under test. It loads eight <8 x half> values through the
; addrspace(3) pointers %in0..%in7, issues llvm.amdgcn.sched.barrier(0) after
; the loads, then chains eight llvm.amdgcn.mfma.f32.32x32x16.f16 calls: each
; load is passed as both multiplicand operands and the previous MFMA result
; is the accumulator (zeroinitializer for the first). The final
; <16 x float> is stored to %out.
; In the assertions below, the default run contains v_accvgpr_write_b32 /
; v_accvgpr_read_b32 "Reload Reuse" copies, while the run with
; --amdgpu-avgpr-inflation=1 has none and two of the ds_read_b128 results
; are written directly to a[0:3] and a[16:19].
define amdgpu_kernel void @attn_fwd(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %in2, ptr addrspace(3) %in3, ptr addrspace(3) %in4, ptr addrspace(3) %in5, ptr addrspace(3) %in6, ptr addrspace(3) %in7, ptr addrspace(0) %out) #0 {
7+
; CHECK-LABEL: attn_fwd:
8+
; CHECK: ; %bb.0:
9+
; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
10+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
11+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
12+
; CHECK-NEXT: v_mov_b32_e32 v0, s8
13+
; CHECK-NEXT: v_mov_b32_e32 v4, s9
14+
; CHECK-NEXT: v_mov_b32_e32 v5, s10
15+
; CHECK-NEXT: ds_read_b128 v[0:3], v0
16+
; CHECK-NEXT: ds_read_b128 v[8:11], v4
17+
; CHECK-NEXT: ds_read_b128 v[4:7], v5
18+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
19+
; CHECK-NEXT: v_accvgpr_write_b32 a16, v7 ; Reload Reuse
20+
; CHECK-NEXT: v_accvgpr_write_b32 a17, v6 ; Reload Reuse
21+
; CHECK-NEXT: v_accvgpr_write_b32 a18, v5 ; Reload Reuse
22+
; CHECK-NEXT: v_accvgpr_write_b32 a19, v4 ; Reload Reuse
23+
; CHECK-NEXT: v_mov_b32_e32 v4, s11
24+
; CHECK-NEXT: ds_read_b128 v[12:15], v4
25+
; CHECK-NEXT: v_mov_b32_e32 v4, s12
26+
; CHECK-NEXT: ds_read_b128 v[16:19], v4
27+
; CHECK-NEXT: v_mov_b32_e32 v4, s13
28+
; CHECK-NEXT: v_mov_b32_e32 v5, s14
29+
; CHECK-NEXT: v_mov_b32_e32 v6, s15
30+
; CHECK-NEXT: ds_read_b128 v[20:23], v4
31+
; CHECK-NEXT: ds_read_b128 v[24:27], v5
32+
; CHECK-NEXT: ds_read_b128 v[4:7], v6
33+
; CHECK-NEXT: ; sched_barrier mask(0x00000000)
34+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[0:3], 0
35+
; CHECK-NEXT: v_accvgpr_read_b32 v3, a16 ; Reload Reuse
36+
; CHECK-NEXT: v_accvgpr_read_b32 v2, a17 ; Reload Reuse
37+
; CHECK-NEXT: v_accvgpr_read_b32 v1, a18 ; Reload Reuse
38+
; CHECK-NEXT: v_accvgpr_read_b32 v0, a19 ; Reload Reuse
39+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[8:11], v[8:11], a[0:15]
40+
; CHECK-NEXT: s_nop 0
41+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[0:3], a[0:15]
42+
; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
43+
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
44+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[12:15], v[12:15], a[0:15]
45+
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
46+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[16:19], a[0:15]
47+
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
48+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[20:23], v[20:23], a[0:15]
49+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
50+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[24:27], v[24:27], a[0:15]
51+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
52+
; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[4:7], v[4:7], a[0:15]
53+
; CHECK-NEXT: s_nop 7
54+
; CHECK-NEXT: s_nop 3
55+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:48
56+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:32
57+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:16
58+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[0:3]
59+
; CHECK-NEXT: s_endpgm
60+
;
61+
; INFLATE-LABEL: attn_fwd:
62+
; INFLATE: ; %bb.0:
63+
; INFLATE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
64+
; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
65+
; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
66+
; INFLATE-NEXT: v_mov_b32_e32 v0, s8
67+
; INFLATE-NEXT: v_mov_b32_e32 v4, s9
68+
; INFLATE-NEXT: v_mov_b32_e32 v8, s10
69+
; INFLATE-NEXT: v_mov_b32_e32 v12, s11
70+
; INFLATE-NEXT: v_mov_b32_e32 v16, s12
71+
; INFLATE-NEXT: v_mov_b32_e32 v20, s13
72+
; INFLATE-NEXT: v_mov_b32_e32 v24, s14
73+
; INFLATE-NEXT: ds_read_b128 a[0:3], v0
74+
; INFLATE-NEXT: ds_read_b128 v[4:7], v4
75+
; INFLATE-NEXT: ds_read_b128 v[8:11], v8
76+
; INFLATE-NEXT: ds_read_b128 v[12:15], v12
77+
; INFLATE-NEXT: ds_read_b128 v[16:19], v16
78+
; INFLATE-NEXT: v_mov_b32_e32 v0, s15
79+
; INFLATE-NEXT: ds_read_b128 v[20:23], v20
80+
; INFLATE-NEXT: ds_read_b128 v[24:27], v24
81+
; INFLATE-NEXT: ds_read_b128 a[16:19], v0
82+
; INFLATE-NEXT: ; sched_barrier mask(0x00000000)
83+
; INFLATE-NEXT: s_waitcnt lgkmcnt(7)
84+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 0
85+
; INFLATE-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
86+
; INFLATE-NEXT: s_waitcnt lgkmcnt(6)
87+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[4:7], v[4:7], a[0:15]
88+
; INFLATE-NEXT: s_waitcnt lgkmcnt(5)
89+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[8:11], v[8:11], a[0:15]
90+
; INFLATE-NEXT: s_waitcnt lgkmcnt(4)
91+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[12:15], v[12:15], a[0:15]
92+
; INFLATE-NEXT: s_waitcnt lgkmcnt(3)
93+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[16:19], a[0:15]
94+
; INFLATE-NEXT: s_waitcnt lgkmcnt(2)
95+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[20:23], v[20:23], a[0:15]
96+
; INFLATE-NEXT: s_waitcnt lgkmcnt(1)
97+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[24:27], v[24:27], a[0:15]
98+
; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
99+
; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], a[16:19], a[16:19], a[0:15]
100+
; INFLATE-NEXT: s_nop 7
101+
; INFLATE-NEXT: s_nop 3
102+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:48
103+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:32
104+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:16
105+
; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[0:3]
106+
; INFLATE-NEXT: s_endpgm
107+
%load0 = load <8 x half>, ptr addrspace(3) %in0, align 16
108+
%load1 = load <8 x half>, ptr addrspace(3) %in1, align 16
109+
%load2 = load <8 x half>, ptr addrspace(3) %in2, align 16
110+
%load3 = load <8 x half>, ptr addrspace(3) %in3, align 16
111+
%load4 = load <8 x half>, ptr addrspace(3) %in4, align 16
112+
%load5 = load <8 x half>, ptr addrspace(3) %in5, align 16
113+
%load6 = load <8 x half>, ptr addrspace(3) %in6, align 16
114+
%load7 = load <8 x half>, ptr addrspace(3) %in7, align 16
115+
tail call void @llvm.amdgcn.sched.barrier(i32 0)
116+
%mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
117+
%mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0)
118+
%mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0)
119+
%mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0)
120+
%mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0)
121+
%mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0)
122+
%mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0)
123+
%mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0)
124+
store <16 x float> %mfma7, ptr addrspace(0) %out
125+
ret void
126+
}
127+
128+
attributes #0 = { "amdgpu-num-vgpr"="24" "amdgpu-agpr-alloc"="20,256"}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs --run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck %s
3+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs --amdgpu-avgpr-inflation=1 --run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck %s -check-prefix=INFLATE
4+
5+
# %1 is declared areg_128_align2 and is both defined by and tied into an
# INLINEASM operand carrying an AReg_128_Align2 regdef constraint. Both
# assertion prefixes below expect the register to stay areg_128_align2,
# i.e. no change with --amdgpu-avgpr-inflation=1.
---
6+
name: agpr_constraint
7+
tracksRegLiveness: true
8+
body: |
9+
bb.0:
10+
; CHECK-LABEL: name: agpr_constraint
11+
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
12+
; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
13+
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
14+
; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
15+
;
16+
; INFLATE-LABEL: name: agpr_constraint
17+
; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
18+
; INFLATE-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
19+
; INFLATE-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
20+
; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
21+
%0:vgpr_32 = IMPLICIT_DEF
22+
%1:areg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
23+
INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %1, 2147483657 /* reguse tiedto:$0 */, %1(tied-def 3)
24+
S_ENDPGM 0, amdgpu_allvgprs
25+
...
26+
27+
# %1 is declared vreg_128_align2 and is both defined by and tied into an
# INLINEASM operand carrying a VReg_128_Align2 regdef constraint. Both
# assertion prefixes below expect the register to stay vreg_128_align2,
# i.e. no change with --amdgpu-avgpr-inflation=1.
---
28+
name: vgpr_constraint
29+
tracksRegLiveness: true
30+
body: |
31+
bb.0:
32+
; CHECK-LABEL: name: vgpr_constraint
33+
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
34+
; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
35+
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
36+
; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
37+
;
38+
; INFLATE-LABEL: name: vgpr_constraint
39+
; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
40+
; INFLATE-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
41+
; INFLATE-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
42+
; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
43+
%0:vgpr_32 = IMPLICIT_DEF
44+
%1:vreg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
45+
INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %1, 2147483657 /* reguse tiedto:$0 */, %1(tied-def 3)
46+
S_ENDPGM 0, amdgpu_allvgprs
47+
...
48+
49+
# %1 is declared vreg_128_align2 and its only appearance is as the result of
# DS_READ_B128_gfx9 (no INLINEASM user). The default-run assertions expect
# it to stay vreg_128_align2; the --amdgpu-avgpr-inflation=1 assertions
# expect it to become av_128_align2.
---
50+
name: no_constraint
51+
tracksRegLiveness: true
52+
body: |
53+
bb.0:
54+
; CHECK-LABEL: name: no_constraint
55+
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
56+
; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
57+
; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
58+
;
59+
; INFLATE-LABEL: name: no_constraint
60+
; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
61+
; INFLATE-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
62+
; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
63+
%0:vgpr_32 = IMPLICIT_DEF
64+
%1:vreg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
65+
S_ENDPGM 0, amdgpu_allvgprs
66+
...

0 commit comments

Comments
 (0)