Skip to content

Commit 1057c63

Browse files
abhigargrepoAbhinav Garg
andauthored
[AMDGPU][GlobalISel] Add register bank legalization for G_FADD (#163407)
This patch adds register bank legalization support for the G_FADD opcode in the AMDGPU GlobalISel pipeline. It adds a new register bank mapping ID, UniInVgprS64, and new combine logic for the ReadAnyLane + Trunc + AnyExt pattern. --------- Co-authored-by: Abhinav Garg <[email protected]>
1 parent 96c6fd3 commit 1057c63

File tree

6 files changed

+225
-3
lines changed

6 files changed

+225
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
2525
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
2626
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
27+
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
2728
#include "llvm/CodeGen/GlobalISel/Utils.h"
2829
#include "llvm/CodeGen/MachineFunctionPass.h"
2930
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -34,9 +35,17 @@
3435

3536
using namespace llvm;
3637
using namespace AMDGPU;
38+
using namespace llvm::MIPatternMatch;
3739

3840
namespace {
3941

42+
// AMDGPU-specific pattern matchers
// m_GAMDGPUReadAnyLane(Src) builds a UnaryOp_match for the
// G_AMDGPU_READANYLANE opcode whose source operand is matched by the
// sub-pattern Src. It is meant to be composed with the generic MIPatternMatch
// matchers (m_Reg, m_GTrunc, m_GAnyExt, ...) and passed to mi_match.
43+
template <typename SrcTy>
44+
inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
45+
m_GAMDGPUReadAnyLane(const SrcTy &Src) {
46+
return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
47+
}
48+
4049
class AMDGPURegBankLegalize : public MachineFunctionPass {
4150
public:
4251
static char ID;
@@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
160169

161170
Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
162171
// Src = G_AMDGPU_READANYLANE RALSrc
163-
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
164-
if (RAL)
172+
Register RALSrc;
173+
if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
165174
return RALSrc;
166175

176+
// TruncSrc = G_AMDGPU_READANYLANE RALSrc
177+
// AextSrc = G_TRUNC TruncSrc
178+
// Src = G_ANYEXT AextSrc
179+
if (mi_match(Src, MRI,
180+
m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
181+
return RALSrc;
182+
}
183+
167184
// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
168185
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
169186
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
626626
MI.eraseFromParent();
627627
}
628628

629+
// Lower an instruction with a V2S16 result and two source operands
// (operands 1 and 2) into two S16 instructions of the same opcode. Each source
// is split into a low and a high half with unpackAExt, every half is truncated
// into a SgprRB_S16 register, the original opcode is rebuilt once for the low
// halves and once for the high halves (carrying over MI's flags), and the two
// S16 results are merged back into Dst. MI is erased afterwards.
// NOTE(review): the *32 names suggest unpackAExt returns 32-bit any-extended
// halves; its definition is not visible here - confirm.
void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
630+
Register Dst = MI.getOperand(0).getReg();
631+
// Only V2S16 results are supported by this lowering.
assert(MRI.getType(Dst) == V2S16);
632+
auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
633+
auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
634+
// Capture the opcode and flags up front so both halves reuse them.
unsigned Opc = MI.getOpcode();
635+
auto Flags = MI.getFlags();
636+
// Truncate each unpacked half to S16 before rebuilding the operation.
auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
637+
auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
638+
auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
639+
auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32);
640+
// One S16 instruction per element: low halves first, then high halves.
auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
641+
auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
642+
// Reassemble the V2S16 result in the original destination register.
B.buildMergeLikeInstr(Dst, {Lo, Hi});
643+
MI.eraseFromParent();
644+
}
645+
629646
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
630647
Register Dst = MI.getOperand(0).getReg();
631648
LLT DstTy = MRI.getType(Dst);
@@ -698,6 +715,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
698715
return lowerUnpackBitShift(MI);
699716
case UnpackMinMax:
700717
return lowerUnpackMinMax(MI);
718+
case ScalarizeToS16:
719+
return lowerSplitTo16(MI);
701720
case Ext32To64: {
702721
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
703722
MachineInstrBuilder Hi;
@@ -849,6 +868,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
849868
return LLT::scalar(32);
850869
case Sgpr64:
851870
case Vgpr64:
871+
case UniInVgprS64:
852872
return LLT::scalar(64);
853873
case Sgpr128:
854874
case Vgpr128:
@@ -972,6 +992,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
972992
case UniInVcc:
973993
case UniInVgprS16:
974994
case UniInVgprS32:
995+
case UniInVgprS64:
975996
case UniInVgprV2S16:
976997
case UniInVgprV4S32:
977998
case UniInVgprB32:
@@ -1104,6 +1125,7 @@ void RegBankLegalizeHelper::applyMappingDst(
11041125
break;
11051126
}
11061127
case UniInVgprS32:
1128+
case UniInVgprS64:
11071129
case UniInVgprV2S16:
11081130
case UniInVgprV4S32: {
11091131
assert(Ty == getTyFromID(MethodIDs[OpIdx]));

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class RegBankLegalizeHelper {
7272
static constexpr LLT P6 = LLT::pointer(6, 32);
7373

7474
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
75+
MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
7576
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
7677
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
7778

@@ -121,6 +122,7 @@ class RegBankLegalizeHelper {
121122
void lowerV_BFE(MachineInstr &MI);
122123
void lowerS_BFE(MachineInstr &MI);
123124
void lowerSplitTo32(MachineInstr &MI);
125+
void lowerSplitTo16(MachineInstr &MI);
124126
void lowerSplitTo32Select(MachineInstr &MI);
125127
void lowerSplitTo32SExtInReg(MachineInstr &MI);
126128
void lowerUnpackMinMax(MachineInstr &MI);

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -918,9 +918,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
918918
bool hasSALUFloat = ST->hasSALUFloatInsts();
919919

920920
addRulesForGOpcs({G_FADD}, Standard)
921+
.Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
922+
.Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
923+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
921924
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
922925
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
923-
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
926+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
927+
.Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
928+
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
929+
.Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
930+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
931+
hasSALUFloat)
932+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
933+
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
934+
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
924935

925936
addRulesForGOpcs({G_FPTOUI})
926937
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
9292
V4S32,
9393

9494
UniV2S16,
95+
UniV2S32,
9596

9697
DivV2S16,
98+
DivV2S32,
9799

98100
// B types
99101
B32,
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
178180
UniInVcc,
179181
UniInVgprS16,
180182
UniInVgprS32,
183+
UniInVgprS64,
181184
UniInVgprV2S16,
185+
UniInVgprV2S32,
182186
UniInVgprV4S32,
183187
UniInVgprB32,
184188
UniInVgprB64,
@@ -217,6 +221,7 @@ enum LoweringMethodID {
217221
V_BFE,
218222
VgprToVccCopy,
219223
SplitTo32,
224+
ScalarizeToS16,
220225
SplitTo32Select,
221226
SplitTo32SExtInReg,
222227
Ext32To64,
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
5+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
6+
7+
define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
8+
; GFX11-FAKE16-LABEL: fadd_s16_uniform:
9+
; GFX11-FAKE16: ; %bb.0:
10+
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
11+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
12+
;
13+
; GFX11-TRUE16-LABEL: fadd_s16_uniform:
14+
; GFX11-TRUE16: ; %bb.0:
15+
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
16+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
17+
;
18+
; GFX12-LABEL: fadd_s16_uniform:
19+
; GFX12: ; %bb.0:
20+
; GFX12-NEXT: s_add_f16 s0, s0, s1
21+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
22+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
23+
; GFX12-NEXT: ; return to shader part epilog
24+
%fadd = fadd half %a, %b
25+
ret half %fadd
26+
}
27+
28+
define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
29+
; GFX11-FAKE16-LABEL: fadd_s16_div:
30+
; GFX11-FAKE16: ; %bb.0:
31+
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
32+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
33+
;
34+
; GFX11-TRUE16-LABEL: fadd_s16_div:
35+
; GFX11-TRUE16: ; %bb.0:
36+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
37+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
38+
;
39+
; GFX12-FAKE16-LABEL: fadd_s16_div:
40+
; GFX12-FAKE16: ; %bb.0:
41+
; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
42+
; GFX12-FAKE16-NEXT: ; return to shader part epilog
43+
;
44+
; GFX12-TRUE16-LABEL: fadd_s16_div:
45+
; GFX12-TRUE16: ; %bb.0:
46+
; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
47+
; GFX12-TRUE16-NEXT: ; return to shader part epilog
48+
%fadd = fadd half %a, %b
49+
ret half %fadd
50+
}
51+
52+
define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
53+
; GFX11-LABEL: fadd_s32_uniform:
54+
; GFX11: ; %bb.0:
55+
; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
56+
; GFX11-NEXT: ; return to shader part epilog
57+
;
58+
; GFX12-LABEL: fadd_s32_uniform:
59+
; GFX12: ; %bb.0:
60+
; GFX12-NEXT: s_add_f32 s0, s0, s1
61+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
62+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
63+
; GFX12-NEXT: ; return to shader part epilog
64+
%fadd = fadd float %a, %b
65+
ret float %fadd
66+
}
67+
68+
define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
69+
; GCN-LABEL: fadd_s32_div:
70+
; GCN: ; %bb.0:
71+
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
72+
; GCN-NEXT: ; return to shader part epilog
73+
%fadd = fadd float %a, %b
74+
ret float %fadd
75+
}
76+
77+
define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) {
78+
; GFX11-LABEL: fadd_s64_uniform:
79+
; GFX11: ; %bb.0:
80+
; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3]
81+
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
82+
; GFX11-NEXT: s_endpgm
83+
;
84+
; GFX12-LABEL: fadd_s64_uniform:
85+
; GFX12: ; %bb.0:
86+
; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3]
87+
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
88+
; GFX12-NEXT: s_endpgm
89+
%fadd = fadd double %a, %b
90+
store double %fadd, ptr addrspace(1) %ptr
91+
ret void
92+
}
93+
94+
define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) {
95+
; GFX11-LABEL: fadd_s64_div:
96+
; GFX11: ; %bb.0:
97+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
98+
; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
99+
; GFX11-NEXT: s_endpgm
100+
;
101+
; GFX12-LABEL: fadd_s64_div:
102+
; GFX12: ; %bb.0:
103+
; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
104+
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
105+
; GFX12-NEXT: s_endpgm
106+
%fadd = fadd double %a, %b
107+
store double %fadd, ptr addrspace(1) %ptr
108+
ret void
109+
}
110+
111+
define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
112+
; GFX11-LABEL: fadd_v2s16_uniform:
113+
; GFX11: ; %bb.0:
114+
; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
115+
; GFX11-NEXT: ; return to shader part epilog
116+
;
117+
; GFX12-LABEL: fadd_v2s16_uniform:
118+
; GFX12: ; %bb.0:
119+
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
120+
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
121+
; GFX12-NEXT: s_add_f16 s0, s0, s1
122+
; GFX12-NEXT: s_add_f16 s1, s2, s3
123+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
124+
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
125+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
126+
; GFX12-NEXT: ; return to shader part epilog
127+
%fadd = fadd <2 x half> %a, %b
128+
ret <2 x half> %fadd
129+
}
130+
131+
define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
132+
; GCN-LABEL: fadd_v2s16_div:
133+
; GCN: ; %bb.0:
134+
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
135+
; GCN-NEXT: ; return to shader part epilog
136+
%fadd = fadd <2 x half> %a, %b
137+
ret <2 x half> %fadd
138+
}
139+
140+
define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
141+
; GFX11-LABEL: fadd_v2s32_uniform:
142+
; GFX11: ; %bb.0:
143+
; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
144+
; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
145+
; GFX11-NEXT: ; return to shader part epilog
146+
;
147+
; GFX12-LABEL: fadd_v2s32_uniform:
148+
; GFX12: ; %bb.0:
149+
; GFX12-NEXT: s_add_f32 s0, s0, s2
150+
; GFX12-NEXT: s_add_f32 s1, s1, s3
151+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
152+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
153+
; GFX12-NEXT: ; return to shader part epilog
154+
%fadd = fadd <2 x float> %a, %b
155+
ret <2 x float> %fadd
156+
}
157+
158+
define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
159+
; GCN-LABEL: fadd_v2s32_div:
160+
; GCN: ; %bb.0:
161+
; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
162+
; GCN-NEXT: ; return to shader part epilog
163+
%fadd = fadd <2 x float> %a, %b
164+
ret <2 x float> %fadd
165+
}

0 commit comments

Comments
 (0)