Skip to content

Commit 9849ed9

Browse files
tpopplialan
authored and committed
AMDGPU GlobalISel G_ADD and G_PTR_ADD 64 support
This considers hasLshlAddB64 support and adds patterns for ptradd.
1 parent 040860a commit 9849ed9

File tree

3 files changed

+105
-18
lines changed

3 files changed

+105
-18
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -736,13 +736,29 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
736736
.widenScalarToNextMultipleOf(0, 32)
737737
.maxScalar(0, S32);
738738
} else {
739-
getActionDefinitionsBuilder({G_ADD, G_SUB})
739+
getActionDefinitionsBuilder(G_SUB)
740740
.legalFor({S32, S16, V2S16})
741741
.clampMaxNumElementsStrict(0, S16, 2)
742742
.scalarize(0)
743743
.minScalar(0, S16)
744744
.widenScalarToNextMultipleOf(0, 32)
745745
.maxScalar(0, S32);
746+
if (ST.hasLshlAddB64())
747+
getActionDefinitionsBuilder(G_ADD)
748+
.legalFor({S64, S32, S16, V2S16})
749+
.clampMaxNumElementsStrict(0, S16, 2)
750+
.scalarize(0)
751+
.minScalar(0, S16)
752+
.widenScalarToNextMultipleOf(0, 32)
753+
.maxScalar(0, S32);
754+
else
755+
getActionDefinitionsBuilder(G_ADD)
756+
.legalFor({S32, S16, V2S16})
757+
.clampMaxNumElementsStrict(0, S16, 2)
758+
.scalarize(0)
759+
.minScalar(0, S16)
760+
.widenScalarToNextMultipleOf(0, 32)
761+
.maxScalar(0, S32);
746762
}
747763

748764
if (ST.hasScalarSMulU64()) {

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -762,6 +762,24 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
762762
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
763763
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
764764

765+
let SubtargetPredicate = isGFX940Plus in {
766+
// TODO: Canonicalize these in the target specific CombinerHelper?
767+
def : GCNPat<
768+
(ptradd (shl i64:$src0, i32:$shift), i64:$src1),
769+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
770+
>;
771+
772+
def : GCNPat<
773+
(ptradd i64:$src0, (shl i64:$src1, i32:$shift)),
774+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
775+
>;
776+
777+
def : GCNPat<
778+
(ptradd i64:$src0, i64:$src1),
779+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
780+
>;
781+
}
782+
765783
def : GCNPat<
766784
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
767785
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;

llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll

Lines changed: 70 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -2,41 +2,57 @@
22

33
define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
44
; GCN-LABEL: lshl_add_u64_v1v:
5-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}]
5+
; GCN: ; %bb.0:
6+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
8+
; GCN-NEXT: s_setpc_b64 s[30:31]
69
%shl = shl i64 %v, 1
710
%add = add i64 %shl, %a
811
ret i64 %add
912
}
1013

1114
define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
1215
; GCN-LABEL: lshl_add_u64_v4v:
13-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}]
16+
; GCN: ; %bb.0:
17+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
19+
; GCN-NEXT: s_setpc_b64 s[30:31]
1420
%shl = shl i64 %v, 4
1521
%add = add i64 %shl, %a
1622
ret i64 %add
1723
}
1824

1925
define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
2026
; GCN-LABEL: lshl_add_u64_v5v:
21-
; GCN: v_lshlrev_b64
22-
; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
27+
; GCN: ; %bb.0:
28+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
30+
; GCN-NEXT: s_setpc_b64 s[30:31]
2331
%shl = shl i64 %v, 5
2432
%add = add i64 %shl, %a
2533
ret i64 %add
2634
}
2735

2836
define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
2937
; GCN-LABEL: lshl_add_u64_vvv:
30-
; GCN: v_lshlrev_b64
31-
; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
38+
; GCN: ; %bb.0:
39+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
41+
; GCN-NEXT: s_setpc_b64 s[30:31]
3242
%shl = shl i64 %v, %s
3343
%add = add i64 %shl, %a
3444
ret i64 %add
3545
}
3646

3747
define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
3848
; GCN-LABEL: lshl_add_u64_s2v:
39-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
49+
; GCN: ; %bb.0:
50+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
51+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
52+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
53+
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
54+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
55+
; GCN-NEXT: s_endpgm
4056
%a = load i64, ptr undef
4157
%shl = shl i64 %v, 2
4258
%add = add i64 %shl, %a
@@ -46,7 +62,13 @@ define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
4662

4763
define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
4864
; GCN-LABEL: lshl_add_u64_v2s:
49-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
65+
; GCN: ; %bb.0:
66+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
67+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
68+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
69+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[0:1]
70+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
71+
; GCN-NEXT: s_endpgm
5072
%v = load i64, ptr undef
5173
%shl = shl i64 %v, 2
5274
%add = add i64 %shl, %a
@@ -56,9 +78,14 @@ define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
5678

5779
define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
5880
; GCN-LABEL: lshl_add_u64_s2s:
59-
; GCN: s_lshl_b64
60-
; GCN: s_add_u32
61-
; GCN: s_addc_u32
81+
; GCN: ; %bb.0:
82+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
83+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
84+
; GCN-NEXT: v_mov_b32_e32 v0, s2
85+
; GCN-NEXT: v_mov_b32_e32 v1, s3
86+
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
87+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
88+
; GCN-NEXT: s_endpgm
6289
%shl = shl i64 %v, 2
6390
%add = add i64 %shl, %a
6491
store i64 %add, ptr undef
@@ -67,14 +94,23 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
6794

6895
define i64 @add_u64_vv(i64 %v, i64 %a) {
6996
; GCN-LABEL: add_u64_vv:
70-
; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
97+
; GCN: ; %bb.0:
98+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
100+
; GCN-NEXT: s_setpc_b64 s[30:31]
71101
%add = add i64 %v, %a
72102
ret i64 %add
73103
}
74104

75105
define amdgpu_kernel void @add_u64_sv(i64 %v) {
76106
; GCN-LABEL: add_u64_sv:
77-
; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
107+
; GCN: ; %bb.0:
108+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
109+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
110+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
111+
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
112+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
113+
; GCN-NEXT: s_endpgm
78114
%a = load i64, ptr undef
79115
%add = add i64 %v, %a
80116
store i64 %add, ptr undef
@@ -83,7 +119,13 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
83119

84120
define amdgpu_kernel void @add_u64_vs(i64 %a) {
85121
; GCN-LABEL: add_u64_vs:
86-
; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
122+
; GCN: ; %bb.0:
123+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
124+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
125+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
126+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
127+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
128+
; GCN-NEXT: s_endpgm
87129
%v = load i64, ptr undef
88130
%add = add i64 %v, %a
89131
store i64 %add, ptr undef
@@ -92,16 +134,27 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
92134

93135
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
94136
; GCN-LABEL: add_u64_ss:
95-
; GCN: s_add_u32
96-
; GCN: s_addc_u32 s1, s1, s3
137+
; GCN: ; %bb.0:
138+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
139+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
140+
; GCN-NEXT: s_add_u32 s0, s0, s2
141+
; GCN-NEXT: s_addc_u32 s1, s1, s3
142+
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
143+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
144+
; GCN-NEXT: s_endpgm
97145
%add = add i64 %v, %a
98146
store i64 %add, ptr undef
99147
ret void
100148
}
101149

102150
define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
103151
; GCN-LABEL: lshl_add_u64_gep:
104-
; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
152+
; GCN: ; %bb.0:
153+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
155+
; GCN-NEXT: flat_load_dword v0, v[0:1]
156+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
157+
; GCN-NEXT: s_setpc_b64 s[30:31]
105158
%gep = getelementptr inbounds i32, ptr %p, i64 %a
106159
%v = load i32, ptr %gep
107160
ret i32 %v

0 commit comments

Comments (0)