Skip to content

Commit 4674053

Browse files
committed
AMDGPU GlobalISel G_ADD and G_PTR_ADD 64 support
This takes hasLshlAddB64 subtarget support into account and adds selection patterns for ptradd (G_PTR_ADD).
1 parent d553e5d commit 4674053

File tree

3 files changed

+106
-18
lines changed

3 files changed

+106
-18
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 17 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -730,13 +730,29 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
730730
.widenScalarToNextMultipleOf(0, 32)
731731
.maxScalar(0, S32);
732732
} else {
733-
getActionDefinitionsBuilder({G_ADD, G_SUB})
733+
getActionDefinitionsBuilder(G_SUB)
734734
.legalFor({S32, S16, V2S16})
735735
.clampMaxNumElementsStrict(0, S16, 2)
736736
.scalarize(0)
737737
.minScalar(0, S16)
738738
.widenScalarToNextMultipleOf(0, 32)
739739
.maxScalar(0, S32);
740+
if (ST.hasLshlAddB64())
741+
getActionDefinitionsBuilder(G_ADD)
742+
.legalFor({S64, S32, S16, V2S16})
743+
.clampMaxNumElementsStrict(0, S16, 2)
744+
.scalarize(0)
745+
.minScalar(0, S16)
746+
.widenScalarToNextMultipleOf(0, 32)
747+
.maxScalar(0, S32);
748+
else
749+
getActionDefinitionsBuilder(G_ADD)
750+
.legalFor({S32, S16, V2S16})
751+
.clampMaxNumElementsStrict(0, S16, 2)
752+
.scalarize(0)
753+
.minScalar(0, S16)
754+
.widenScalarToNextMultipleOf(0, 32)
755+
.maxScalar(0, S32);
740756
}
741757

742758
if (ST.hasScalarSMulU64()) {

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 18 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -737,6 +737,24 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
737737
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
738738
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
739739

740+
let SubtargetPredicate = isGFX940Plus in {
741+
// TODO: Canonicalize these in the target specific CombinerHelper?
742+
def : GCNPat<
743+
(ptradd (shl i64:$src0, i32:$shift), i64:$src1),
744+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
745+
>;
746+
747+
def : GCNPat<
748+
(ptradd i64:$src0, (shl i64:$src1, i32:$shift)),
749+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
750+
>;
751+
752+
def : GCNPat<
753+
(ptradd i64:$src0, i64:$src1),
754+
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
755+
>;
756+
}
757+
740758
def : GCNPat<
741759
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
742760
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;

llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll

Lines changed: 71 additions & 17 deletions
Original file line number · Diff line number · Diff line change
@@ -1,42 +1,59 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
23

34
define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
45
; GCN-LABEL: lshl_add_u64_v1v:
5-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}]
6+
; GCN: ; %bb.0:
7+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
9+
; GCN-NEXT: s_setpc_b64 s[30:31]
610
%shl = shl i64 %v, 1
711
%add = add i64 %shl, %a
812
ret i64 %add
913
}
1014

1115
define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
1216
; GCN-LABEL: lshl_add_u64_v4v:
13-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}]
17+
; GCN: ; %bb.0:
18+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
20+
; GCN-NEXT: s_setpc_b64 s[30:31]
1421
%shl = shl i64 %v, 4
1522
%add = add i64 %shl, %a
1623
ret i64 %add
1724
}
1825

1926
define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
2027
; GCN-LABEL: lshl_add_u64_v5v:
21-
; GCN: v_lshlrev_b64
22-
; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
28+
; GCN: ; %bb.0:
29+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
31+
; GCN-NEXT: s_setpc_b64 s[30:31]
2332
%shl = shl i64 %v, 5
2433
%add = add i64 %shl, %a
2534
ret i64 %add
2635
}
2736

2837
define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
2938
; GCN-LABEL: lshl_add_u64_vvv:
30-
; GCN: v_lshlrev_b64
31-
; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
39+
; GCN: ; %bb.0:
40+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
42+
; GCN-NEXT: s_setpc_b64 s[30:31]
3243
%shl = shl i64 %v, %s
3344
%add = add i64 %shl, %a
3445
ret i64 %add
3546
}
3647

3748
define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
3849
; GCN-LABEL: lshl_add_u64_s2v:
39-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
50+
; GCN: ; %bb.0:
51+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
52+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
53+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
54+
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
55+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
56+
; GCN-NEXT: s_endpgm
4057
%a = load i64, ptr undef
4158
%shl = shl i64 %v, 2
4259
%add = add i64 %shl, %a
@@ -46,7 +63,13 @@ define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
4663

4764
define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
4865
; GCN-LABEL: lshl_add_u64_v2s:
49-
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
66+
; GCN: ; %bb.0:
67+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
68+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
69+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
70+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[0:1]
71+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
72+
; GCN-NEXT: s_endpgm
5073
%v = load i64, ptr undef
5174
%shl = shl i64 %v, 2
5275
%add = add i64 %shl, %a
@@ -56,9 +79,14 @@ define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
5679

5780
define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
5881
; GCN-LABEL: lshl_add_u64_s2s:
59-
; GCN: s_lshl_b64
60-
; GCN: s_add_u32
61-
; GCN: s_addc_u32
82+
; GCN: ; %bb.0:
83+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
84+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
85+
; GCN-NEXT: v_mov_b32_e32 v0, s2
86+
; GCN-NEXT: v_mov_b32_e32 v1, s3
87+
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
88+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
89+
; GCN-NEXT: s_endpgm
6290
%shl = shl i64 %v, 2
6391
%add = add i64 %shl, %a
6492
store i64 %add, ptr undef
@@ -67,14 +95,23 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
6795

6896
define i64 @add_u64_vv(i64 %v, i64 %a) {
6997
; GCN-LABEL: add_u64_vv:
70-
; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
98+
; GCN: ; %bb.0:
99+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
101+
; GCN-NEXT: s_setpc_b64 s[30:31]
71102
%add = add i64 %v, %a
72103
ret i64 %add
73104
}
74105

75106
define amdgpu_kernel void @add_u64_sv(i64 %v) {
76107
; GCN-LABEL: add_u64_sv:
77-
; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
108+
; GCN: ; %bb.0:
109+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
110+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
111+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
112+
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
113+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
114+
; GCN-NEXT: s_endpgm
78115
%a = load i64, ptr undef
79116
%add = add i64 %v, %a
80117
store i64 %add, ptr undef
@@ -83,7 +120,13 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
83120

84121
define amdgpu_kernel void @add_u64_vs(i64 %a) {
85122
; GCN-LABEL: add_u64_vs:
86-
; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
123+
; GCN: ; %bb.0:
124+
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
125+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
126+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
127+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
128+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
129+
; GCN-NEXT: s_endpgm
87130
%v = load i64, ptr undef
88131
%add = add i64 %v, %a
89132
store i64 %add, ptr undef
@@ -92,16 +135,27 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
92135

93136
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
94137
; GCN-LABEL: add_u64_ss:
95-
; GCN: s_add_u32
96-
; GCN: s_addc_u32 s1, s1, s3
138+
; GCN: ; %bb.0:
139+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
140+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
141+
; GCN-NEXT: s_add_u32 s0, s0, s2
142+
; GCN-NEXT: s_addc_u32 s1, s1, s3
143+
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
144+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
145+
; GCN-NEXT: s_endpgm
97146
%add = add i64 %v, %a
98147
store i64 %add, ptr undef
99148
ret void
100149
}
101150

102151
define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
103152
; GCN-LABEL: lshl_add_u64_gep:
104-
; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
153+
; GCN: ; %bb.0:
154+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155+
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
156+
; GCN-NEXT: flat_load_dword v0, v[0:1]
157+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
158+
; GCN-NEXT: s_setpc_b64 s[30:31]
105159
%gep = getelementptr inbounds i32, ptr %p, i64 %a
106160
%v = load i32, ptr %gep
107161
ret i32 %v

0 commit comments

Comments (0)