Skip to content

Commit c41aa01

Browse files
committed
AMDGPU: Select V_PK_ADD_{MIN|MAX}_{I|U}16 and V_{MIN|MAX}3_{I|U}16 on gfx1250
1 parent e56658e commit c41aa01

File tree

4 files changed

+423
-14
lines changed

4 files changed

+423
-14
lines changed

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,30 @@ defm V_PK_MIN3_U16 : VOP3PInst<"v_pk_min3_u16", PK_ADD_MINMAX_Profile>;
372372
}
373373
} // End isCommutable = 1, isReMaterializable = 1
374374

375+
// TODO: Extend pattern to select op_sel and op_sel_hi.
//
// Selection pattern that maps a ThreeOpFrag<op1, op2> of three `vt` operands
// onto the VOP3P pseudo `inst`.
//   op1, op2 - SDAG operators handed to ThreeOpFrag.
//              NOTE(review): presumably matched as op2(op1(src0, src1), src2);
//              confirm against the ThreeOpFrag definition.
//   inst     - VOP3P pseudo instruction to emit.
//   vt       - operand value type; defaults to the profile's Src0VT.
//   RC       - source register operand; defaults to getVCSrcForVT<vt>.ret.
// Every source-modifier operand is fixed to SRCMODS.OP_SEL_1 and the clamp
// operand to DSTCLAMP.NONE, so other op_sel/op_sel_hi encodings are not
// selected yet (see the TODO).
// NOTE(review): the meaning of the trailing `0` operand is not visible here;
// confirm it against the pseudo's operand list.
376+
class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
377+
VOP3P_Pseudo inst,
378+
ValueType vt = inst.Pfl.Src0VT,
379+
RegisterOperand RC = getVCSrcForVT<vt>.ret> : GCNPat <
380+
(ThreeOpFrag<op1, op2> vt:$src0, vt:$src1, vt:$src2),
381+
(inst SRCMODS.OP_SEL_1, RC:$src0, SRCMODS.OP_SEL_1, RC:$src1,
382+
SRCMODS.OP_SEL_1, RC:$src2, DSTCLAMP.NONE, 0)
383+
>;
384+
385+
// Packed 16-bit add combined with a signed/unsigned min or max, selected only
// when the subtarget satisfies HasPkAddMinMaxInsts.
let SubtargetPredicate = HasPkAddMinMaxInsts in {
386+
def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
387+
def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
388+
def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
389+
def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
390+
}
391+
392+
// Packed 16-bit three-input min/max (the same operator applied twice),
// selected only when the subtarget satisfies HasPkMinMax3Insts.
let SubtargetPredicate = HasPkMinMax3Insts in {
393+
def : ThreeOp_OpSelClampPats<smax, smax, V_PK_MAX3_I16>;
394+
def : ThreeOp_OpSelClampPats<umax, umax, V_PK_MAX3_U16>;
395+
def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>;
396+
def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>;
397+
}
398+
375399
// Defines patterns that extract signed 4bit from each Idx[0].
376400
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
377401
def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GISEL %s
4+
5+
; i32 add feeding umax with all three operands in VGPRs. The checks expect
; separate v_add_nc_u32 and v_max_u32 instructions.
define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
6+
; GCN-LABEL: add_max_u32_vvv:
7+
; GCN: ; %bb.0:
8+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
9+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
10+
; GCN-NEXT: v_max_u32_e32 v0, v0, v2
11+
; GCN-NEXT: ; return to shader part epilog
12+
%add = add i32 %a, %b
13+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
14+
%ret = bitcast i32 %max to float
15+
ret float %ret
16+
}
17+
18+
; i32 add feeding umax where the first add input is an inreg (SGPR) argument
; and the other two operands are VGPRs.
define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
19+
; GCN-LABEL: add_max_u32_svv:
20+
; GCN: ; %bb.0:
21+
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
22+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
23+
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
24+
; GCN-NEXT: ; return to shader part epilog
25+
%add = add i32 %a, %b
26+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
27+
%ret = bitcast i32 %max to float
28+
ret float %ret
29+
}
30+
31+
; i32 add of two inreg (SGPR) arguments feeding umax with a VGPR operand. The
; checks expect a scalar add followed by a vector max.
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
32+
; GCN-LABEL: add_max_u32_ssv:
33+
; GCN: ; %bb.0:
34+
; GCN-NEXT: s_add_co_i32 s0, s0, s1
35+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
36+
; GCN-NEXT: v_max_u32_e32 v0, s0, v0
37+
; GCN-NEXT: ; return to shader part epilog
38+
%add = add i32 %a, %b
39+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
40+
%ret = bitcast i32 %max to float
41+
ret float %ret
42+
}
43+
44+
; i32 add feeding umax with all three operands inreg (SGPR). The checks expect
; the whole computation on the scalar unit, then a copy into v0 for the return.
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
45+
; GCN-LABEL: add_max_u32_sss:
46+
; GCN: ; %bb.0:
47+
; GCN-NEXT: s_add_co_i32 s0, s0, s1
48+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
49+
; GCN-NEXT: s_max_u32 s0, s0, s2
50+
; GCN-NEXT: v_mov_b32_e32 v0, s0
51+
; GCN-NEXT: ; return to shader part epilog
52+
%add = add i32 %a, %b
53+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
54+
%ret = bitcast i32 %max to float
55+
ret float %ret
56+
}
57+
58+
; i32 add of a VGPR and an inreg (SGPR) argument feeding umax with the
; constant 4, which the checks expect to appear as an immediate operand.
define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
58+
; GCN-LABEL: add_max_u32_vsi:
59+
; GCN: ; %bb.0:
60+
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
61+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
62+
; GCN-NEXT: v_max_u32_e32 v0, 4, v0
63+
; GCN-NEXT: ; return to shader part epilog
64+
%add = add i32 %a, %b
65+
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
66+
%ret = bitcast i32 %max to float
67+
ret float %ret
68+
}
70+
71+
; i32 add of an inreg (SGPR) and a VGPR argument feeding umax with the
; constant 100, which the checks expect to appear as the literal 0x64.
define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
71+
; GCN-LABEL: add_max_u32_svl:
72+
; GCN: ; %bb.0:
73+
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
74+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
75+
; GCN-NEXT: v_max_u32_e32 v0, 0x64, v0
76+
; GCN-NEXT: ; return to shader part epilog
77+
%add = add i32 %a, %b
78+
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
79+
%ret = bitcast i32 %max to float
80+
ret float %ret
81+
}
83+
84+
; i32 add of an inreg (SGPR) argument and the constant 100, feeding umax with
; a VGPR operand. The checks expect a scalar add-immediate then a vector max.
define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
84+
; GCN-LABEL: add_max_u32_slv:
85+
; GCN: ; %bb.0:
86+
; GCN-NEXT: s_addk_co_i32 s0, 0x64
87+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
88+
; GCN-NEXT: v_max_u32_e32 v0, s0, v0
89+
; GCN-NEXT: ; return to shader part epilog
90+
%add = add i32 %a, 100
91+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
92+
%ret = bitcast i32 %max to float
93+
ret float %ret
94+
}
96+
97+
; Signed variant: i32 add feeding smax with all operands in VGPRs. The checks
; expect separate v_add_nc_u32 and v_max_i32 instructions.
define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
98+
; GCN-LABEL: add_max_i32_vvv:
99+
; GCN: ; %bb.0:
100+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
101+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
102+
; GCN-NEXT: v_max_i32_e32 v0, v0, v2
103+
; GCN-NEXT: ; return to shader part epilog
104+
%add = add i32 %a, %b
105+
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
106+
%ret = bitcast i32 %max to float
107+
ret float %ret
108+
}
109+
110+
; i32 add feeding umin with all operands in VGPRs. The checks expect separate
; v_add_nc_u32 and v_min_u32 instructions.
define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
111+
; GCN-LABEL: add_min_u32_vvv:
112+
; GCN: ; %bb.0:
113+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
114+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
115+
; GCN-NEXT: v_min_u32_e32 v0, v0, v2
116+
; GCN-NEXT: ; return to shader part epilog
117+
%add = add i32 %a, %b
118+
%min = call i32 @llvm.umin.i32(i32 %add, i32 %c)
119+
%ret = bitcast i32 %min to float
120+
ret float %ret
121+
}
122+
123+
; Signed variant: i32 add feeding smin with all operands in VGPRs. The checks
; expect separate v_add_nc_u32 and v_min_i32 instructions.
define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
124+
; GCN-LABEL: add_min_i32_vvv:
125+
; GCN: ; %bb.0:
126+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
127+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
128+
; GCN-NEXT: v_min_i32_e32 v0, v0, v2
129+
; GCN-NEXT: ; return to shader part epilog
130+
%add = add i32 %a, %b
131+
%min = call i32 @llvm.smin.i32(i32 %add, i32 %c)
132+
%ret = bitcast i32 %min to float
133+
ret float %ret
134+
}
135+
136+
; <2 x i16> add feeding umax with all operands in VGPRs. The checks expect the
; single fused v_pk_add_max_u16 instruction.
define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
137+
; GCN-LABEL: add_max_v2u16_vvv:
138+
; GCN: ; %bb.0:
139+
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
140+
; GCN-NEXT: ; return to shader part epilog
141+
%add = add <2 x i16> %a, %b
142+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
143+
%ret = bitcast <2 x i16> %max to float
144+
ret float %ret
145+
}
146+
147+
; <2 x i16> add feeding umax where the first add input is an inreg (SGPR)
; argument. The checks expect a fused v_pk_add_max_u16 taking s0 directly.
define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x i16> %c) {
148+
; GCN-LABEL: add_max_v2u16_svv:
149+
; GCN: ; %bb.0:
150+
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
151+
; GCN-NEXT: ; return to shader part epilog
152+
%add = add <2 x i16> %a, %b
153+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
154+
%ret = bitcast <2 x i16> %max to float
155+
ret float %ret
156+
}
157+
158+
; <2 x i16> add of two inreg (SGPR) arguments feeding umax with a VGPR operand.
; The SelectionDAG checks expect the fused v_pk_add_max_u16; the GlobalISel
; checks expect a scalar add sequence followed by v_pk_max_u16.
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
159+
; SDAG-LABEL: add_max_v2u16_ssv:
160+
; SDAG: ; %bb.0:
161+
; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
162+
; SDAG-NEXT: ; return to shader part epilog
163+
;
164+
; GISEL-LABEL: add_max_v2u16_ssv:
165+
; GISEL: ; %bb.0:
166+
; GISEL-NEXT: s_lshr_b32 s2, s0, 16
167+
; GISEL-NEXT: s_lshr_b32 s3, s1, 16
168+
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
169+
; GISEL-NEXT: s_add_co_i32 s2, s2, s3
170+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
171+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
172+
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
173+
; GISEL-NEXT: ; return to shader part epilog
174+
%add = add <2 x i16> %a, %b
175+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
176+
%ret = bitcast <2 x i16> %max to float
177+
ret float %ret
178+
}
179+
180+
; <2 x i16> add feeding umax with all three operands inreg (SGPR). Neither
; selector's checks expect the fused instruction: SelectionDAG expects
; v_pk_add_u16 plus v_pk_max_u16, and GlobalISel expects a fully scalar
; sequence that is copied into v0 at the end.
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
181+
; SDAG-LABEL: add_max_v2u16_sss:
182+
; SDAG: ; %bb.0:
183+
; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
184+
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
185+
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
186+
; SDAG-NEXT: ; return to shader part epilog
187+
;
188+
; GISEL-LABEL: add_max_v2u16_sss:
189+
; GISEL: ; %bb.0:
190+
; GISEL-NEXT: s_lshr_b32 s3, s0, 16
191+
; GISEL-NEXT: s_lshr_b32 s4, s1, 16
192+
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
193+
; GISEL-NEXT: s_add_co_i32 s3, s3, s4
194+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
195+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
196+
; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
197+
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
198+
; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
199+
; GISEL-NEXT: s_lshr_b32 s2, s2, 16
200+
; GISEL-NEXT: s_max_u32 s0, s0, s3
201+
; GISEL-NEXT: s_max_u32 s1, s1, s2
202+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
203+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
204+
; GISEL-NEXT: v_mov_b32_e32 v0, s0
205+
; GISEL-NEXT: ; return to shader part epilog
206+
%add = add <2 x i16> %a, %b
207+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
208+
%ret = bitcast <2 x i16> %max to float
209+
ret float %ret
210+
}
211+
212+
; <2 x i16> add of a VGPR and an inreg (SGPR) argument feeding umax with the
; constant vector <4, 0>, which the checks expect as the immediate operand 4
; of a fused v_pk_add_max_u16.
define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
213+
; GCN-LABEL: add_max_v2u16_vsi:
214+
; GCN: ; %bb.0:
215+
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
216+
; GCN-NEXT: ; return to shader part epilog
217+
%add = add <2 x i16> %a, %b
218+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
219+
%ret = bitcast <2 x i16> %max to float
220+
ret float %ret
221+
}
222+
223+
; <2 x i16> add of an inreg (SGPR) and a VGPR argument feeding umax with the
; constant vector <100, 101>, which the checks expect as the packed literal
; 0x650064 of a fused v_pk_add_max_u16.
define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
224+
; GCN-LABEL: add_max_v2u16_svl:
225+
; GCN: ; %bb.0:
226+
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
227+
; GCN-NEXT: ; return to shader part epilog
228+
%add = add <2 x i16> %a, %b
229+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
230+
%ret = bitcast <2 x i16> %max to float
231+
ret float %ret
232+
}
233+
234+
; <2 x i16> add of an inreg (SGPR) argument and the splat constant <100, 100>,
; feeding umax with a VGPR operand. The SelectionDAG checks expect the fused
; v_pk_add_max_u16 with the packed literal 0x640064; the GlobalISel checks
; expect a scalar add sequence followed by v_pk_max_u16.
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
235+
; SDAG-LABEL: add_max_v2u16_slv:
236+
; SDAG: ; %bb.0:
237+
; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
238+
; SDAG-NEXT: ; return to shader part epilog
239+
;
240+
; GISEL-LABEL: add_max_v2u16_slv:
241+
; GISEL: ; %bb.0:
242+
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
243+
; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
244+
; GISEL-NEXT: s_addk_co_i32 s1, 0x64
245+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
246+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
247+
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
248+
; GISEL-NEXT: ; return to shader part epilog
249+
%add = add <2 x i16> %a, <i16 100, i16 100>
250+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
251+
%ret = bitcast <2 x i16> %max to float
252+
ret float %ret
253+
}
254+
255+
; Signed variant: <2 x i16> add feeding smax with all operands in VGPRs. The
; checks expect the single fused v_pk_add_max_i16 instruction.
define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
256+
; GCN-LABEL: add_max_v2s16_vvv:
257+
; GCN: ; %bb.0:
258+
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
259+
; GCN-NEXT: ; return to shader part epilog
260+
%add = add <2 x i16> %a, %b
261+
%max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
262+
%ret = bitcast <2 x i16> %max to float
263+
ret float %ret
264+
}
265+
266+
; <2 x i16> add feeding umin with all operands in VGPRs. The checks expect the
; single fused v_pk_add_min_u16 instruction.
define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
267+
; GCN-LABEL: add_min_v2u16_vvv:
268+
; GCN: ; %bb.0:
269+
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
270+
; GCN-NEXT: ; return to shader part epilog
271+
%add = add <2 x i16> %a, %b
272+
%min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
273+
%ret = bitcast <2 x i16> %min to float
274+
ret float %ret
275+
}
276+
277+
; Signed variant: <2 x i16> add feeding smin with all operands in VGPRs. The
; checks expect the single fused v_pk_add_min_i16 instruction.
define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
278+
; GCN-LABEL: add_min_v2s16_vvv:
279+
; GCN: ; %bb.0:
280+
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
281+
; GCN-NEXT: ; return to shader part epilog
282+
%add = add <2 x i16> %a, %b
283+
%min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
284+
%ret = bitcast <2 x i16> %min to float
285+
ret float %ret
286+
}
287+
288+
; Integer min/max intrinsic declarations used by the tests in this file. The
; name suffix of an overloaded intrinsic encodes its type, so the <2 x i16>
; variants are spelled .v2i16.
declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
289+
declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
290+
declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
291+
declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)
292+
declare i32 @llvm.smin.i32(i32, i32)
293+
declare i32 @llvm.smax.i32(i32, i32)
294+
declare i32 @llvm.umin.i32(i32, i32)
295+
declare i32 @llvm.umax.i32(i32, i32)

0 commit comments

Comments
 (0)