Skip to content

Commit d385e9d

Browse files
changpeng and rampitec authored
AMDGPU: Support V_PK_ADD_{MIN|MAX}_{I|U}16 and V_PK_{MIN|MAX}3_{I|U}16 on gfx1250 (#150155)
Co-authored-by: Stanislav Mekhanoshin <[email protected]>
1 parent 0f23569 commit d385e9d

File tree

9 files changed

+1331
-14
lines changed

9 files changed

+1331
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2519,6 +2519,14 @@ def HasFmaakFmamkF64Insts :
25192519
Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
25202520
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
25212521

2522+
// Predicate guarding the V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
// Code generation asks Subtarget->hasPkAddMinMaxInsts(); the assembler side
// is keyed on FeatureGFX1250Insts.
def HasPkAddMinMaxInsts :
2523+
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
2524+
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
2525+
2526+
// Predicate guarding the V_PK_{MIN|MAX}3_{I|U}16 instructions.
// Code generation asks Subtarget->hasPkMinMax3Insts(); the assembler side
// is keyed on FeatureGFX1250Insts.
def HasPkMinMax3Insts :
2527+
Predicate<"Subtarget->hasPkMinMax3Insts()">,
2528+
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;
2529+
25222530
def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
25232531
AssemblerPredicate<(all_of FeatureImageInsts)>;
25242532

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1500,6 +1500,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
15001500

15011501
bool hasVOPD3() const { return GFX1250Insts; }
15021502

1503+
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
// Currently this simply mirrors the GFX1250Insts member.
1504+
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
1505+
1506+
// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
// Currently this simply mirrors the GFX1250Insts member.
1507+
bool hasPkMinMax3Insts() const { return GFX1250Insts; }
1508+
15031509
// \returns true if target has S_SETPRIO_INC_WG instruction.
15041510
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
15051511

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,49 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
353353
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
354354
}
355355

356+
// Packed VOP3P profile built on VOP_V2I16_V2I16_V2I16_V2I16 with
// HasModifiers cleared. Despite the ADD_MINMAX name it is shared by both the
// V_PK_ADD_{MIN|MAX}_* and the V_PK_{MIN|MAX}3_* definitions that follow.
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
357+
let HasModifiers = 0;
358+
}
359+
360+
// Pseudo definitions for the eight packed 16-bit integer three-source
// instructions. All of them are marked commutable and rematerializable and
// use PK_ADD_MINMAX_Profile; they differ only in the subtarget predicate.
let isCommutable = 1, isReMaterializable = 1 in {
361+
// Packed add followed by min/max.
let SubtargetPredicate = HasPkAddMinMaxInsts in {
362+
defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
363+
defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
364+
defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
365+
defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
366+
}
367+
// Packed three-input min/max.
let SubtargetPredicate = HasPkMinMax3Insts in {
368+
defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
369+
defm V_PK_MAX3_U16 : VOP3PInst<"v_pk_max3_u16", PK_ADD_MINMAX_Profile>;
370+
defm V_PK_MIN3_I16 : VOP3PInst<"v_pk_min3_i16", PK_ADD_MINMAX_Profile>;
371+
defm V_PK_MIN3_U16 : VOP3PInst<"v_pk_min3_u16", PK_ADD_MINMAX_Profile>;
372+
}
373+
} // End isCommutable = 1, isReMaterializable = 1
374+
375+
// Selection pattern for a three-source packed instruction.
//   op1, op2 - the two operators handed to ThreeOpFrag.
//   inst     - the VOP3P pseudo to emit.
//   vt       - operand type; defaults to the pseudo's Src0VT.
//   RC       - source operand class; defaults to getVCSrcForVT<vt>.ret.
// Every source is emitted with SRCMODS.OP_SEL_1 and the result with
// DSTCLAMP.NONE.
// NOTE(review): ThreeOpFrag is defined elsewhere; presumably it matches
// op2(op1(src0, src1), src2) - confirm.
// NOTE(review): the meaning of the trailing 0 operand is not visible here -
// confirm against the VOP3P operand list.
// TODO: Extend pattern to select op_sel and op_sel_hi.
376+
class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
377+
VOP3P_Pseudo inst,
378+
ValueType vt = inst.Pfl.Src0VT,
379+
RegisterOperand RC = getVCSrcForVT<vt>.ret> : GCNPat <
380+
(ThreeOpFrag<op1, op2> vt:$src0, vt:$src1, vt:$src2),
381+
(inst SRCMODS.OP_SEL_1, RC:$src0, SRCMODS.OP_SEL_1, RC:$src1,
382+
SRCMODS.OP_SEL_1, RC:$src2, DSTCLAMP.NONE, 0)
383+
>;
384+
385+
// Select add combined with signed/unsigned max/min into the matching
// V_PK_ADD_{MAX|MIN}_{I|U}16 pseudo.
let SubtargetPredicate = HasPkAddMinMaxInsts in {
386+
def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
387+
def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
388+
def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
389+
def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
390+
}
391+
392+
// Select two chained min/max operations of the same kind into the matching
// V_PK_{MAX|MIN}3_{I|U}16 pseudo.
let SubtargetPredicate = HasPkMinMax3Insts in {
393+
def : ThreeOp_OpSelClampPats<smax, smax, V_PK_MAX3_I16>;
394+
def : ThreeOp_OpSelClampPats<umax, umax, V_PK_MAX3_U16>;
395+
def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>;
396+
def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>;
397+
}
398+
356399
// Defines patterns that extract signed 4bit from each Idx[0].
357400
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
358401
def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
@@ -2157,6 +2200,8 @@ multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op>
21572200

21582201
multiclass VOP3P_Real_gfx12<bits<8> op> : VOP3P_Real_Base<GFX12Gen, op>;
21592202

2203+
// Real-encoding helper: VOP3P_Real_Base instantiated for GFX1250Gen with the
// given 8-bit opcode.
multiclass VOP3P_Real_gfx1250<bits<8> op> : VOP3P_Real_Base<GFX1250Gen, op>;
2204+
21602205
multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
21612206
string backing_ps_name = NAME,
21622207
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
@@ -2165,6 +2210,15 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
21652210
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
21662211
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
21672212

2213+
// GFX1250 encodings for the packed add-min/max and min3/max3 instructions.
// The opcodes are not contiguous: ADD_MAX uses 0x14-0x15, the remaining six
// use 0x2d-0x32.
defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>;
2214+
defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>;
2215+
defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>;
2216+
defm V_PK_ADD_MIN_U16 : VOP3P_Real_gfx1250<0x2e>;
2217+
defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>;
2218+
defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>;
2219+
defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>;
2220+
defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>;
2221+
21682222
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
21692223
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
21702224

llvm/test/CodeGen/AMDGPU/add-max.ll

Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GISEL %s
4+
5+
; Scalar i32 add feeding umax, no inreg arguments. The expected output keeps
; the add and the max as two separate VALU instructions.
define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
6+
; GCN-LABEL: add_max_u32_vvv:
7+
; GCN: ; %bb.0:
8+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
9+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
10+
; GCN-NEXT: v_max_u32_e32 v0, v0, v2
11+
; GCN-NEXT: ; return to shader part epilog
12+
%add = add i32 %a, %b
13+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
14+
%ret = bitcast i32 %max to float
15+
ret float %ret
16+
}
17+
18+
; Same i32 add + umax, with the first add operand passed inreg. The expected
; output uses s0 directly as an operand of the VALU add.
define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
19+
; GCN-LABEL: add_max_u32_svv:
20+
; GCN: ; %bb.0:
21+
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
22+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
23+
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
24+
; GCN-NEXT: ; return to shader part epilog
25+
%add = add i32 %a, %b
26+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
27+
%ret = bitcast i32 %max to float
28+
ret float %ret
29+
}
30+
31+
; i32 add + umax with both add operands inreg. The expected output performs
; the add with a scalar instruction and the max with a VALU instruction.
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
32+
; GCN-LABEL: add_max_u32_ssv:
33+
; GCN: ; %bb.0:
34+
; GCN-NEXT: s_add_co_i32 s0, s0, s1
35+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
36+
; GCN-NEXT: v_max_u32_e32 v0, s0, v0
37+
; GCN-NEXT: ; return to shader part epilog
38+
%add = add i32 %a, %b
39+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
40+
%ret = bitcast i32 %max to float
41+
ret float %ret
42+
}
43+
44+
; i32 add + umax with all three operands inreg. The expected output does both
; operations with scalar instructions and then copies the result to v0.
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
45+
; GCN-LABEL: add_max_u32_sss:
46+
; GCN: ; %bb.0:
47+
; GCN-NEXT: s_add_co_i32 s0, s0, s1
48+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
49+
; GCN-NEXT: s_max_u32 s0, s0, s2
50+
; GCN-NEXT: v_mov_b32_e32 v0, s0
51+
; GCN-NEXT: ; return to shader part epilog
52+
%add = add i32 %a, %b
53+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
54+
%ret = bitcast i32 %max to float
55+
ret float %ret
56+
}
57+
58+
; i32 add of a regular and an inreg argument, then umax against the constant
; 4. The expected output encodes 4 directly as an operand of the max.
define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
58+
; GCN-LABEL: add_max_u32_vsi:
59+
; GCN: ; %bb.0:
60+
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
61+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
62+
; GCN-NEXT: v_max_u32_e32 v0, 4, v0
63+
; GCN-NEXT: ; return to shader part epilog
64+
%add = add i32 %a, %b
65+
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
66+
%ret = bitcast i32 %max to float
67+
ret float %ret
68+
}
70+
71+
; i32 add of an inreg and a regular argument, then umax against the constant
; 100, which appears as 0x64 in the expected max instruction.
define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
71+
; GCN-LABEL: add_max_u32_svl:
72+
; GCN: ; %bb.0:
73+
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
74+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
75+
; GCN-NEXT: v_max_u32_e32 v0, 0x64, v0
76+
; GCN-NEXT: ; return to shader part epilog
77+
%add = add i32 %a, %b
78+
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
79+
%ret = bitcast i32 %max to float
80+
ret float %ret
81+
}
83+
84+
; i32 add of an inreg argument and the constant 100, then umax against a
; regular argument. The expected output does the add with a scalar
; instruction.
define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
84+
; GCN-LABEL: add_max_u32_slv:
85+
; GCN: ; %bb.0:
86+
; GCN-NEXT: s_addk_co_i32 s0, 0x64
87+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
88+
; GCN-NEXT: v_max_u32_e32 v0, s0, v0
89+
; GCN-NEXT: ; return to shader part epilog
90+
%add = add i32 %a, 100
91+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
92+
%ret = bitcast i32 %max to float
93+
ret float %ret
94+
}
96+
97+
; Signed variant: i32 add feeding smax, no inreg arguments. The expected
; output keeps a separate add and a signed max instruction.
define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
97+
; GCN-LABEL: add_max_i32_vvv:
98+
; GCN: ; %bb.0:
99+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
100+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
101+
; GCN-NEXT: v_max_i32_e32 v0, v0, v2
102+
; GCN-NEXT: ; return to shader part epilog
103+
%add = add i32 %a, %b
104+
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
105+
%ret = bitcast i32 %max to float
106+
ret float %ret
107+
}
109+
110+
; i32 add feeding umin, no inreg arguments. The expected output keeps a
; separate add and an unsigned min instruction.
define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
111+
; GCN-LABEL: add_min_u32_vvv:
112+
; GCN: ; %bb.0:
113+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
114+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
115+
; GCN-NEXT: v_min_u32_e32 v0, v0, v2
116+
; GCN-NEXT: ; return to shader part epilog
117+
%add = add i32 %a, %b
118+
%min = call i32 @llvm.umin.i32(i32 %add, i32 %c)
119+
%ret = bitcast i32 %min to float
120+
ret float %ret
121+
}
122+
123+
; i32 add feeding smin, no inreg arguments. The expected output keeps a
; separate add and a signed min instruction.
define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
124+
; GCN-LABEL: add_min_i32_vvv:
125+
; GCN: ; %bb.0:
126+
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
127+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
128+
; GCN-NEXT: v_min_i32_e32 v0, v0, v2
129+
; GCN-NEXT: ; return to shader part epilog
130+
%add = add i32 %a, %b
131+
%min = call i32 @llvm.smin.i32(i32 %add, i32 %c)
132+
%ret = bitcast i32 %min to float
133+
ret float %ret
134+
}
135+
136+
; Packed <2 x i16> add feeding umax, no inreg arguments. The expected output
; is a single v_pk_add_max_u16.
define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
137+
; GCN-LABEL: add_max_v2u16_vvv:
138+
; GCN: ; %bb.0:
139+
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
140+
; GCN-NEXT: ; return to shader part epilog
141+
%add = add <2 x i16> %a, %b
142+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
143+
%ret = bitcast <2 x i16> %max to float
144+
ret float %ret
145+
}
146+
147+
; Packed add + umax with the first add operand inreg. The expected output is
; a single v_pk_add_max_u16 taking s0 as its first source.
define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x i16> %c) {
148+
; GCN-LABEL: add_max_v2u16_svv:
149+
; GCN: ; %bb.0:
150+
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
151+
; GCN-NEXT: ; return to shader part epilog
152+
%add = add <2 x i16> %a, %b
153+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
154+
%ret = bitcast <2 x i16> %max to float
155+
ret float %ret
156+
}
157+
158+
; Packed add + umax with both add operands inreg. The two selectors differ:
; SelectionDAG is expected to emit one v_pk_add_max_u16, while GlobalISel is
; expected to do the add with scalar instructions and emit only v_pk_max_u16.
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
159+
; SDAG-LABEL: add_max_v2u16_ssv:
160+
; SDAG: ; %bb.0:
161+
; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
162+
; SDAG-NEXT: ; return to shader part epilog
163+
;
164+
; GISEL-LABEL: add_max_v2u16_ssv:
165+
; GISEL: ; %bb.0:
166+
; GISEL-NEXT: s_lshr_b32 s2, s0, 16
167+
; GISEL-NEXT: s_lshr_b32 s3, s1, 16
168+
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
169+
; GISEL-NEXT: s_add_co_i32 s2, s2, s3
170+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
171+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
172+
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
173+
; GISEL-NEXT: ; return to shader part epilog
174+
%add = add <2 x i16> %a, %b
175+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
176+
%ret = bitcast <2 x i16> %max to float
177+
ret float %ret
178+
}
179+
180+
; Packed add + umax with all three operands inreg. Neither selector is
; expected to form v_pk_add_max_u16 here: SelectionDAG emits a separate
; v_pk_add_u16 and v_pk_max_u16, and GlobalISel stays on scalar instructions.
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
181+
; SDAG-LABEL: add_max_v2u16_sss:
182+
; SDAG: ; %bb.0:
183+
; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
184+
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
185+
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
186+
; SDAG-NEXT: ; return to shader part epilog
187+
;
188+
; GISEL-LABEL: add_max_v2u16_sss:
189+
; GISEL: ; %bb.0:
190+
; GISEL-NEXT: s_lshr_b32 s3, s0, 16
191+
; GISEL-NEXT: s_lshr_b32 s4, s1, 16
192+
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
193+
; GISEL-NEXT: s_add_co_i32 s3, s3, s4
194+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
195+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
196+
; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
197+
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
198+
; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
199+
; GISEL-NEXT: s_lshr_b32 s2, s2, 16
200+
; GISEL-NEXT: s_max_u32 s0, s0, s3
201+
; GISEL-NEXT: s_max_u32 s1, s1, s2
202+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
203+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
204+
; GISEL-NEXT: v_mov_b32_e32 v0, s0
205+
; GISEL-NEXT: ; return to shader part epilog
206+
%add = add <2 x i16> %a, %b
207+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
208+
%ret = bitcast <2 x i16> %max to float
209+
ret float %ret
210+
}
211+
212+
; Packed add + umax against the constant vector <4, 0>. The expected output
; is a single v_pk_add_max_u16 whose third source is printed as 4.
define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
213+
; GCN-LABEL: add_max_v2u16_vsi:
214+
; GCN: ; %bb.0:
215+
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
216+
; GCN-NEXT: ; return to shader part epilog
217+
%add = add <2 x i16> %a, %b
218+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
219+
%ret = bitcast <2 x i16> %max to float
220+
ret float %ret
221+
}
222+
223+
; Packed add + umax against the constant vector <100, 101>. The expected
; output is a single v_pk_add_max_u16 with the constant as literal 0x650064.
define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
224+
; GCN-LABEL: add_max_v2u16_svl:
225+
; GCN: ; %bb.0:
226+
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
227+
; GCN-NEXT: ; return to shader part epilog
228+
%add = add <2 x i16> %a, %b
229+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
230+
%ret = bitcast <2 x i16> %max to float
231+
ret float %ret
232+
}
233+
234+
; Packed add of an inreg argument and the constant vector <100, 100>, then
; umax against a regular argument. SelectionDAG is expected to emit one
; v_pk_add_max_u16 with literal 0x640064; GlobalISel is expected to add with
; scalar instructions and emit only v_pk_max_u16.
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
235+
; SDAG-LABEL: add_max_v2u16_slv:
236+
; SDAG: ; %bb.0:
237+
; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
238+
; SDAG-NEXT: ; return to shader part epilog
239+
;
240+
; GISEL-LABEL: add_max_v2u16_slv:
241+
; GISEL: ; %bb.0:
242+
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
243+
; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
244+
; GISEL-NEXT: s_addk_co_i32 s1, 0x64
245+
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
246+
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
247+
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
248+
; GISEL-NEXT: ; return to shader part epilog
249+
%add = add <2 x i16> %a, <i16 100, i16 100>
250+
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
251+
%ret = bitcast <2 x i16> %max to float
252+
ret float %ret
253+
}
254+
255+
; Signed variant: packed add feeding smax, no inreg arguments. The expected
; output is a single v_pk_add_max_i16.
define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
256+
; GCN-LABEL: add_max_v2s16_vvv:
257+
; GCN: ; %bb.0:
258+
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
259+
; GCN-NEXT: ; return to shader part epilog
260+
%add = add <2 x i16> %a, %b
261+
%max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
262+
%ret = bitcast <2 x i16> %max to float
263+
ret float %ret
264+
}
265+
266+
; Packed add feeding umin, no inreg arguments. The expected output is a
; single v_pk_add_min_u16.
define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
267+
; GCN-LABEL: add_min_v2u16_vvv:
268+
; GCN: ; %bb.0:
269+
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
270+
; GCN-NEXT: ; return to shader part epilog
271+
%add = add <2 x i16> %a, %b
272+
%min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
273+
%ret = bitcast <2 x i16> %min to float
274+
ret float %ret
275+
}
276+
277+
; Packed add feeding smin, no inreg arguments. The expected output is a
; single v_pk_add_min_i16.
define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
278+
; GCN-LABEL: add_min_v2s16_vvv:
279+
; GCN: ; %bb.0:
280+
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
281+
; GCN-NEXT: ; return to shader part epilog
282+
%add = add <2 x i16> %a, %b
283+
%min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
284+
%ret = bitcast <2 x i16> %min to float
285+
ret float %ret
286+
}
287+
288+
; Integer min/max intrinsics used by the tests. The vector overloads carry
; the v2i16 suffix that matches their <2 x i16> operand type.
declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
289+
declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
290+
declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
291+
declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)
292+
declare i32 @llvm.smin.i32(i32, i32)
293+
declare i32 @llvm.smax.i32(i32, i32)
294+
declare i32 @llvm.umin.i32(i32, i32)
295+
declare i32 @llvm.umax.i32(i32, i32)

0 commit comments

Comments
 (0)