Skip to content

Commit c5881f2

Browse files
committed
Added support for half and i16 types
1 parent ffad027 commit c5881f2

File tree

2 files changed

+204
-8
lines changed

2 files changed

+204
-8
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1416,7 +1416,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
14161416
// value from the comparison instruction.
14171417
auto tryFoldCndMaskCmp =
14181418
[&](MachineOperand *SrcOp, std::optional<int64_t> SrcImm,
1419-
unsigned CmpOpcodes[4], AMDGPU::OpName CmpValName) -> bool {
1419+
ArrayRef<unsigned> CmpOpcodes, AMDGPU::OpName CmpValName) -> bool {
14201420
// We'll try to process only register operands with known values.
14211421
if (!SrcImm || !SrcOp->isReg())
14221422
return false;
@@ -1432,8 +1432,10 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
14321432

14331433
unsigned CmpOpc = PredI->getOpcode();
14341434

1435-
if (CmpOpc != CmpOpcodes[0] && CmpOpc != CmpOpcodes[1] &&
1436-
CmpOpc != CmpOpcodes[2] && CmpOpc != CmpOpcodes[3])
1435+
// Check if the comparison instruction is one of the expected ones.
1436+
const auto *CmpOpcI = find_if(
1437+
CmpOpcodes, [CmpOpc](unsigned Opcode) { return Opcode == CmpOpc; });
1438+
if (CmpOpcI == CmpOpcodes.end())
14371439
return false;
14381440

14391441
// Check if the immediate value of the source operand matches the immediate
@@ -1467,18 +1469,21 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
14671469
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
14681470
if (!Src1->isIdenticalTo(*Src0)) {
14691471
// Try to fold with not-equal comparisons
1470-
unsigned NECmpOpcodes[4] = {
1472+
unsigned NECmpOpcodes[] = {
14711473
AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_LG_F32_e64,
1472-
AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64};
1474+
AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64,
1475+
AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_I16_e64,
1476+
AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_LG_F16_e64};
14731477

14741478
std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
14751479
if (tryFoldCndMaskCmp(Src0, Src0Imm, NECmpOpcodes, AMDGPU::OpName::src1))
14761480
return true;
14771481

14781482
// Try to fold with equal comparisons
1479-
unsigned EQCmpOpcodes[4] = {
1480-
AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64,
1481-
AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_EQ_U32_e64};
1483+
unsigned EQCmpOpcodes[] = {
1484+
AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_I32_e64,
1485+
AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U16_e64,
1486+
AMDGPU::V_CMP_EQ_I16_e64, AMDGPU::V_CMP_EQ_F16_e64};
14821487

14831488
std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
14841489
if (tryFoldCndMaskCmp(Src1, Src1Imm, EQCmpOpcodes, AMDGPU::OpName::src0))

llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,29 @@
22
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
33
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
44

5+
; bfloat variant: `fcmp oeq` of %arg against the constant 0xR4242, with the
; select returning %arg itself (not the constant) when the compare is true.
; In the expected output for both targets the compare is performed as f32 on
; %arg shifted left by 16, and v_cndmask still selects between v1 and v0.
define bfloat @bf16_oeq_v_i(bfloat %arg, bfloat %arg1) {
6+
; GFX9-LABEL: bf16_oeq_v_i:
7+
; GFX9: ; %bb.0: ; %bb
8+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
10+
; GFX9-NEXT: s_mov_b32 s4, 0x42420000
11+
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, s4, v2
12+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
13+
; GFX9-NEXT: s_setpc_b64 s[30:31]
14+
;
15+
; GFX10-LABEL: bf16_oeq_v_i:
16+
; GFX10: ; %bb.0: ; %bb
17+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18+
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
19+
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42420000, v2
20+
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
21+
; GFX10-NEXT: s_setpc_b64 s[30:31]
22+
bb:
23+
%fcmp = fcmp oeq bfloat %arg, 0xR4242
24+
%select = select i1 %fcmp, bfloat %arg, bfloat %arg1
25+
ret bfloat %select
26+
}
27+
528
define float @f32_oeq_v_i(float %arg, float %arg1) {
629
; GFX9-LABEL: f32_oeq_v_i:
730
; GFX9: ; %bb.0: ; %bb
@@ -86,6 +109,90 @@ bb:
86109
ret float %select
87110
}
88111

112+
; half variant, value-first compare: `fcmp oeq %arg, 42.0` feeds a select whose
; true value is the same constant 42.0 (0x5140 as a half bit pattern).
; The expected output uses the inverted compare (v_cmp_neq_f16) with the
; v_cndmask operands swapped; on gfx900 the v_cndmask reads v0 in place of the
; constant, while on gfx1030 it keeps the 0x5140 literal.
define half @f16_oeq_v_i(half %arg, half %arg1) {
113+
; GFX9-LABEL: f16_oeq_v_i:
114+
; GFX9: ; %bb.0: ; %bb
115+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116+
; GFX9-NEXT: s_movk_i32 s4, 0x5140
117+
; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0
118+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
119+
; GFX9-NEXT: s_setpc_b64 s[30:31]
120+
;
121+
; GFX10-LABEL: f16_oeq_v_i:
122+
; GFX10: ; %bb.0: ; %bb
123+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124+
; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x5140, v0
125+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
126+
; GFX10-NEXT: s_setpc_b64 s[30:31]
127+
bb:
128+
%fcmp = fcmp oeq half %arg, 42.0
129+
%select = select i1 %fcmp, half 42.0, half %arg1
130+
ret half %select
131+
}
132+
133+
; half variant, immediate-first compare: same as f16_oeq_v_i but with the
; constant 42.0 as the first `fcmp oeq` operand. The expected output is the
; same instruction sequence as for the value-first form.
define half @f16_oeq_i_v(half %arg, half %arg1) {
134+
; GFX9-LABEL: f16_oeq_i_v:
135+
; GFX9: ; %bb.0: ; %bb
136+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137+
; GFX9-NEXT: s_movk_i32 s4, 0x5140
138+
; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0
139+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
140+
; GFX9-NEXT: s_setpc_b64 s[30:31]
141+
;
142+
; GFX10-LABEL: f16_oeq_i_v:
143+
; GFX10: ; %bb.0: ; %bb
144+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
145+
; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x5140, v0
146+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
147+
; GFX10-NEXT: s_setpc_b64 s[30:31]
148+
bb:
149+
%fcmp = fcmp oeq half 42.0, %arg
150+
%select = select i1 %fcmp, half 42.0, half %arg1
151+
ret half %select
152+
}
153+
154+
; half variant, value-first not-equal compare: `fcmp one %arg, 42.0` feeds a
; select whose false value is the same constant 42.0 (0x5140 as a half bit
; pattern). The expected output keeps v_cmp_lg_f16; on gfx900 the v_cndmask
; reads v0 in place of the constant, while on gfx1030 it keeps the 0x5140
; literal.
define half @f16_one_v_i(half %arg, half %arg1) {
155+
; GFX9-LABEL: f16_one_v_i:
156+
; GFX9: ; %bb.0: ; %bb
157+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158+
; GFX9-NEXT: s_movk_i32 s4, 0x5140
159+
; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0
160+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
161+
; GFX9-NEXT: s_setpc_b64 s[30:31]
162+
;
163+
; GFX10-LABEL: f16_one_v_i:
164+
; GFX10: ; %bb.0: ; %bb
165+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166+
; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x5140, v0
167+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
168+
; GFX10-NEXT: s_setpc_b64 s[30:31]
169+
bb:
170+
%fcmp = fcmp one half %arg, 42.0
171+
%select = select i1 %fcmp, half %arg1, half 42.0
172+
ret half %select
173+
}
174+
175+
; half variant, immediate-first not-equal compare: the constant 42.0 (0x5140
; as a half bit pattern) is the FIRST `fcmp one` operand, and the select's
; false value is the same constant. This is the commuted counterpart of
; f16_one_v_i; the compare operands must stay in immediate-first order so the
; test does not merely duplicate the value-first case.
; NOTE(review): the expected-output lines are left as they were, on the
; assumption that the commuted compare selects the same instructions (as it
; does for the oeq pair) - regenerate with update_llc_test_checks.py to confirm.
define half @f16_one_i_v(half %arg, half %arg1) {
176+
; GFX9-LABEL: f16_one_i_v:
177+
; GFX9: ; %bb.0: ; %bb
178+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179+
; GFX9-NEXT: s_movk_i32 s4, 0x5140
180+
; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0
181+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
182+
; GFX9-NEXT: s_setpc_b64 s[30:31]
183+
;
184+
; GFX10-LABEL: f16_one_i_v:
185+
; GFX10: ; %bb.0: ; %bb
186+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
187+
; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x5140, v0
188+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x5140, v1, vcc_lo
189+
; GFX10-NEXT: s_setpc_b64 s[30:31]
190+
bb:
191+
%fcmp = fcmp one half 42.0, %arg
192+
%select = select i1 %fcmp, half %arg1, half 42.0
193+
ret half %select
194+
}
195+
89196
define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
90197
; GFX9-LABEL: i32_eq_v_i:
91198
; GFX9: ; %bb.0: ; %bb
@@ -169,3 +276,87 @@ bb:
169276
%select = select i1 %icmp, i32 %arg1, i32 424242
170277
ret i32 %select
171278
}
279+
280+
; i16 variant, value-first compare: `icmp eq %arg, 4242` feeds a select whose
; true value is the same constant 4242 (0x1092). The expected output uses the
; inverted compare (v_cmp_ne_u16) with the v_cndmask operands swapped; on
; gfx900 the v_cndmask reads v0 in place of the constant, while on gfx1030 it
; keeps the 0x1092 literal.
define i16 @i16_eq_v_i(i16 %arg, i16 %arg1) {
281+
; GFX9-LABEL: i16_eq_v_i:
282+
; GFX9: ; %bb.0: ; %bb
283+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
284+
; GFX9-NEXT: s_movk_i32 s4, 0x1092
285+
; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
286+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
287+
; GFX9-NEXT: s_setpc_b64 s[30:31]
288+
;
289+
; GFX10-LABEL: i16_eq_v_i:
290+
; GFX10: ; %bb.0: ; %bb
291+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292+
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
293+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
294+
; GFX10-NEXT: s_setpc_b64 s[30:31]
295+
bb:
296+
%icmp = icmp eq i16 %arg, 4242
297+
%select = select i1 %icmp, i16 4242, i16 %arg1
298+
ret i16 %select
299+
}
300+
301+
; i16 variant, immediate-first compare: same as i16_eq_v_i but with the
; constant 4242 (0x1092) as the first `icmp eq` operand. The expected output
; is the same instruction sequence as for the value-first form.
define i16 @i16_eq_i_v(i16 %arg, i16 %arg1) {
302+
; GFX9-LABEL: i16_eq_i_v:
303+
; GFX9: ; %bb.0: ; %bb
304+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305+
; GFX9-NEXT: s_movk_i32 s4, 0x1092
306+
; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
307+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
308+
; GFX9-NEXT: s_setpc_b64 s[30:31]
309+
;
310+
; GFX10-LABEL: i16_eq_i_v:
311+
; GFX10: ; %bb.0: ; %bb
312+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
313+
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
314+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
315+
; GFX10-NEXT: s_setpc_b64 s[30:31]
316+
bb:
317+
%icmp = icmp eq i16 4242, %arg
318+
%select = select i1 %icmp, i16 4242, i16 %arg1
319+
ret i16 %select
320+
}
321+
322+
; i16 variant, value-first not-equal compare: `icmp ne %arg, 4242` feeds a
; select whose false value is the same constant 4242 (0x1092). The expected
; output keeps v_cmp_ne_u16; on gfx900 the v_cndmask reads v0 in place of the
; constant, while on gfx1030 it keeps the 0x1092 literal.
define i16 @i16_ne_v_i(i16 %arg, i16 %arg1) {
323+
; GFX9-LABEL: i16_ne_v_i:
324+
; GFX9: ; %bb.0: ; %bb
325+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326+
; GFX9-NEXT: s_movk_i32 s4, 0x1092
327+
; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
328+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
329+
; GFX9-NEXT: s_setpc_b64 s[30:31]
330+
;
331+
; GFX10-LABEL: i16_ne_v_i:
332+
; GFX10: ; %bb.0: ; %bb
333+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334+
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
335+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
336+
; GFX10-NEXT: s_setpc_b64 s[30:31]
337+
bb:
338+
%icmp = icmp ne i16 %arg, 4242
339+
%select = select i1 %icmp, i16 %arg1, i16 4242
340+
ret i16 %select
341+
}
342+
343+
; i16 variant, immediate-first not-equal compare: same as i16_ne_v_i but with
; the constant 4242 (0x1092) as the first `icmp ne` operand. The expected
; output is the same instruction sequence as for the value-first form.
define i16 @i16_ne_i_v(i16 %arg, i16 %arg1) {
344+
; GFX9-LABEL: i16_ne_i_v:
345+
; GFX9: ; %bb.0: ; %bb
346+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347+
; GFX9-NEXT: s_movk_i32 s4, 0x1092
348+
; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0
349+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
350+
; GFX9-NEXT: s_setpc_b64 s[30:31]
351+
;
352+
; GFX10-LABEL: i16_ne_i_v:
353+
; GFX10: ; %bb.0: ; %bb
354+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355+
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0
356+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo
357+
; GFX10-NEXT: s_setpc_b64 s[30:31]
358+
bb:
359+
%icmp = icmp ne i16 4242, %arg
360+
%select = select i1 %icmp, i16 %arg1, i16 4242
361+
ret i16 %select
362+
}

0 commit comments

Comments
 (0)