Skip to content

Commit 1f8eabd

Browse files
committed
[AMDGPU] Try to reuse the register holding the constant from the compare in v_cndmask.
For some targets, the optimization `X == Const ? X : Y` → `X == Const ? Const : Y` can cause extra register usage for the constant in `v_cndmask`. This patch detects such cases and reuses the register from the compare instruction that already holds the constant, instead of materializing it again. For SWDEV-506659.
1 parent da69147 commit 1f8eabd

File tree

2 files changed

+240
-4
lines changed

2 files changed

+240
-4
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1411,15 +1411,80 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
14111411
Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
14121412
return false;
14131413

1414+
// Try to find optimized Y == Const ? Const : Z. If Const can't be directly
1415+
// encoded in the cndmask, try to reuse a register already holding the Const
1416+
// value from the comparison instruction.
//
// SrcOp      - the cndmask operand to rewrite; must be a register operand.
// SrcImm     - the constant SrcOp is known to hold, or std::nullopt (callers
//              pass the result of getImmOrMaterializedImm).
// CmpOpcodes - the four compare opcodes accepted for the instruction that
//              defines the cndmask predicate (src2).
// CmpValName - accepted but never read anywhere in this body.
// Returns true after SrcOp's register has been replaced, false when any
// precondition below is not met (MI is left untouched in that case).
//
// NOTE(review): the caller's not-equal list includes V_CMP_LG_F32_e64, which
// is what `fcmp one` lowers to in this patch's test. If LG is an ordered
// compare it is false for NaN, so the cndmask would previously have selected
// the constant but now selects the (NaN) compared register - confirm.
// NOTE(review): constants are matched as int64_t bit patterns, while float
// compares treat +0.0 and -0.0 as equal; forwarding the compared register may
// change the sign of a zero result - confirm.
// NOTE(review): the caller's equal list contains V_CMP_EQ_F64_e64 where the
// not-equal list has only F32 opcodes - verify whether F32 was intended.
1417+
auto tryFoldCndMaskCmp =
1418+
[&](MachineOperand *SrcOp, std::optional<int64_t> SrcImm,
1419+
unsigned CmpOpcodes[4], AMDGPU::OpName CmpValName) -> bool {
1420+
// We'll try to process only register operands with known values.
1421+
if (!SrcImm || !SrcOp->isReg())
1422+
return false;
1423+
1424+
// Find the predicate of the cndmask instruction.
1425+
MachineOperand *PredOp = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1426+
if (!PredOp || !PredOp->isReg())
1427+
return false;
1428+
1429+
// The predicate must be produced by a compare instruction.
MachineInstr *PredI = MRI->getVRegDef(PredOp->getReg());
1430+
if (!PredI || !PredI->isCompare())
1431+
return false;
1432+
1433+
unsigned CmpOpc = PredI->getOpcode();
1434+
1435+
// Bail out unless the compare is one of the four opcodes the caller allows.
if (CmpOpc != CmpOpcodes[0] && CmpOpc != CmpOpcodes[1] &&
1436+
CmpOpc != CmpOpcodes[2] && CmpOpc != CmpOpcodes[3])
1437+
return false;
1438+
1439+
// Check if the immediate value of the source operand matches the immediate
1440+
// value of either the first or second operand of the comparison
1441+
// instruction.
// SubstOp ends up as the *other* compare operand, i.e. the one being compared
// against the constant.
// NOTE(review): unlike PredOp above, the getNamedOperand results for the
// compare's src0/src1 are dereferenced without a null check.
1442+
MachineOperand *SubstOp = nullptr;
1443+
std::optional<int64_t> CmpValImm = getImmOrMaterializedImm(
1444+
*TII->getNamedOperand(*PredI, AMDGPU::OpName::src0));
1445+
if (CmpValImm && *CmpValImm == *SrcImm) {
1446+
SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src1);
1447+
} else {
1448+
CmpValImm = getImmOrMaterializedImm(
1449+
*TII->getNamedOperand(*PredI, AMDGPU::OpName::src1));
1450+
if (CmpValImm && *CmpValImm == *SrcImm) {
1451+
SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src0);
1452+
} else {
1453+
return false;
1454+
}
1455+
}
1456+
1457+
// The replacement must itself be a register operand.
if (!SubstOp || !SubstOp->isReg())
1458+
return false;
1459+
1460+
LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1461+
// Only the register is copied from SubstOp.
// NOTE(review): any sub-register index or kill flag on SubstOp is not carried
// over or cleared here - confirm that is safe at this point in the pipeline.
SrcOp->setReg(SubstOp->getReg());
1462+
LLVM_DEBUG(dbgs() << MI);
1463+
return true;
1464+
};
1465+
14141466
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
14151467
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
14161468
if (!Src1->isIdenticalTo(*Src0)) {
1417-
std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1418-
if (!Src1Imm)
1419-
return false;
1469+
// Try to fold with not-equal comparisons
1470+
unsigned NECmpOpcodes[4] = {
1471+
AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_LG_F32_e64,
1472+
AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64};
14201473

14211474
std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1422-
if (!Src0Imm || *Src0Imm != *Src1Imm)
1475+
if (tryFoldCndMaskCmp(Src0, Src0Imm, NECmpOpcodes, AMDGPU::OpName::src1))
1476+
return true;
1477+
1478+
// Try to fold with equal comparisons
1479+
unsigned EQCmpOpcodes[4] = {
1480+
AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64,
1481+
AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_EQ_U32_e64};
1482+
1483+
std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1484+
if (tryFoldCndMaskCmp(Src1, Src1Imm, EQCmpOpcodes, AMDGPU::OpName::src0))
1485+
return true;
1486+
1487+
if (!Src0Imm || !Src1Imm || *Src0Imm != *Src1Imm)
14231488
return false;
14241489
}
14251490

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
4+
5+
define float @f32_oeq_v_i(float %arg, float %arg1) {
6+
; GFX9-LABEL: f32_oeq_v_i:
7+
; GFX9: ; %bb.0: ; %bb
8+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
10+
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
11+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
12+
; GFX9-NEXT: s_setpc_b64 s[30:31]
13+
;
14+
; GFX10-LABEL: f32_oeq_v_i:
15+
; GFX10: ; %bb.0: ; %bb
16+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17+
; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
18+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
19+
; GFX10-NEXT: s_setpc_b64 s[30:31]
20+
bb:
21+
%fcmp = fcmp oeq float %arg, 0x3FCF5C2900000000
22+
%select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
23+
ret float %select
24+
}
25+
26+
define float @f32_oeq_i_v(float %arg, float %arg1) {
27+
; GFX9-LABEL: f32_oeq_i_v:
28+
; GFX9: ; %bb.0: ; %bb
29+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30+
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
31+
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
32+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
33+
; GFX9-NEXT: s_setpc_b64 s[30:31]
34+
;
35+
; GFX10-LABEL: f32_oeq_i_v:
36+
; GFX10: ; %bb.0: ; %bb
37+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38+
; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
39+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
40+
; GFX10-NEXT: s_setpc_b64 s[30:31]
41+
bb:
42+
%fcmp = fcmp oeq float 0x3FCF5C2900000000, %arg
43+
%select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
44+
ret float %select
45+
}
46+
47+
define float @f32_one_v_i(float %arg, float %arg1) {
48+
; GFX9-LABEL: f32_one_v_i:
49+
; GFX9: ; %bb.0: ; %bb
50+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51+
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
52+
; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
53+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
54+
; GFX9-NEXT: s_setpc_b64 s[30:31]
55+
;
56+
; GFX10-LABEL: f32_one_v_i:
57+
; GFX10: ; %bb.0: ; %bb
58+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59+
; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
60+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
61+
; GFX10-NEXT: s_setpc_b64 s[30:31]
62+
bb:
63+
%fcmp = fcmp one float %arg, 0x3FCF5C2900000000
64+
%select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
65+
ret float %select
66+
}
67+
68+
define float @f32_one_i_v(float %arg, float %arg1) {
69+
; GFX9-LABEL: f32_one_i_v:
70+
; GFX9: ; %bb.0: ; %bb
71+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72+
; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
73+
; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
74+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
75+
; GFX9-NEXT: s_setpc_b64 s[30:31]
76+
;
77+
; GFX10-LABEL: f32_one_i_v:
78+
; GFX10: ; %bb.0: ; %bb
79+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80+
; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
81+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
82+
; GFX10-NEXT: s_setpc_b64 s[30:31]
83+
bb:
; Immediate-first operand order: the constant is the first fcmp operand here,
; mirroring f32_oeq_i_v, so this case differs from f32_one_v_i.
84+
%fcmp = fcmp one float 0x3FCF5C2900000000, %arg
85+
%select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
86+
ret float %select
87+
}
88+
89+
define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
90+
; GFX9-LABEL: i32_eq_v_i:
91+
; GFX9: ; %bb.0: ; %bb
92+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93+
; GFX9-NEXT: s_mov_b32 s4, 0x67932
94+
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
95+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
96+
; GFX9-NEXT: s_setpc_b64 s[30:31]
97+
;
98+
; GFX10-LABEL: i32_eq_v_i:
99+
; GFX10: ; %bb.0: ; %bb
100+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101+
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
102+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
103+
; GFX10-NEXT: s_setpc_b64 s[30:31]
104+
bb:
105+
%icmp = icmp eq i32 %arg, 424242
106+
%select = select i1 %icmp, i32 424242, i32 %arg1
107+
ret i32 %select
108+
}
109+
110+
define i32 @i32_eq_i_v(i32 %arg, i32 %arg1) {
111+
; GFX9-LABEL: i32_eq_i_v:
112+
; GFX9: ; %bb.0: ; %bb
113+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114+
; GFX9-NEXT: s_mov_b32 s4, 0x67932
115+
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
116+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
117+
; GFX9-NEXT: s_setpc_b64 s[30:31]
118+
;
119+
; GFX10-LABEL: i32_eq_i_v:
120+
; GFX10: ; %bb.0: ; %bb
121+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122+
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
123+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
124+
; GFX10-NEXT: s_setpc_b64 s[30:31]
125+
bb:
126+
%icmp = icmp eq i32 424242, %arg
127+
%select = select i1 %icmp, i32 424242, i32 %arg1
128+
ret i32 %select
129+
}
130+
131+
define i32 @i32_ne_v_i(i32 %arg, i32 %arg1) {
132+
; GFX9-LABEL: i32_ne_v_i:
133+
; GFX9: ; %bb.0: ; %bb
134+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135+
; GFX9-NEXT: s_mov_b32 s4, 0x67932
136+
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
137+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
138+
; GFX9-NEXT: s_setpc_b64 s[30:31]
139+
;
140+
; GFX10-LABEL: i32_ne_v_i:
141+
; GFX10: ; %bb.0: ; %bb
142+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143+
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
144+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
145+
; GFX10-NEXT: s_setpc_b64 s[30:31]
146+
bb:
147+
%icmp = icmp ne i32 %arg, 424242
148+
%select = select i1 %icmp, i32 %arg1, i32 424242
149+
ret i32 %select
150+
}
151+
152+
define i32 @i32_ne_i_v(i32 %arg, i32 %arg1) {
153+
; GFX9-LABEL: i32_ne_i_v:
154+
; GFX9: ; %bb.0: ; %bb
155+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156+
; GFX9-NEXT: s_mov_b32 s4, 0x67932
157+
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
158+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
159+
; GFX9-NEXT: s_setpc_b64 s[30:31]
160+
;
161+
; GFX10-LABEL: i32_ne_i_v:
162+
; GFX10: ; %bb.0: ; %bb
163+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164+
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
165+
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
166+
; GFX10-NEXT: s_setpc_b64 s[30:31]
167+
bb:
168+
%icmp = icmp ne i32 424242, %arg
169+
%select = select i1 %icmp, i32 %arg1, i32 424242
170+
ret i32 %select
171+
}

0 commit comments

Comments
 (0)