Skip to content

Commit b3fb40b

Browse files
dfukalov authored and zmodem committed
[AMDGPU] Fix for folding v2.16 literals.
It was found that some packed immediate operands (e.g. `<half 1.0, half 2.0>`) were processed incorrectly, so that one of the two packed values was lost. Introduced a new function to check whether an immediate 32-bit operand can be folded. Converted the condition on the current op_sel flag values to a fall-through. Fixes: SWDEV-247595. Reviewed By: rampitec. Differential Revision: https://reviews.llvm.org/D87158 (cherry picked from commit d03c4034dc80c944ec4a5833ba8f87d60183f866)
1 parent 01be54e commit b3fb40b

File tree

4 files changed

+40
-24
lines changed

4 files changed

+40
-24
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,8 @@ static bool updateOperand(FoldCandidate &Fold,
192192
if (Fold.isImm()) {
193193
if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
194194
!(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
195-
AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
196-
ST.hasInv2PiInlineImm())) {
195+
AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
196+
ST.hasInv2PiInlineImm())) {
197197
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
198198
// already set.
199199
unsigned Opcode = MI->getOpcode();
@@ -209,30 +209,30 @@ static bool updateOperand(FoldCandidate &Fold,
209209
ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
210210
MachineOperand &Mod = MI->getOperand(ModIdx);
211211
unsigned Val = Mod.getImm();
212-
if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
213-
return false;
214-
// Only apply the following transformation if that operand requries
215-
// a packed immediate.
216-
switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
217-
case AMDGPU::OPERAND_REG_IMM_V2FP16:
218-
case AMDGPU::OPERAND_REG_IMM_V2INT16:
219-
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
220-
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
221-
// If upper part is all zero we do not need op_sel_hi.
222-
if (!isUInt<16>(Fold.ImmToFold)) {
223-
if (!(Fold.ImmToFold & 0xffff)) {
224-
Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
212+
if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
213+
// Only apply the following transformation if that operand requries
214+
// a packed immediate.
215+
switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
216+
case AMDGPU::OPERAND_REG_IMM_V2FP16:
217+
case AMDGPU::OPERAND_REG_IMM_V2INT16:
218+
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
219+
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
220+
// If upper part is all zero we do not need op_sel_hi.
221+
if (!isUInt<16>(Fold.ImmToFold)) {
222+
if (!(Fold.ImmToFold & 0xffff)) {
223+
Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
224+
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
225+
Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
226+
return true;
227+
}
225228
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
226-
Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
229+
Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
227230
return true;
228231
}
229-
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
230-
Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
231-
return true;
232+
break;
233+
default:
234+
break;
232235
}
233-
break;
234-
default:
235-
break;
236236
}
237237
}
238238
}

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1282,6 +1282,19 @@ bool isInlinableIntLiteralV216(int32_t Literal) {
12821282
return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
12831283
}
12841284

1285+
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
1286+
assert(HasInv2Pi);
1287+
1288+
int16_t Lo16 = static_cast<int16_t>(Literal);
1289+
if (isInt<16>(Literal) || isUInt<16>(Literal))
1290+
return true;
1291+
1292+
int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
1293+
if (!(Literal & 0xffff))
1294+
return true;
1295+
return Lo16 == Hi16;
1296+
}
1297+
12851298
bool isArgPassedInSGPR(const Argument *A) {
12861299
const Function *F = A->getParent();
12871300

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,9 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
660660
LLVM_READNONE
661661
bool isInlinableIntLiteralV216(int32_t Literal);
662662

663+
LLVM_READNONE
664+
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
665+
663666
bool isArgPassedInSGPR(const Argument *Arg);
664667

665668
LLVM_READONLY

llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1166,7 +1166,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out,
11661166
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2
11671167
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
11681168
; GFX10-NEXT: s_waitcnt vmcnt(0)
1169-
; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 op_sel_hi:[1,0]
1169+
; GFX10-NEXT: v_pk_sub_i16 v2, v3, 0x400007
11701170
; GFX10-NEXT: global_store_dword v[0:1], v2, off
11711171
; GFX10-NEXT: s_endpgm
11721172
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1250,7 +1250,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
12501250
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2
12511251
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
12521252
; GFX10-NEXT: s_waitcnt vmcnt(0)
1253-
; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0]
1253+
; GFX10-NEXT: v_pk_sub_i16 v2, v3, 0x7b0040
12541254
; GFX10-NEXT: global_store_dword v[0:1], v2, off
12551255
; GFX10-NEXT: s_endpgm
12561256
%tid = call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)