Commit 6665642
[AMDGPU] Don't fold an i64 immediate value if it can't be replicated from its lower 32 bits (#168458)

On some targets, a packed f32 instruction reads only 32 bits from a scalar operand (SGPR or literal) and replicates the bits to both channels. In this case, we should not fold an immediate value if it can't be replicated from its lower 32 bits. Fixes SWDEV-567139.
1 parent 1e3ea03 commit 6665642
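For illustration, the legality condition boils down to comparing the two 32-bit halves of the immediate (the in-tree code below uses Lo_32/Hi_32 from llvm/Support/MathExtras.h, which split the value the same way). A minimal standalone C++ sketch, with a hypothetical helper name used for this sketch only:

#include <cstdint>
#include <cstdio>

// Sketch of the check: the hardware reads only the low 32 bits of a scalar
// operand and replicates them to both channels, so a 64-bit immediate is
// foldable only when its two 32-bit halves are identical.
static bool canReplicateFromLow32(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm);       // what the HW actually reads
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32); // what the fold would discard
  return Lo == Hi;
}

int main() {
  // <1.0f, 1.0f> packed as 0x3F800000'3F800000: halves match, fold is safe.
  std::printf("%d\n", canReplicateFromLow32(0x3F8000003F800000ULL)); // 1
  // lo = 1.0f, hi = 0.0f (0x00000000'3F800000): folding would turn the high
  // channel into 1.0f as well, so the fold must be rejected.
  std::printf("%d\n", canReplicateFromLow32(0x000000003F800000ULL)); // 0
}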

4 files changed: +117 −4 lines

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 7 additions & 0 deletions
@@ -1420,6 +1420,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the target has instructions with xf32 format support.
   bool hasXF32Insts() const { return HasXF32Insts; }
 
+  /// \returns true if the target has packed f32 instructions that only read 32
+  /// bits from a scalar operand (SGPR or literal) and replicate the bits to
+  /// both channels.
+  bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const {
+    return getGeneration() == GFX12 && GFX1250Insts;
+  }
+
   bool hasBitOp3Insts() const { return HasBitOp3Insts; }
 
   bool hasPermlane16Swap() const { return HasPermlane16Swap; }

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 41 additions & 0 deletions
@@ -766,6 +766,37 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
 }
 
+// Returns true if the instruction is a packed f32 instruction that only reads
+// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to
+// both channels.
+static bool
+isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST,
+                                              MachineInstr *MI) {
+  if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput())
+    return false;
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("unknown instruction");
+}
+
+// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
+// literal) and replicate the bits to both channels. Therefore, if the high
+// and low halves are not the same, we can't fold the immediate.
+static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(
+    const FoldableDef &OpToFold) {
+  assert(OpToFold.isImm() && "Expected immediate operand");
+  uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
+  uint32_t Lo = Lo_32(ImmVal);
+  uint32_t Hi = Hi_32(ImmVal);
+  return Lo == Hi;
+}
+
 bool SIFoldOperandsImpl::tryAddToFoldList(
     SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
     const FoldableDef &OpToFold) const {
@@ -919,6 +950,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
     return true;
   }
 
+  // Special case for PK_F32 instructions if we are trying to fold an imm to
+  // src0 or src1.
+  if (OpToFold.isImm() &&
+      isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) &&
+      !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
+    return false;
+
   appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
@@ -1134,6 +1172,9 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+    if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) &&
+        !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
+      return false;
    appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
    return true;
  }
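Note that the new guard covers both paths by which an immediate can become a fold candidate: the check in tryAddToFoldList rejects a non-replicable immediate headed for src0 or src1 of one of the listed V_PK_* opcodes, and the same pair of helpers guards tryToFoldACImm, so neither route can fold a 64-bit immediate whose halves differ.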
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1250 -run-pass=si-fold-operands -o - %s | FileCheck %s
+
+---
+name: pk_add_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_add_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_ADD_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_mul_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_mul_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_MUL_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_MUL_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_fma_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_fma_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 0, [[DEF]], 8, [[DEF1]], 11, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64 1065353216
+    %3:vreg_64_align2 = COPY killed %2
+    %4:vreg_64_align2 = V_PK_FMA_F32 0, %0, 8, %1, 11, %3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
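In all three MIR tests the immediate is 1065353216, i.e. 0x3F800000 (the bit pattern of 1.0f) in the low half and 0 in the high half. Because the two halves differ, si-fold-operands must not fold the value into the V_PK_* instruction; the autogenerated CHECK lines pin this down by requiring that the V_MOV_B64_PSEUDO and its register use survive the pass.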

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 5 additions & 4 deletions
@@ -732,12 +732,13 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x3f800000
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
-; GFX1250-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
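This is the codegen change the fix produces: the SDAG path previously folded the 64-bit constant as the literal 1.0 into v_pk_add_f32, which on gfx1250 would replicate 1.0f into both channels even though the intended value has 0 in the high half (hence the test name fadd_v2_v_lit_hi0). With the fold rejected, the constant is first materialized into v[2:3] by v_mov_b64_e32 and consumed as a register pair.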
