Skip to content

Commit 5d07f7f

Browse files
committed
[AMDGPU] Don't fold an i64 immediate value if it can't be replicated from its lower 32 bits
On some targets, a packed f32 instruction can only read 32 bits from a scalar operand (SGPR or literal) and replicates those bits to both channels. In this case, we should not fold an immediate value if it can't be replicated from its lower 32 bits.
1 parent b48f293 commit 5d07f7f

File tree

5 files changed

+124
-4
lines changed

5 files changed

+124
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1473,6 +1473,13 @@ def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
14731473
"Target requires waits for loads and atomics before system scope stores"
14741474
>;
14751475

1476+
def FeaturePKF32Insts : SubtargetFeature<"pk-f32-insts",
1477+
"HasPKF32Insts",
1478+
"true",
1479+
"Has packed F32 instructions that only read 32 bits from a scalar operand "
1480+
"(SGPR or literal) and replicates the bits to both channels."
1481+
>;
1482+
14761483
// Dummy feature used to disable assembler instructions.
14771484
def FeatureDisable : SubtargetFeature<"",
14781485
"FeatureDisable","true",
@@ -2145,6 +2152,7 @@ def FeatureISAVersion12_50 : FeatureSet<
21452152
FeatureXNACK,
21462153
FeatureClusters,
21472154
FeatureD16Writes32BitVgpr,
2155+
FeaturePKF32Insts,
21482156
]>;
21492157

21502158
def FeatureISAVersion12_51 : FeatureSet<

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
190190
bool HasEmulatedSystemScopeAtomics = false;
191191
bool HasDefaultComponentBroadcast = false;
192192
bool HasXF32Insts = false;
193+
bool HasPKF32Insts = false;
193194
/// The maximum number of instructions that may be placed within an S_CLAUSE,
194195
/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
195196
/// indicates a lack of S_CLAUSE support.
@@ -1420,6 +1421,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
14201421
/// \returns true if the target has instructions with xf32 format support.
14211422
bool hasXF32Insts() const { return HasXF32Insts; }
14221423

1424+
/// \returns true if the target has packed f32 instructions that only read 32
1425+
/// bits from a scalar operand (SGPR or literal) and replicate the bits to
1426+
/// both channels.
1427+
bool hasPKF32Insts() const { return HasPKF32Insts; }
1428+
14231429
bool hasBitOp3Insts() const { return HasBitOp3Insts; }
14241430

14251431
bool hasPermlane16Swap() const { return HasPermlane16Swap; }

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ struct FoldableDef {
8989

9090
bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
9191

92+
uint64_t getImm() const {
93+
assert(isImm());
94+
return ImmToFold;
95+
}
96+
9297
bool isFI() const {
9398
return Kind == MachineOperand::MO_FrameIndex;
9499
}
@@ -766,6 +771,34 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
766771
FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
767772
}
768773

774+
// Returns true if the instruction is a packed f32 instruction that only reads
775+
// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to
776+
// both channels.
777+
static bool isPKF32Instr(const GCNSubtarget *ST, MachineInstr *MI) {
778+
if (!ST->hasPKF32Insts())
779+
return false;
780+
switch (MI->getOpcode()) {
781+
case AMDGPU::V_PK_ADD_F32:
782+
case AMDGPU::V_PK_MUL_F32:
783+
case AMDGPU::V_PK_FMA_F32:
784+
return true;
785+
default:
786+
return false;
787+
}
788+
llvm_unreachable("unknown instruction");
789+
}
790+
791+
// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
792+
// literal) and replicates the bits to both channels. Therefore, if the hi and
793+
// lo are not same, we can't fold it.
794+
static bool checkImmOpForPKF32Instr(const FoldableDef &OpToFold) {
795+
assert(OpToFold.isImm() && "Expected immediate operand");
796+
uint64_t ImmVal = OpToFold.getImm();
797+
uint32_t Lo = Lo_32(ImmVal);
798+
uint32_t Hi = Hi_32(ImmVal);
799+
return Lo == Hi;
800+
}
801+
769802
bool SIFoldOperandsImpl::tryAddToFoldList(
770803
SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
771804
const FoldableDef &OpToFold) const {
@@ -919,6 +952,12 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
919952
return true;
920953
}
921954

955+
// Special case for PK_F32 instructions if we are trying to fold an imm to
956+
// src0 or src1.
957+
if (OpToFold.isImm() && isPKF32Instr(ST, MI) &&
958+
!checkImmOpForPKF32Instr(OpToFold))
959+
return false;
960+
922961
appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
923962
return true;
924963
}
@@ -1134,6 +1173,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
11341173
return false;
11351174

11361175
if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1176+
if (isPKF32Instr(ST, UseMI) && !checkImmOpForPKF32Instr(OpToFold))
1177+
return false;
11371178
appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
11381179
return true;
11391180
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1250 -run-pass=si-fold-operands -o - %s | FileCheck %s
3+
4+
---
5+
name: pk_add_f32_imm_fold
6+
body: |
7+
bb.0.entry:
8+
liveins: $sgpr0_sgpr1
9+
10+
; CHECK-LABEL: name: pk_add_f32_imm_fold
11+
; CHECK: liveins: $sgpr0_sgpr1
12+
; CHECK-NEXT: {{ $}}
13+
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
14+
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
15+
; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
16+
; CHECK-NEXT: S_ENDPGM 0
17+
%0:vreg_64_align2 = IMPLICIT_DEF
18+
%1:sreg_64 = S_MOV_B64 1065353216
19+
%2:vreg_64_align2 = COPY killed %1
20+
%3:vreg_64_align2 = V_PK_ADD_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
21+
S_ENDPGM 0
22+
...
23+
24+
---
25+
name: pk_mul_f32_imm_fold
26+
body: |
27+
bb.0.entry:
28+
liveins: $sgpr0_sgpr1
29+
30+
; CHECK-LABEL: name: pk_mul_f32_imm_fold
31+
; CHECK: liveins: $sgpr0_sgpr1
32+
; CHECK-NEXT: {{ $}}
33+
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
34+
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
35+
; CHECK-NEXT: [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_MUL_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
36+
; CHECK-NEXT: S_ENDPGM 0
37+
%0:vreg_64_align2 = IMPLICIT_DEF
38+
%1:sreg_64 = S_MOV_B64 1065353216
39+
%2:vreg_64_align2 = COPY killed %1
40+
%3:vreg_64_align2 = V_PK_MUL_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
41+
S_ENDPGM 0
42+
...
43+
44+
---
45+
name: pk_fma_f32_imm_fold
46+
body: |
47+
bb.0.entry:
48+
liveins: $sgpr0_sgpr1
49+
50+
; CHECK-LABEL: name: pk_fma_f32_imm_fold
51+
; CHECK: liveins: $sgpr0_sgpr1
52+
; CHECK-NEXT: {{ $}}
53+
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
54+
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
55+
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
56+
; CHECK-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 0, [[DEF]], 8, [[DEF1]], 11, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
57+
; CHECK-NEXT: S_ENDPGM 0
58+
%0:vreg_64_align2 = IMPLICIT_DEF
59+
%1:vreg_64_align2 = IMPLICIT_DEF
60+
%2:sreg_64 = S_MOV_B64 1065353216
61+
%3:vreg_64_align2 = COPY killed %2
62+
%4:vreg_64_align2 = V_PK_FMA_F32 0, %0, 8, %1, 11, %3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
63+
S_ENDPGM 0
64+
...

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -732,12 +732,13 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
732732
; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
733733
; GFX1250-SDAG: ; %bb.0:
734734
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
735-
; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
735+
; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
736+
; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x3f800000
736737
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
737-
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
738+
; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
738739
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
739-
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
740-
; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
740+
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
741+
; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
741742
; GFX1250-SDAG-NEXT: s_endpgm
742743
;
743744
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:

0 commit comments

Comments
 (0)