From 67f5f1fa46b2fb6408b1c095dd239230d9bbce01 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Mon, 17 Nov 2025 18:32:56 -0500
Subject: [PATCH 1/5] [AMDGPU] Don't fold an i64 immediate value if it can't be replicated from its lower 32 bits

On some targets, a packed f32 instruction only reads 32 bits from a
scalar operand (SGPR or literal) and replicates those bits to both
channels. In this case, we should not fold an immediate value if it
can't be replicated from its lower 32 bits.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td            |  8 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h       |  6 ++
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp   | 41 ++++++++++++
 .../CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir  | 64 +++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/packed-fp32.ll     |  9 +--
 5 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index b008354cfd462..7b795b46885e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1473,6 +1473,13 @@ def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
   "Target requires waits for loads and atomics before system scope stores"
 >;
 
+def FeaturePKF32Insts : SubtargetFeature<"pk-f32-insts",
+  "HasPKF32Insts",
+  "true",
+  "Has packed F32 instructions that only read 32 bits from a scalar operand "
+  "(SGPR or literal) and replicate the bits to both channels."
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -2145,6 +2152,7 @@ def FeatureISAVersion12_50 : FeatureSet<
   FeatureXNACK,
   FeatureClusters,
   FeatureD16Writes32BitVgpr,
+  FeaturePKF32Insts,
   ]>;
 
 def FeatureISAVersion12_51 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f377b8aaf1333..b0fa907cc3715 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -190,6 +190,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasEmulatedSystemScopeAtomics = false;
   bool HasDefaultComponentBroadcast = false;
   bool HasXF32Insts = false;
+  bool HasPKF32Insts = false;
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
   /// indicates a lack of S_CLAUSE support.
@@ -1420,6 +1421,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the target has instructions with xf32 format support.
   bool hasXF32Insts() const { return HasXF32Insts; }
 
+  /// \returns true if the target has packed f32 instructions that only read 32
+  /// bits from a scalar operand (SGPR or literal) and replicate the bits to
+  /// both channels.
+  bool hasPKF32Insts() const { return HasPKF32Insts; }
+
   bool hasBitOp3Insts() const { return HasBitOp3Insts; }
 
   bool hasPermlane16Swap() const { return HasPermlane16Swap; }
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 18d90346d1d88..cf5e25bd9e0f8 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -89,6 +89,11 @@ struct FoldableDef {
   bool isImm() const {
     return Kind == MachineOperand::MO_Immediate;
   }
+  uint64_t getImm() const {
+    assert(isImm());
+    return ImmToFold;
+  }
+
   bool isFI() const {
     return Kind == MachineOperand::MO_FrameIndex;
   }
@@ -766,6 +771,34 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
       FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
 }
 
+// Returns true if the instruction is a packed f32 instruction that only reads
+// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to
+// both channels.
+static bool isPKF32Instr(const GCNSubtarget *ST, MachineInstr *MI) {
+  if (!ST->hasPKF32Insts())
+    return false;
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("unknown instruction");
+}
+
+// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
+// literal) and replicate the bits to both channels. Therefore, if the hi and
+// lo are not the same, we can't fold it.
+static bool checkImmOpForPKF32Instr(const FoldableDef &OpToFold) {
+  assert(OpToFold.isImm() && "Expected immediate operand");
+  uint64_t ImmVal = OpToFold.getImm();
+  uint32_t Lo = Lo_32(ImmVal);
+  uint32_t Hi = Hi_32(ImmVal);
+  return Lo == Hi;
+}
+
 bool SIFoldOperandsImpl::tryAddToFoldList(
     SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
     const FoldableDef &OpToFold) const {
@@ -919,6 +952,12 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
     return true;
   }
 
+  // Special case for PK_F32 instructions if we are trying to fold an imm to
+  // src0 or src1.
+  if (OpToFold.isImm() && isPKF32Instr(ST, MI) &&
+      !checkImmOpForPKF32Instr(OpToFold))
+    return false;
+
   appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
@@ -1134,6 +1173,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+    if (isPKF32Instr(ST, UseMI) && !checkImmOpForPKF32Instr(OpToFold))
+      return false;
     appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
     return true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir
new file mode 100644
index 0000000000000..a725fab5e40c1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-pk-f32-imm-fold.mir
@@ -0,0 +1,64 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1250 -run-pass=si-fold-operands -o - %s | FileCheck %s
+
+---
+name: pk_add_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_add_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_ADD_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_mul_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_mul_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_MUL_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_MUL_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_fma_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_fma_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 0, [[DEF]], 8, [[DEF1]], 11, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64 1065353216
+    %3:vreg_64_align2 = COPY killed %2
+    %4:vreg_64_align2 = V_PK_FMA_F32 0, %0, 8, %1, 11, %3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 1177474f5b4f5..6b45d31da0e95 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -732,12 +732,13 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x3f800000
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
-; GFX1250-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:

From d679d573e1d60d014050362f7198059ec0ad3cd1 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Tue, 18 Nov 2025 14:37:23 -0500
Subject: [PATCH 2/5] remove target feature

---
 llvm/lib/Target/AMDGPU/AMDGPU.td          | 8 --------
 llvm/lib/Target/AMDGPU/GCNSubtarget.h     | 5 +++--
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +-
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7b795b46885e6..b008354cfd462 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1473,13 +1473,6 @@ def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
   "Target requires waits for loads and atomics before system scope stores"
 >;
 
-def FeaturePKF32Insts : SubtargetFeature<"pk-f32-insts",
-  "HasPKF32Insts",
-  "true",
-  "Has packed F32 instructions that only read 32 bits from a scalar operand "
-  "(SGPR or literal) and replicate the bits to both channels."
->;
-
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -2152,7 +2145,6 @@ def FeatureISAVersion12_50 : FeatureSet<
   FeatureXNACK,
   FeatureClusters,
   FeatureD16Writes32BitVgpr,
-  FeaturePKF32Insts,
   ]>;
 
 def FeatureISAVersion12_51 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b0fa907cc3715..ca98b80787fb4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -190,7 +190,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasEmulatedSystemScopeAtomics = false;
   bool HasDefaultComponentBroadcast = false;
   bool HasXF32Insts = false;
-  bool HasPKF32Insts = false;
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
   /// indicates a lack of S_CLAUSE support.
@@ -1424,7 +1423,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the target has packed f32 instructions that only read 32
   /// bits from a scalar operand (SGPR or literal) and replicate the bits to
   /// both channels.
-  bool hasPKF32Insts() const { return HasPKF32Insts; }
+  bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const {
+    return getGeneration() == GFX12 && GFX1250Insts;
+  }
 
   bool hasBitOp3Insts() const { return HasBitOp3Insts; }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cf5e25bd9e0f8..2df15b0e8a5f0 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -775,7 +775,7 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
 // 32 bits from a scalar operand (SGPR or literal) and replicates the bits to
 // both channels.
 static bool isPKF32Instr(const GCNSubtarget *ST, MachineInstr *MI) {
-  if (!ST->hasPKF32Insts())
+  if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput())
     return false;
   switch (MI->getOpcode()) {
   case AMDGPU::V_PK_ADD_F32:

From 58aa511ab34d698c9b1a8edd009809575d1ea295 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Tue, 18 Nov 2025 14:59:48 -0500
Subject: [PATCH 3/5] update helper function name

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2df15b0e8a5f0..3dbb6c2485601 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -774,7 +774,9 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
 // Returns true if the instruction is a packed f32 instruction that only reads
 // 32 bits from a scalar operand (SGPR or literal) and replicates the bits to
 // both channels.
-static bool isPKF32Instr(const GCNSubtarget *ST, MachineInstr *MI) {
+static bool
+isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST,
+                                              MachineInstr *MI) {
   if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput())
     return false;
   switch (MI->getOpcode()) {
   case AMDGPU::V_PK_ADD_F32:
@@ -954,7 +956,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
 
   // Special case for PK_F32 instructions if we are trying to fold an imm to
   // src0 or src1.
-  if (OpToFold.isImm() && isPKF32Instr(ST, MI) &&
+  if (OpToFold.isImm() &&
+      isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) &&
       !checkImmOpForPKF32Instr(OpToFold))
     return false;
 
   appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
@@ -1173,7 +1176,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
-    if (isPKF32Instr(ST, UseMI) && !checkImmOpForPKF32Instr(OpToFold))
+    if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) &&
+        !checkImmOpForPKF32Instr(OpToFold))
       return false;
     appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
     return true;

From a3e8aa69b12448ecfd82554d93d14a2d0487bd29 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Tue, 18 Nov 2025 15:19:18 -0500
Subject: [PATCH 4/5] use `getEffectiveImmVal` instead of adding a new API function

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3dbb6c2485601..897fe8fdd1bdd 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -89,11 +89,6 @@ struct FoldableDef {
   bool isImm() const {
     return Kind == MachineOperand::MO_Immediate;
   }
-  uint64_t getImm() const {
-    assert(isImm());
-    return ImmToFold;
-  }
-
   bool isFI() const {
     return Kind == MachineOperand::MO_FrameIndex;
   }
@@ -795,7 +790,7 @@ isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST,
 // lo are not the same, we can't fold it.
 static bool checkImmOpForPKF32Instr(const FoldableDef &OpToFold) {
   assert(OpToFold.isImm() && "Expected immediate operand");
-  uint64_t ImmVal = OpToFold.getImm();
+  uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
   uint32_t Lo = Lo_32(ImmVal);
   uint32_t Hi = Hi_32(ImmVal);
   return Lo == Hi;

From bb033594ee5fdbb53936b8ddf2310ed5dd48eba7 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Tue, 18 Nov 2025 15:27:40 -0500
Subject: [PATCH 5/5] further rename the helper function

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 897fe8fdd1bdd..289bf1a563ffc 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -788,7 +788,8 @@ isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST,
 // Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
 // literal) and replicate the bits to both channels. Therefore, if the hi and
 // lo are not the same, we can't fold it.
-static bool checkImmOpForPKF32Instr(const FoldableDef &OpToFold) {
+static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(
+    const FoldableDef &OpToFold) {
   assert(OpToFold.isImm() && "Expected immediate operand");
   uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
   uint32_t Lo = Lo_32(ImmVal);
@@ -953,7 +954,7 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
   // src0 or src1.
   if (OpToFold.isImm() &&
       isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) &&
-      !checkImmOpForPKF32Instr(OpToFold))
+      !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
     return false;
 
   appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
@@ -1172,7 +1173,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
 
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
     if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) &&
-        !checkImmOpForPKF32Instr(OpToFold))
+        !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
       return false;
     appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
     return true;
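
Postscript (illustrative only, not part of the patches): the legality rule the
series implements reduces to "an i64 immediate may be folded into one of these
packed f32 instructions only if its high and low 32-bit halves are identical",
because the hardware reads a single 32-bit scalar value and replicates it to
both channels. A minimal standalone sketch of that check, written in plain C++
in place of LLVM's Lo_32/Hi_32 helpers, with sample immediates chosen for
illustration:

  #include <cassert>
  #include <cstdint>

  // A 64-bit immediate is foldable for the replicating packed-f32 case only
  // if both 32-bit halves carry the same bit pattern.
  static bool canFoldReplicatedImm(uint64_t Imm) {
    uint32_t Lo = static_cast<uint32_t>(Imm);       // low 32 bits (Lo_32)
    uint32_t Hi = static_cast<uint32_t>(Imm >> 32); // high 32 bits (Hi_32)
    return Lo == Hi;
  }

  int main() {
    // <1.0f, 1.0f>: both halves are 0x3f800000, so folding is legal.
    assert(canFoldReplicatedImm(0x3f8000003f800000ULL));
    // <1.0f, 0.0f>, as in fadd_v2_v_lit_hi0: the halves differ, so the
    // immediate has to stay in a 64-bit register pair instead.
    assert(!canFoldReplicatedImm(0x000000003f800000ULL));
    return 0;
  }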