Commit 6665642
[AMDGPU] Don't fold an i64 immediate value if it can't be replicated from its lower 32 bits (#168458)

On some targets, a packed f32 instruction reads only 32 bits from a scalar operand (SGPR or literal) and replicates the bits to both channels. In this case, we should not fold an immediate value if it can't be replicated from its lower 32 bits. Fixes SWDEV-567139.
1 parent 1e3ea03 commit 6665642
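For illustration, the legality condition boils down to comparing the two 32-bit halves of the immediate (the in-tree code below uses Lo_32/Hi_32 from llvm/Support/MathExtras.h, which split the value the same way). A minimal standalone C++ sketch, with a hypothetical helper name used for this sketch only:

#include <cstdint>
#include <cstdio>

// Sketch of the check: the hardware reads only the low 32 bits of a scalar
// operand and replicates them to both channels, so a 64-bit immediate is
// foldable only when its two 32-bit halves are identical.
static bool canReplicateFromLow32(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm);       // what the HW actually reads
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32); // what the fold would discard
  return Lo == Hi;
}

int main() {
  // <1.0f, 1.0f> packed as 0x3F800000'3F800000: halves match, fold is safe.
  std::printf("%d\n", canReplicateFromLow32(0x3F8000003F800000ULL)); // 1
  // lo = 1.0f, hi = 0.0f (0x00000000'3F800000): folding would turn the high
  // channel into 1.0f as well, so the fold must be rejected.
  std::printf("%d\n", canReplicateFromLow32(0x000000003F800000ULL)); // 0
}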

4 files changed: +117 −4 lines

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 7 additions & 0 deletions
@@ -1420,6 +1420,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the target has instructions with xf32 format support.
   bool hasXF32Insts() const { return HasXF32Insts; }
 
+  /// \returns true if the target has packed f32 instructions that only read 32
+  /// bits from a scalar operand (SGPR or literal) and replicate the bits to
+  /// both channels.
+  bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const {
+    return getGeneration() == GFX12 && GFX1250Insts;
+  }
+
   bool hasBitOp3Insts() const { return HasBitOp3Insts; }
 
   bool hasPermlane16Swap() const { return HasPermlane16Swap; }

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 41 additions & 0 deletions
@@ -766,6 +766,37 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
 }
 
+// Returns true if the instruction is a packed f32 instruction that only reads
+// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to
+// both channels.
+static bool
+isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST,
+                                              MachineInstr *MI) {
+  if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput())
+    return false;
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("unknown instruction");
+}
+
+// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
+// literal) and replicate the bits to both channels. Therefore, if the high
+// and low halves are not the same, we can't fold the immediate.
+static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(
+    const FoldableDef &OpToFold) {
+  assert(OpToFold.isImm() && "Expected immediate operand");
+  uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
+  uint32_t Lo = Lo_32(ImmVal);
+  uint32_t Hi = Hi_32(ImmVal);
+  return Lo == Hi;
+}
+
 bool SIFoldOperandsImpl::tryAddToFoldList(
     SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
     const FoldableDef &OpToFold) const {
@@ -919,6 +950,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
     return true;
   }
 
+  // Special case for PK_F32 instructions if we are trying to fold an imm to
+  // src0 or src1.
+  if (OpToFold.isImm() &&
+      isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) &&
+      !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
+    return false;
+
   appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
@@ -1134,6 +1172,9 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+    if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) &&
+        !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold))
+      return false;
    appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
    return true;
  }
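Note that the new guard covers both paths by which an immediate can become a fold candidate: the check in tryAddToFoldList rejects a non-replicable immediate headed for src0 or src1 of one of the listed V_PK_* opcodes, and the same pair of helpers guards tryToFoldACImm, so neither route can fold a 64-bit immediate whose halves differ.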
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1250 -run-pass=si-fold-operands -o - %s | FileCheck %s
+
+---
+name: pk_add_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_add_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_ADD_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_mul_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_mul_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_MUL_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_MUL_F32 11, [[DEF]], 8, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1065353216
+    %2:vreg_64_align2 = COPY killed %1
+    %3:vreg_64_align2 = V_PK_MUL_F32 11, %0, 8, %2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: pk_fma_f32_imm_fold
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: pk_fma_f32_imm_fold
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    ; CHECK-NEXT: [[V_PK_FMA_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_FMA_F32 0, [[DEF]], 8, [[DEF1]], 11, [[V_MOV_B]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:sreg_64 = S_MOV_B64 1065353216
+    %3:vreg_64_align2 = COPY killed %2
+    %4:vreg_64_align2 = V_PK_FMA_F32 0, %0, 8, %1, 11, %3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...
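In all three MIR tests the immediate is 1065353216, i.e. 0x3F800000 (the bit pattern of 1.0f) in the low half and 0 in the high half. Because the two halves differ, si-fold-operands must not fold the value into the V_PK_* instruction; the autogenerated CHECK lines pin this down by requiring that the V_MOV_B64_PSEUDO and its register use survive the pass.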

llvm/test/CodeGen/AMDGPU/packed-fp32.ll

Lines changed: 5 additions & 4 deletions
@@ -732,12 +732,13 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x3f800000
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    global_load_b64 v[0:1], v4, s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], 1.0
-; GFX1250-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1] scale_offset
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
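This is the codegen change the fix produces: the SDAG path previously folded the 64-bit constant as the literal 1.0 into v_pk_add_f32, which on gfx1250 would replicate 1.0f into both channels even though the intended value has 0 in the high half (hence the test name fadd_v2_v_lit_hi0). With the fold rejected, the constant is first materialized into v[2:3] by v_mov_b64_e32 and consumed as a register pair.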
