43 changes: 41 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,41 @@ class LiveRegOptimizer {
    return LK.first != TargetLoweringBase::TypeLegal;
  }

  bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
  bool isOpLegal(Instruction *I) {

Contributor:
Suggested change:
-  bool isOpLegal(Instruction *I) {
+  bool isOpLegal(const Instruction *I) {

    if (dyn_cast<IntrinsicInst>(I))

Contributor:
Suggested change:
-    if (dyn_cast<IntrinsicInst>(I))
+    if (isa<IntrinsicInst>(I))

But probably should restrict this to target intrinsics.
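A minimal sketch of what such a restriction could look like; this is illustrative only (the helper name is made up, and it assumes LLVM's Function::isTargetIntrinsic is usable in this context), not code from the PR:

#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Accept only target-specific intrinsics (e.g. amdgcn.*); generic
// intrinsics would fall through to the regular profitability checks.
static bool isTargetIntrinsicOp(const Instruction *I) {
  if (const auto *Intr = dyn_cast<IntrinsicInst>(I))
    return Function::isTargetIntrinsic(Intr->getIntrinsicID());
  return false;
}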

      return true; // FIXME: narrow to known native intrinsics
                   // (DOT/MFMA/tbuffer) or use TTI cost.

    // Any store is a profitable sink (prevents flip-flopping).
    if (isa<StoreInst>(I))
      return true;

    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
      if (auto *VTy = dyn_cast<VectorType>(BO->getType())) {
        Type *Elt = VTy->getElementType();
        // Treat small-int vector binops as profitable when SDWA is available.
Contributor:
Duplicate comment.

Contributor Author:
Thank you! Fixed.

        // We explicitly gate to 8/16-bit to avoid i1 vectors and keep
        // behavior tight. Require SDWA for both i8 and i16, and keep
        // vectors within 32 bits.
        std::optional<unsigned> Bits = VTy->getPrimitiveSizeInBits();
Contributor:
If you're so specifically checking these types, might as well keep this in terms of element type + element count instead of checking the full width of the vector.
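A hedged sketch of the suggested reformulation (illustrative, not the PR's code; the helper name is hypothetical):

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Gate on element type and element count directly: v4i8 and v2i16 (and
// anything narrower) fit in a single 32-bit register, which is what the
// SDWA-friendly path needs.
static bool isPackableSmallIntVector(Type *Ty) {
  auto *FVTy = dyn_cast<FixedVectorType>(Ty);
  if (!FVTy)
    return false;
  Type *Elt = FVTy->getElementType();
  if (Elt->isIntegerTy(8))
    return FVTy->getNumElements() <= 4;
  if (Elt->isIntegerTy(16))
    return FVTy->getNumElements() <= 2;
  return false;
}

Phrased this way, the 32-bit width bound is implied by the element count, so a separate getPrimitiveSizeInBits() check would become unnecessary.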

        if (ST.hasSDWA() && Bits && *Bits <= 32 &&
            (Elt->isIntegerTy(8) || Elt->isIntegerTy(16))) {
          switch (BO->getOpcode()) {
          case Instruction::Add:
          case Instruction::Sub:
          case Instruction::And:
          case Instruction::Or:
          case Instruction::Xor:
            return true;
          default:
            break;
          }
        }
      }
    }

    return false;
  }

  bool isCoercionProfitable(Instruction *II) {
    SmallPtrSet<Instruction *, 4> CVisited;
@@ -150,7 +184,12 @@ class LiveRegOptimizer {
        if (!CVisited.insert(CII).second)
          continue;

        // Allow same-BB non-lookthrough users when the def is a PHI:
        // loop headers frequently consume the carried value in the header
        // block (e.g. byte-wise vector binops). We *do* want to coerce
        // across the backedge in that common case to enable packed
        // i32 + SDWA lowering.
        if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
            !isa<PHINode>(II))
Contributor:
The addition of phi here feels like a separate change from the above?

Contributor Author:
You're right; there are two separate changes:

  • The II → CII fix is a correctness bug: we must examine the user when pruning same-BB paths, otherwise the walk can terminate prematurely.
  • The PHI exception is a small policy tweak: loop headers commonly consume the carried value in the header block via non-lookthrough ops (e.g., byte-wise vector binops). Without allowing that same-BB non-lookthrough use for a PHI, the walk never reaches the profitable sink, even with a better cost model, so the regression remains.

Contributor:
Can you split this into a separate PR? It's hard to see that all the cases are adequately tested when they're together.

Contributor Author:
Sure, I can. Given that this PR will be for the isOpLegal/isProfitableSink improvement, what would you like to see separated? The II → CII fix? The PHI exception? Or maybe two separate PRs, one for each?

Contributor Author:
@arsenm, I instrumented the TTI queries on gfx90a: add <4 x i8> comes out at cost 4 with TCK_SizeAndLatency (and likewise for getArithmeticInstrCost), which is below the previous profitability threshold (8). So switching to TTI-only would reintroduce the regression. I propose to keep the very narrow SDWA safety net for v4i8/v2i16 (≤32b) here and look at improving AMDGPU TTI separately if needed.
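For reference, a sketch of the kind of TTI probe described above; the author's actual instrumentation is not shown in the thread, so treat the shape of this query as an assumption:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Ask TTI what an add <4 x i8> costs; per the comment above, this reports 4
// on gfx90a with TCK_SizeAndLatency, below the old threshold of 8.
static InstructionCost probeV4I8AddCost(const TargetTransformInfo &TTI,
                                        LLVMContext &Ctx) {
  auto *V4I8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 4);
  return TTI.getArithmeticInstrCost(Instruction::Add, V4I8,
                                    TargetTransformInfo::TCK_SizeAndLatency);
}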

Contributor:
Yes, TTI is a mess and we need to go fix all the costs, especially for illegal operations. In particular, the costs for memory instructions are way too low.
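To make the memory-cost point concrete, a hedged sketch of one such probe (an illustration under the same assumptions as the sketch above, not code from the thread):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Ask TTI what a 4-byte load from global memory (AMDGPU address space 1)
// costs; an optimistic answer makes memory sinks look cheaper than they are.
static InstructionCost probeGlobalLoadCost(const TargetTransformInfo &TTI,
                                           LLVMContext &Ctx) {
  return TTI.getMemoryOpCost(Instruction::Load, Type::getInt32Ty(Ctx),
                             Align(4), /*AddressSpace=*/1,
                             TargetTransformInfo::TCK_SizeAndLatency);
}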

          continue;

        if (isOpLegal(CII))
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,63 @@
; REQUIRES: amdgpu-registered-target
; RUN: opt -S -passes=amdgpu-late-codegenprepare \
; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s

; Purpose:
; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
; loop header (same basic block as the PHI).
; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
; placed in the header (enabling SDWA-friendly lowering later).
;
; What we check:
; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
; - The backedge feeds the i32 PHI through a bitcast <4 x i8> -> i32.

define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
  br label %loop

loop:
  ; Loop index.
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]

  ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]

  ; Make up four i8 values derived from %i to avoid memory noise.
  %i0 = trunc i32 %i to i8
  %i1i = add i32 %i, 1
  %i1 = trunc i32 %i1i to i8
  %i2i = add i32 %i, 2
  %i2 = trunc i32 %i2i to i8
  %i3i = add i32 %i, 3
  %i3 = trunc i32 %i3i to i8

  ; Pack them into <4 x i8>.
  %v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
  %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
  %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
  %v = insertelement <4 x i8> %v03, i8 %i3, i32 3

  ; Byte-wise add in the same block as the PHI (this must make coercion
  ; profitable).
  %acc.next = add <4 x i8> %acc, %v

  ; Loop control.
  %i.next = add i32 %i, 4
  %cond = icmp slt i32 %i.next, %n
  br i1 %cond, label %loop, label %exit

exit:
  ret void
}

; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
; CHECK-NOT: phi <4 x i8>
; CHECK: %[[ACCI32:[^ ]+]] = phi i32
; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
; CHECK: add <4 x i8> %[[HDRCAST]],
; CHECK: br i1