Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,38 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}

/// Returns true when \p I is a store or an intrinsic call; every other
/// instruction kind is rejected.
bool isOpLegal(Instruction *I) {
  return isa<StoreInst>(I) || isa<IntrinsicInst>(I);
}
/// Returns true when \p I is an instruction kind this check accepts:
///  - any intrinsic call,
///  - any store,
///  - an add/sub/and/or/xor on a fixed vector of i8 or i16 elements whose
///    total width is at most 32 bits, on subtargets reporting SDWA support.
/// Everything else yields false.
bool isOpLegal(const Instruction *I) {
  // Intrinsics are always accepted. Any store is a profitable sink
  // (prevents flip-flopping).
  if (isa<IntrinsicInst>(I) || isa<StoreInst>(I))
    return true;

  // From here on only binary operators can qualify.
  const auto *BinOp = dyn_cast<BinaryOperator>(I);
  if (!BinOp)
    return false;

  // The result must be a fixed-length vector of integers.
  const auto *VecTy = dyn_cast<FixedVectorType>(BinOp->getType());
  if (!VecTy)
    return false;

  const auto *ElemTy = dyn_cast<IntegerType>(VecTy->getElementType());
  if (!ElemTy)
    return false;

  // Check for SDWA-compatible operation: byte- or half-sized elements, SDWA
  // available on the subtarget, and the whole vector no wider than 32 bits.
  const unsigned ElemBits = ElemTy->getBitWidth();
  const unsigned NumElems = VecTy->getNumElements();
  const bool IsNarrowElem = ElemBits == 8 || ElemBits == 16;
  if (!IsNarrowElem || !ST.hasSDWA() || NumElems * ElemBits > 32)
    return false;

  switch (BinOp->getOpcode()) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return true;
  default:
    return false;
  }
}

bool isCoercionProfitable(Instruction *II) {
SmallPtrSet<Instruction *, 4> CVisited;
Expand Down
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
; REQUIRES: amdgpu-registered-target
; RUN: opt -S -passes=amdgpu-late-codegenprepare \
; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s

; Purpose:
; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
; loop header (same basic block as the PHI).
; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
; placed in the header (enabling SDWA-friendly lowering later).
;
; What we check:
; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
; - The loop body is also expected to produce a bitcast <4 x i8> -> i32 for the
;   backedge; note that the directives at the end of this file do not match it.

; Single-block loop kernel: %loop is its own predecessor, so the <4 x i8> PHI
; and the byte-wise add that consumes it live in the same basic block.
; %p is not referenced in the body; %n is only used by the loop-exit compare.
define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
br label %loop

loop:
; Loop index: starts at 0 and is advanced by 4 each iteration (see %i.next).
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ]

; Loop-carried accumulator in vector-of-bytes form (problematic on input).
; It is all-zero on entry and takes %acc.next on the backedge.
%acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]

; Make up four i8 values derived from %i to avoid memory noise: the truncated
; low bytes of %i, %i + 1, %i + 2 and %i + 3.
%i0 = trunc i32 %i to i8
%i1i = add i32 %i, 1
%i1 = trunc i32 %i1i to i8
%i2i = add i32 %i, 2
%i2 = trunc i32 %i2i to i8
%i3i = add i32 %i, 3
%i3 = trunc i32 %i3i to i8

; Pack them into <4 x i8>, one value per lane (lanes 0 through 3).
%v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
%v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
%v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
%v = insertelement <4 x i8> %v03, i8 %i3, i32 3

; Byte-wise add in the same block as the PHI (this must make coercion profitable).
%acc.next = add <4 x i8> %acc, %v

; Loop control: keep iterating while %i.next is (signed) less than %n.
%i.next = add i32 %i, 4
%cond = icmp slt i32 %i.next, %n
br i1 %cond, label %loop, label %exit

exit:
; The accumulator is never stored or returned; the kernel simply returns.
ret void
}

; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
; CHECK-NOT: phi <4 x i8>
; CHECK: %[[ACCI32:[^ ]+]] = phi i32
; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
; CHECK: add <4 x i8> %[[HDRCAST]],
; CHECK: br i1