Patch summary (free-form text placed before the first "diff --git" line, so it
is not part of either hunk):

  * AMDGPULateCodeGenPrepare.cpp: LiveRegOptimizer::isOpLegal() now takes a
    const Instruction * and is a multi-step predicate instead of a single
    isa<> test. It returns true for the instruction kinds matched by the two
    leading isa<> checks, and also for Add/Sub/And/Or/Xor binary operators
    whose result type has 8- or 16-bit integer elements with
    (element count * element bits) <= 32, when ST.hasSDWA() is true.
    Everything else returns false.
  * lro-coerce-v4i8-phi-loop.ll: new opt + FileCheck test with a loop-carried
    <4 x i8> PHI whose only user is a byte-wise add in the same block.

NOTE(review): this patch was recovered from a copy in which all line breaks
  were collapsed and every template argument list in the C++ hunk was missing.
  Line breaks were restored so the line counts match the hunk headers
  (-126,7 +126,38 and -0,0 +1,63). Indentation and comment alignment are
  reconstructed, not original. The template arguments were re-inserted by
  inference from how each result is used; confirm all of them against the
  upstream file before applying:
    - removed line:      isa<StoreInst, IntrinsicInst>(I)   (presumed)
    - first new check:   isa<IntrinsicInst>(I)              (presumed)
    - second new check:  isa<StoreInst>(I)    (matches the "Any store" comment)
    - BO:                dyn_cast<BinaryOperator>    (getOpcode() is called)
    - VT:                dyn_cast<FixedVectorType>   (getNumElements() is called)
    - IT:                dyn_cast<IntegerType>       (getBitWidth() is called)
    - trailing context:  SmallPtrSet<Instruction *, 4> CVisited   (presumed)

NOTE(review): the test header lists "a bitcast <4 x i8> -> i32 for the
  backedge" under "What we check", but no CHECK line matches that bitcast.
  Either add one or drop that bullet.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7504f1a8cea09..63e265612cbf7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -126,7 +126,38 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
-  bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); }
+  bool isOpLegal(const Instruction *I) {
+    if (isa<IntrinsicInst>(I))
+      return true;
+
+    // Any store is a profitable sink (prevents flip-flopping)
+    if (isa<StoreInst>(I))
+      return true;
+
+    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+      if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) {
+        if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) {
+          unsigned EB = IT->getBitWidth();
+          unsigned EC = VT->getNumElements();
+          // Check for SDWA-compatible operation
+          if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) {
+            switch (BO->getOpcode()) {
+            case Instruction::Add:
+            case Instruction::Sub:
+            case Instruction::And:
+            case Instruction::Or:
+            case Instruction::Xor:
+              return true;
+            default:
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    return false;
+  }
 
   bool isCoercionProfitable(Instruction *II) {
     SmallPtrSet<Instruction *, 4> CVisited;
diff --git a/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
new file mode 100644
index 0000000000000..dd534eb063315
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
@@ -0,0 +1,63 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN:   -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Purpose:
+; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
+;   loop header (same basic block as the PHI).
+; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
+;   the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
+;   placed in the header (enabling SDWA-friendly lowering later).
+;
+; What we check:
+; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
+; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
+; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge.
+
+define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
+entry:
+  br label %loop
+
+loop:
+  ; Loop index
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+  ; Loop-carried accumulator in vector-of-bytes form (problematic on input).
+  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+  ; Make up four i8 values derived from %i to avoid memory noise.
+  %i0 = trunc i32 %i to i8
+  %i1i = add i32 %i, 1
+  %i1 = trunc i32 %i1i to i8
+  %i2i = add i32 %i, 2
+  %i2 = trunc i32 %i2i to i8
+  %i3i = add i32 %i, 3
+  %i3 = trunc i32 %i3i to i8
+
+  ; Pack them into <4 x i8>.
+  %v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0
+  %v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1
+  %v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2
+  %v = insertelement <4 x i8> %v03, i8 %i3, i32 3
+
+  ; Byte-wise add in the same block as the PHI (this must make coercion profitable).
+  %acc.next = add <4 x i8> %acc, %v
+
+  ; Loop control.
+  %i.next = add i32 %i, 4
+  %cond = icmp slt i32 %i.next, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
+; CHECK: loop:
+; CHECK: %i = phi i32
+; CHECK-NOT: phi <4 x i8>
+; CHECK: %[[ACCI32:[^ ]+]] = phi i32
+; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
+; CHECK: add <4 x i8> %[[HDRCAST]],
+; CHECK: br i1
+