-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU] LiveRegOptimizer: fix PHI same-BB filter; consider i8/i16 binops on SDWA #155800
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
57301a3
7e1412f
555dada
58630e3
e49e804
77da27f
34bc9a5
10e5f32
63cd09e
a5f189c
a239189
e61e251
9c1028e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -126,7 +126,39 @@ class LiveRegOptimizer { | |
return LK.first != TargetLoweringBase::TypeLegal; | ||
} | ||
|
||
bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); } | ||
bool isOpLegal(Instruction *I) { | ||
if (auto *Intr = dyn_cast<IntrinsicInst>(I)) | ||
return true; // FIXME: narrow to known native intrinsics | ||
// (DOT/MFMA/tbuffer) or use TTI cost. | ||
|
||
// Any store is a profitable sink (prevents flip-flopping) | ||
if (isa<StoreInst>(I)) | ||
return true; | ||
|
||
// Treat small-int vector binops as profitable when SDWA is available | ||
if (auto *BO = dyn_cast<BinaryOperator>(I)) { | ||
if (auto *VTy = dyn_cast<VectorType>(BO->getType())) { | ||
Type *Elt = VTy->getElementType(); | ||
// Treat small-int vector binops as profitable when SDWA is available. | ||
|
||
// We explicitly gate to 8/16-bit to avoid i1 vectors and keep behavior | ||
// tight. | ||
if ((Elt->isIntegerTy(8) || (Elt->isIntegerTy(16)) && ST.hasSDWA())) { | ||
switch (BO->getOpcode()) { | ||
case Instruction::Add: | ||
case Instruction::Sub: | ||
case Instruction::And: | ||
case Instruction::Or: | ||
case Instruction::Xor: | ||
return true; | ||
default: | ||
break; | ||
} | ||
} | ||
} | ||
} | ||
|
||
return false; | ||
} | ||
|
||
bool isCoercionProfitable(Instruction *II) { | ||
SmallPtrSet<Instruction *, 4> CVisited; | ||
|
@@ -150,7 +182,12 @@ class LiveRegOptimizer { | |
if (!CVisited.insert(CII).second) | ||
continue; | ||
|
||
if (CII->getParent() == II->getParent() && !IsLookThru(II)) | ||
// Allow same-BB non-lookthrough users when the def is a PHI: | ||
// loop headers frequently consume the carried value in the header block | ||
// (e.g. byte-wise vector binops). We *do* want to coerce across the | ||
// backedge in that common case to enable packed i32 + SDWA lowering. | ||
if (CII->getParent() == II->getParent() && !IsLookThru(CII) && | ||
!isa<PHINode>(II)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The addition of phi here feels like a separate change from the above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You’re right—there are two separate changes:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you split this into a separate PR? It's hard to see that all the cases are adequately tested when they're together There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I can. Given this PR will be for isOpLegal/isProfitableSink improvement, what would you like to see separated? II->CII fix? PHI exception? Or maybe two separate PRs, one for each? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @arsenm, I instrumented the TTI queries on gfx90a: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, TTI is a mess and we need to go fix all the costs, especially for illegal operations. In particular the costs for memory instructions are way too low |
||
continue; | ||
|
||
if (isOpLegal(CII)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
; REQUIRES: amdgpu-registered-target | ||
; RUN: opt -S -passes=amdgpu-late-codegenprepare \ | ||
; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s | ||
|
||
; Purpose: | ||
; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the | ||
; loop header (same basic block as the PHI). | ||
; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across | ||
; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be | ||
; placed in the header (enabling SDWA-friendly lowering later). | ||
; | ||
; What we check: | ||
; - PHI is i32 (no loop-carried <4 x i8> PHI remains). | ||
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add. | ||
; - The loopexit produces a bitcast <4 x i8> -> i32 for the backedge. | ||
|
||
define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) { | ||
entry: | ||
br label %loop | ||
|
||
loop: | ||
; Loop index | ||
%i = phi i32 [ 0, %entry ], [ %i.next, %loop ] | ||
|
||
; Loop-carried accumulator in vector-of-bytes form (problematic on input). | ||
%acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ] | ||
|
||
; Make up four i8 values derived from %i to avoid memory noise. | ||
%i0 = trunc i32 %i to i8 | ||
%i1i = add i32 %i, 1 | ||
%i1 = trunc i32 %i1i to i8 | ||
%i2i = add i32 %i, 2 | ||
%i2 = trunc i32 %i2i to i8 | ||
%i3i = add i32 %i, 3 | ||
%i3 = trunc i32 %i3i to i8 | ||
|
||
; Pack them into <4 x i8>. | ||
%v01 = insertelement <4 x i8> zeroinitializer, i8 %i0, i32 0 | ||
%v02 = insertelement <4 x i8> %v01, i8 %i1, i32 1 | ||
%v03 = insertelement <4 x i8> %v02, i8 %i2, i32 2 | ||
%v = insertelement <4 x i8> %v03, i8 %i3, i32 3 | ||
|
||
; Byte-wise add in the same block as the PHI (this must make coercion profitable). | ||
%acc.next = add <4 x i8> %acc, %v | ||
|
||
; Loop control. | ||
%i.next = add i32 %i, 4 | ||
%cond = icmp slt i32 %i.next, %n | ||
br i1 %cond, label %loop, label %exit | ||
|
||
exit: | ||
ret void | ||
} | ||
|
||
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi( | ||
; CHECK: loop: | ||
; CHECK: %i = phi i32 | ||
; CHECK-NOT: phi <4 x i8> | ||
; CHECK: %[[ACCI32:[^ ]+]] = phi i32 | ||
; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8> | ||
; CHECK: add <4 x i8> %[[HDRCAST]], | ||
; CHECK: br i1 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.