[AMDGPU] Apply alignment attr for make.buffer.rsrc #166914
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-amdgpu

Author: None (Shoreshen)

Changes

Calculating alignment for the make.buffer.rsrc intrinsic. The logic is that the alignment on a use of the return value of make.buffer.rsrc should be capped by the alignment of make.buffer.rsrc's base operand.

For example:

define float @foo(ptr addrspace(1) align X %ptr) {
  %fat.ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 C, i32 0)
  %y = load float, ptr addrspace(7) %fat.ptr, align Y
  ret float %y
}

We hope that Y = min(X, Y).

Full diff: https://github.com/llvm/llvm-project/pull/166914.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 56ab040706a13..70f2fbae08ada 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1603,7 +1603,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
&AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
- &AAAMDGPUClusterDims::ID});
+ &AAAMDGPUClusterDims::ID, &AAAlign::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1657,6 +1657,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
Ptr = RMW->getPointerOperand();
else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
Ptr = CmpX->getPointerOperand();
+ else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
+ A.getOrCreateAAFor<AAAlign>(IRPosition::value(*II));
if (Ptr) {
A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index a6ac7610a2c7a..37ff282343889 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -5279,6 +5279,12 @@ struct AAAlignImpl : AAAlign {
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
+ // For make.buffer.rsrc, the alignment strictly equals to the base's
+ // alignment
+ if (Instruction *I = dyn_cast<Instruction>(&getAssociatedValue()))
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
+ return;
SmallVector<Attribute, 4> Attrs;
A.getAttrs(getIRPosition(), {Attribute::Alignment}, Attrs);
for (const Attribute &Attr : Attrs)
@@ -5300,10 +5306,19 @@ struct AAAlignImpl : AAAlign {
if (isa<ConstantData>(AssociatedValue))
return ChangeStatus::UNCHANGED;
+ // For use of amdgcn.make.buffer.rsrc, the alignment equals to
+ // min(base, load/store)
+ bool IsMakeBufferRsrc = false;
+ if (Instruction *I = dyn_cast<Instruction>(&getAssociatedValue()))
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
+ IsMakeBufferRsrc = true;
for (const Use &U : AssociatedValue.uses()) {
if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
if (SI->getPointerOperand() == &AssociatedValue)
- if (SI->getAlign() < getAssumedAlign()) {
+ if (IsMakeBufferRsrc) {
+ SI->setAlignment(std::min(SI->getAlign(), getAssumedAlign()));
+ } else if (SI->getAlign() < getAssumedAlign()) {
STATS_DECLTRACK(AAAlign, Store,
"Number of times alignment added to a store");
SI->setAlignment(getAssumedAlign());
@@ -5311,14 +5326,18 @@ struct AAAlignImpl : AAAlign {
}
} else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
if (LI->getPointerOperand() == &AssociatedValue)
- if (LI->getAlign() < getAssumedAlign()) {
+ if (IsMakeBufferRsrc) {
+ LI->setAlignment(std::min(LI->getAlign(), getAssumedAlign()));
+ } else if (LI->getAlign() < getAssumedAlign()) {
LI->setAlignment(getAssumedAlign());
STATS_DECLTRACK(AAAlign, Load,
"Number of times alignment added to a load");
InstrChanged = ChangeStatus::CHANGED;
}
} else if (auto *RMW = dyn_cast<AtomicRMWInst>(U.getUser())) {
- if (RMW->getPointerOperand() == &AssociatedValue) {
+ if (IsMakeBufferRsrc) {
+ RMW->setAlignment(std::min(RMW->getAlign(), getAssumedAlign()));
+ } else if (RMW->getPointerOperand() == &AssociatedValue) {
if (RMW->getAlign() < getAssumedAlign()) {
STATS_DECLTRACK(AAAlign, AtomicRMW,
"Number of times alignment added to atomicrmw");
@@ -5328,7 +5347,9 @@ struct AAAlignImpl : AAAlign {
}
}
} else if (auto *CAS = dyn_cast<AtomicCmpXchgInst>(U.getUser())) {
- if (CAS->getPointerOperand() == &AssociatedValue) {
+ if (IsMakeBufferRsrc) {
+ CAS->setAlignment(std::min(CAS->getAlign(), getAssumedAlign()));
+ } else if (CAS->getPointerOperand() == &AssociatedValue) {
if (CAS->getAlign() < getAssumedAlign()) {
STATS_DECLTRACK(AAAlign, AtomicCmpXchg,
"Number of times alignment added to cmpxchg");
@@ -5554,6 +5575,15 @@ struct AAAlignCallSiteReturned final
std::min(this->getAssumedAlign(), Alignment).value());
break;
}
+ case Intrinsic::amdgcn_make_buffer_rsrc: {
+ const auto *AlignAA =
+ A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))),
+ DepClassTy::REQUIRED);
+ if (AlignAA && AlignAA->isValidState())
+ return clampStateAndIndicateChange<StateType>(
+ this->getState(), AlignAA->getAssumedAlign().value());
+ break;
+ }
default:
break;
}
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-align.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-align.ll
new file mode 100644
index 0000000000000..8d2bfab09460b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-align.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s
+
+define float @load_gt_base(ptr align 4 %p) {
+; CHECK-LABEL: define float @load_gt_base(
+; CHECK-SAME: ptr align 4 [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[PTR:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr align 4 [[P]], i16 0, i64 0, i32 0)
+; CHECK-NEXT: [[LOADED:%.*]] = load float, ptr addrspace(7) [[PTR]], align 4
+; CHECK-NEXT: ret float [[LOADED]]
+;
+ %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i64 0, i32 0)
+ %loaded = load float, ptr addrspace(7) %ptr, align 8
+ ret float %loaded
+}
+
+define float @load_lt_base(ptr align 8 %p) {
+; CHECK-LABEL: define float @load_lt_base(
+; CHECK-SAME: ptr align 8 [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR:%.*]] = call align 8 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr align 8 [[P]], i16 0, i64 0, i32 0)
+; CHECK-NEXT: [[LOADED:%.*]] = load float, ptr addrspace(7) [[PTR]], align 4
+; CHECK-NEXT: ret float [[LOADED]]
+;
+ %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i64 0, i32 0)
+ %loaded = load float, ptr addrspace(7) %ptr, align 4
+ ret float %loaded
+}
diff --git a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
index 1ab607465dbbb..34bbfa8974747 100644
--- a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
+++ b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
@@ -306,7 +306,7 @@ define amdgpu_kernel void @test_call_untouched_ptr() {
define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) {
; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer(
; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i64 noundef 0, i32 noundef 0) #[[ATTR11:[0-9]+]]
+; AMDGCN-NEXT: [[RSRC:%.*]] = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i64 noundef 0, i32 noundef 0) #[[ATTR11:[0-9]+]]
; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4
; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR7]]
; AMDGCN-NEXT: ret void
@@ -321,7 +321,7 @@ define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) {
define amdgpu_kernel void @test_make_buffer_noalias(ptr addrspace(1) noalias %ptr) {
; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer_noalias(
; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i64 noundef 0, i32 noundef 0) #[[ATTR11]]
+; AMDGCN-NEXT: [[RSRC:%.*]] = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i64 noundef 0, i32 noundef 0) #[[ATTR11]]
; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4, !invariant.load [[META0]]
; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR7]]
; AMDGCN-NEXT: ret void
shiltian
left a comment
I don't think we should put handling of AMDGPU-specific code into the generic code. We could potentially create a class in AMDGPUAttributor, inheriting from the existing ones, dedicated to handling AMDGPU-specific cases.
AMDGPUAttributor isn't really the place for it either. It's not an AMDGPU-specific attribute.
Title should make it clear this is about inferring it, not changing the intrinsic definition.
const auto *AlignAA =
    A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))),
                        DepClassTy::REQUIRED);
if (AlignAA && AlignAA->isValidState())
Does TargetTransformInfo have some kind of alignment propagation already? I thought it did
Hi @arsenm, I had a look but I can't be sure. There are lots of alignment-related functions, but they are mainly used for legality checks and cost computation.
I'm not really familiar with the struct; could you be more specific, so that I have some direction to search?
Thanks
It is an AMDGPU-specific intrinsic.
I'd like to flag #167553 as related (and to make sure we're on the same page as to what we mean by […]).
llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
}

struct AAAMDGPUMakeBufferRsrcAlign
I'd make it something like AAAMDGPUAlign, and then use it to deal with all AMDGPU related alignments.
Also, document the new class.
That does not make the transform AMDGPU specific. This is applying target intrinsic knowledge to a generic intrinsic. The handling should be directly in AAAlign. I would rank handling it as an AMDGPU-specific Attributor attribute significantly worse than just hardcoding the target intrinsic in AAAlign. Better would be to have TTI handle the relevant parsing.
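To make the TTI idea concrete, here is a minimal, purely illustrative sketch; getAssumedIntrinsicReturnAlign is a hypothetical helper (not an existing TargetTransformInfo interface) and is only meant to show where the target-specific knowledge could live:

// Hypothetical helper (not an existing TTI hook): given an intrinsic call,
// report the alignment its returned pointer can be assumed to have.
// For amdgcn.make.buffer.rsrc the fat pointer is only as aligned as the
// base pointer it wraps (operand 0).
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/Alignment.h"
#include <optional>

static std::optional<llvm::Align>
getAssumedIntrinsicReturnAlign(const llvm::IntrinsicInst &II,
                               const llvm::DataLayout &DL) {
  if (II.getIntrinsicID() != llvm::Intrinsic::amdgcn_make_buffer_rsrc)
    return std::nullopt;
  // Alignment of the returned ptr addrspace(7) is inherited from the base.
  return II.getArgOperand(0)->getPointerAlignment(DL);
}

A generic client such as AAAlign could then consult a hook of this shape instead of hardcoding amdgcn_make_buffer_rsrc.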
We could probably use the TTI information from #140802 to get the alignment.
krzysz00
left a comment
I'm with @arsenm on "this goes in the generic align handling"
Overall, can we see a few more test cases - a load/store that is underaligned and a function argument with aligns on it, for example.
I'd also like to see (especially after my docs PR lands) a note somewhere in the code that this is a conservatively correct approach to this alignment inference.
This shouldn't go into generic handling because the target intrinsic handling rules are highly target-dependent, and I don't see a TTI hook improving that. In this case, we expect that when a pointer in AS7 has alignment X and a load has alignment Y, it should be min(X, Y). On the other hand, with the […]. Lastly, based on the description, this logic doesn't seem to belong in the attributor because it doesn't require iterative inference.
Ptr = CmpX->getPointerOperand();
else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
  if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
    A.getOrCreateAAFor<AAAMDGPUMakeBufferRsrcAlign>(
The creation of the AA is weird here as well. I'd expect that when we want to know the alignment of a load/store instruction and see that the pointer is a buffer pointer, then we do something. Here it looks like it is completely upside down.
I'll note that the property we're trying to capture here is "buffer fat pointers have component-wise alignment guarantees". That is, per documentation I just landed, an align(4) access to a buffer fat pointer is a claim that the base pointer and the offset are both align(4). The can't-be-incorrect thing to do here is to clamp the alignment of these accesses to the inferrable alignment of the base pointer. (You could also go the other way, assume that all the loads and stores are aligned as specified, and infer an alignment for the base pointer from that, but that feels like a footgun.) And re target-specific intrinsic handling: we already put in infrastructure for handling […].
Not really; the range of what is possible is quite bounded. The mechanics are universal, and then the target information is just how to evaluate within those constraints.
Noting that we tried to discuss this broad design question in the weekly backend team meeting, and @arsenm unfortunately wasn't there.
In #137639, we plan to cover "arbitrary future behaviours" by introducing a TTI callback that returns an enum of documented rules. The client (which is UniformityAnalysis in this case) interprets the returned enum and applies the appropriate semantics. An equivalent option would have been to let the backend implement arbitrary logic when it overrides the callback, but we wanted something cacheable in that change.
I think we are discussing two problems here. The first one is where to put the logic. Regarding the hook in TTI, I don't see it as something like […]. Second, do we really need this in the attributor? Can the pointer operand of a load from AS7 be something not from the intrinsic, such that we would need iterative inference? I did ask this question in a separate PR, and IIUC we don't really support things like GEP at this moment. Are we going to support it in the future? Sure, but even with that, do we need iterative inference? I'm not sure. The Attributor framework is extremely expensive, but it doesn't seem like this really benefits from it. I think we can just run the attributor like normal, then query the alignment of the pointer operand of the intrinsic, do the math, and update the alignment of the instruction; that's all. We really don't need to create a node on the attributor graph, and nobody else would use its intermediate value at all.
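As a rough illustration of that non-Attributor alternative, here is a hypothetical helper that simply walks the function after the normal attributor run; the name is made up, and only plain loads and stores are shown (atomics would be handled the same way):

#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include <algorithm>

// Cap the alignment of accesses through a make.buffer.rsrc result at the
// alignment known for its base pointer operand: align = min(base, access).
static void clampFatPointerAccessAligns(llvm::Function &F) {
  using namespace llvm;
  const DataLayout &DL = F.getParent()->getDataLayout();
  for (Instruction &I : instructions(F)) {
    auto *II = dyn_cast<IntrinsicInst>(&I);
    if (!II || II->getIntrinsicID() != Intrinsic::amdgcn_make_buffer_rsrc)
      continue;
    // Query whatever alignment is already known for the base operand.
    Align BaseAlign = II->getArgOperand(0)->getPointerAlignment(DL);
    for (User *U : II->users()) {
      if (auto *LI = dyn_cast<LoadInst>(U)) {
        if (LI->getPointerOperand() == II)
          LI->setAlignment(std::min(LI->getAlign(), BaseAlign));
      } else if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == II)
          SI->setAlignment(std::min(SI->getAlign(), BaseAlign));
      }
    }
  }
}

Whether this belongs in a standalone cleanup or at the end of the attributor run is exactly the open question in this thread.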
The pointer operand could be a function argument or a GEP result or ... anything you can do to a pointer, really. I think the simplest thing to do would be to say that […]. Therefore, if I have a […]. So I think that adding generic handling for "this returns an existing pointer without capturing" might be the right call.
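For the GEP case specifically, the conservatively correct bound can be computed with the existing commonAlignment helper; the sketch below is illustrative only (the helper name is made up) and restricts itself to constant offsets, where variable offsets would need known-bits reasoning:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Alignment.h"
#include <optional>

// Illustrative only: if the fat pointer is a GEP with a constant byte offset
// off a base whose alignment we know, the access alignment cannot exceed
// commonAlignment(BaseAlign, Offset); e.g. base align 8 + offset 2 -> align 2.
static std::optional<llvm::Align>
maxAlignThroughGEP(const llvm::GEPOperator &GEP, llvm::Align BaseAlign,
                   const llvm::DataLayout &DL) {
  using namespace llvm;
  APInt Offset(DL.getIndexSizeInBits(GEP.getPointerAddressSpace()), 0);
  if (!GEP.accumulateConstantOffset(DL, Offset))
    return std::nullopt; // Variable offset: would need known-bits reasoning.
  return commonAlignment(BaseAlign, Offset.getZExtValue());
}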
In the previous PR that handles the […]
The one question is: do we want the following

define float @overaligned(ptr addrspace(1) align 2 inreg %x, i32 inreg %off) {
  %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc(%x, ...)
  %spot = getelementptr i8, ptr addrspace(7) %rsrc, i32 %off
  ;;; UB! This load is "align 4" but the base pointer is only known to be aligned to 2.
  ;;; Because both the base and `%off` are workgroup-uniform, this `load` is (though we don't do
  ;;; this atm) promotable to s_buffer_load, where the last two bits of `%x`'s address would be
  ;;; silently dropped.
  %ret = load float, ptr addrspace(7) %spot, align 4, !invariant.load
  ret float %ret
}

to just stay as UB, or do we want to fix it to be […]?
The attributor will change the alignment of […]