-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[Attributor] Take the address space from addrspacecast directly #108258
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-transforms Author: Shilei Tian (shiltian) Changes: If the value to be analyzed comes directly from an addrspacecast, we take the source address space directly. Fix SWDEV-482640. Full diff: https://github.com/llvm/llvm-project/pull/108258.diff 4 Files Affected:
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 6ab63ba582c546..921fe945539510 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -6249,7 +6249,7 @@ struct AAAddressSpace : public StateWrapper<BooleanState, AbstractAttribute> {
/// Return the address space of the associated value. \p NoAddressSpace is
/// returned if the associated value is dead. This functions is not supposed
/// to be called if the AA is invalid.
- virtual int32_t getAddressSpace() const = 0;
+ virtual uint32_t getAddressSpace() const = 0;
/// Create an abstract attribute view for the position \p IRP.
static AAAddressSpace &createForPosition(const IRPosition &IRP,
@@ -6268,7 +6268,7 @@ struct AAAddressSpace : public StateWrapper<BooleanState, AbstractAttribute> {
}
// No address space which indicates the associated value is dead.
- static const int32_t NoAddressSpace = -1;
+ static const uint32_t NoAddressSpace = ~0U;
/// Unique ID (due to the unique address)
static const char ID;
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index b73f5265f87874..49195da211035d 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12562,7 +12562,7 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
AAAddressSpaceImpl(const IRPosition &IRP, Attributor &A)
: AAAddressSpace(IRP, A) {}
- int32_t getAddressSpace() const override {
+ uint32_t getAddressSpace() const override {
assert(isValidState() && "the AA is invalid");
return AssumedAddressSpace;
}
@@ -12571,12 +12571,37 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
void initialize(Attributor &A) override {
assert(getAssociatedType()->isPtrOrPtrVectorTy() &&
"Associated value is not a pointer");
- if (getAssociatedType()->getPointerAddressSpace())
+ // If the pointer already has non-generic address space, we assume it is the
+ // correct one.
+ if (getAssociatedType()->getPointerAddressSpace()) {
+ [[maybe_unused]] bool R =
+ takeAddressSpace(getAssociatedType()->getPointerAddressSpace());
+ assert(R && "the take should happen");
indicateOptimisticFixpoint();
+ return;
+ }
+ // If the pointer is an addrspacecast, we assume the source address space is
+ // the correct one.
+ Value *V = &getAssociatedValue();
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
+ [[maybe_unused]] bool R = takeAddressSpace(ASC->getSrcAddressSpace());
+ assert(R && "the take should happen");
+ indicateOptimisticFixpoint();
+ return;
+ }
+ if (auto *C = dyn_cast<ConstantExpr>(V)) {
+ if (C->getOpcode() == Instruction::AddrSpaceCast) {
+ [[maybe_unused]] bool R = takeAddressSpace(
+ C->getOperand(0)->getType()->getPointerAddressSpace());
+ assert(R && "the take should happen");
+ indicateOptimisticFixpoint();
+ return;
+ }
+ }
}
ChangeStatus updateImpl(Attributor &A) override {
- int32_t OldAddressSpace = AssumedAddressSpace;
+ uint32_t OldAddressSpace = AssumedAddressSpace;
auto *AUO = A.getOrCreateAAFor<AAUnderlyingObjects>(getIRPosition(), this,
DepClassTy::REQUIRED);
auto Pred = [&](Value &Obj) {
@@ -12594,19 +12619,17 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
/// See AbstractAttribute::manifest(...).
ChangeStatus manifest(Attributor &A) override {
- Value *AssociatedValue = &getAssociatedValue();
- Value *OriginalValue = peelAddrspacecast(AssociatedValue);
- if (getAddressSpace() == NoAddressSpace ||
- static_cast<uint32_t>(getAddressSpace()) ==
- getAssociatedType()->getPointerAddressSpace())
+ unsigned NewAS = getAddressSpace();
+ if (NewAS == NoAddressSpace ||
+ NewAS == getAssociatedType()->getPointerAddressSpace())
return ChangeStatus::UNCHANGED;
- PointerType *NewPtrTy =
- PointerType::get(getAssociatedType()->getContext(),
- static_cast<uint32_t>(getAddressSpace()));
+ Value *AssociatedValue = &getAssociatedValue();
+ Value *OriginalValue = peelAddrspacecast(AssociatedValue);
bool UseOriginalValue =
- OriginalValue->getType()->getPointerAddressSpace() ==
- static_cast<uint32_t>(getAddressSpace());
+ OriginalValue->getType()->getPointerAddressSpace() == NewAS;
+ PointerType *NewPtrTy =
+ PointerType::get(getAssociatedType()->getContext(), NewAS);
bool Changed = false;
@@ -12656,9 +12679,9 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
}
private:
- int32_t AssumedAddressSpace = NoAddressSpace;
+ uint32_t AssumedAddressSpace = NoAddressSpace;
- bool takeAddressSpace(int32_t AS) {
+ bool takeAddressSpace(uint32_t AS) {
if (AssumedAddressSpace == NoAddressSpace) {
AssumedAddressSpace = AS;
return true;
@@ -12667,11 +12690,16 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
}
static Value *peelAddrspacecast(Value *V) {
- if (auto *I = dyn_cast<AddrSpaceCastInst>(V))
- return peelAddrspacecast(I->getPointerOperand());
+ if (auto *I = dyn_cast<AddrSpaceCastInst>(V)) {
+ assert(I->getSrcAddressSpace() && "there should not be AS X -> AS 0");
+ return I->getPointerOperand();
+ }
if (auto *C = dyn_cast<ConstantExpr>(V))
- if (C->getOpcode() == Instruction::AddrSpaceCast)
- return peelAddrspacecast(C->getOperand(0));
+ if (C->getOpcode() == Instruction::AddrSpaceCast) {
+ assert(C->getOperand(0)->getType()->getPointerAddressSpace() &&
+ "there should not be AS X -> AS 0");
+ return C->getOperand(0);
+ }
return V;
}
};
diff --git a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
index fdc5debb18915c..37a53c514603ad 100644
--- a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
+++ b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
@@ -243,3 +243,15 @@ define void @foo(ptr addrspace(3) %val) {
ret void
}
+define void @kernel_argument_promotion_pattern(ptr %p, i32 %val) {
+; CHECK-LABEL: define void @kernel_argument_promotion_pattern(
+; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[P_CAST_0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[P_CAST_0]], align 4
+; CHECK-NEXT: ret void
+;
+ %p.cast.0 = addrspacecast ptr %p to ptr addrspace(1)
+ %p.cast.1 = addrspacecast ptr addrspace(1) %p.cast.0 to ptr
+ store i32 %val, ptr %p.cast.1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 7336543b41cbc8..3deb882354a3bd 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -215,11 +215,14 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #0 {
}
; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
+; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast ptr addrspace(3) null to ptr
store volatile i32 7, ptr %cast
@@ -259,10 +262,14 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
+; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
+; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base
+
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
%cast = addrspacecast ptr addrspace(5) null to ptr
store volatile i32 7, ptr %cast
|
6728531 to
e168d2b
Compare
e168d2b to
3211bfc
Compare
| return true; | ||
| // If an argument in generic address space has addrspace cast uses, and | ||
| // those casts are same, then we take the dst addrspace. | ||
| if (auto *Arg = dyn_cast<Argument>(&Obj)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm wondering if it is safe for us to directly say here that, if it is an argument of an entry point function, it is in global address space.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think you can say that generically with the information available in a non-target pass. I would also consider this case a frontend bug, and they should emit parameters with the correct address space to begin with. We already do some hacking around clang using flat pointers in the backend
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, I'd say that as well, but this is exactly what we do in the kernel argument promotion pass. The front end emits things correctly but we did this on purpose in the pass, hoping InferAddressSpacePass to pick it up. I have conservative opinions on whether the pass should do that way. I don't know why we can't just update the argument type to ptr addrspace(1) instead of making this hassle. In OpenCL pre 2.0 we have that requirement anyway.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I talked to @rampitec that we can't change kernel's signature. We can only recognize the pattern and support it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Of course you can change the kernel signature. clang should emit IR with addrspace(1) values directly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Per discussion with @rampitec, in HIP all pointers which can be passed to a kernel are flat.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At a source level, yes. But that has no bearing on what IR clang produces. clang can and should emit ptr addrspace(1) pointers with a cast to addrspace(0)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If that's the case, the AMDGPUPromoteKernelArgumentsPass should not exist in the first place.
Both Flang and Clang (for OpenMP target offloading) still generate generic pointer.
HIP and OpenCL front ends generate AS1 pointer, which is convenient.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it shouldn't. It's a hack for frontends not emitting the nicer IR
| return true; | ||
| // If an argument in generic address space has addrspace cast uses, and | ||
| // those casts are same, then we take the dst addrspace. | ||
| if (auto *Arg = dyn_cast<Argument>(&Obj)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think you can say that generically with the information available in a non-target pass. I would also consider this case a frontend bug, and they should emit parameters with the correct address space to begin with. We already do some hacking around clang using flat pointers in the backend
3211bfc to
d749116
Compare
| return true; | ||
| // If an argument in generic address space has addrspace cast uses, and | ||
| // those casts are same, then we take the dst addrspace. | ||
| if (auto *Arg = dyn_cast<Argument>(&Obj)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Of course you can change the kernel signature. clang should emit IR with addrspace(1) values directly.
d749116 to
cc9440a
Compare
cc9440a to
5f9e01b
Compare
c06fe68 to
674a1a5
Compare
5f9e01b to
d16be3f
Compare
674a1a5 to
98f4627
Compare
d16be3f to
9beeba0
Compare
I don't think this is valid in general. You are allow to speculatively produce invalid addrspacecasts. For example: |
98f4627 to
30c83b8
Compare
9beeba0 to
20e76a5
Compare
That doesn't matter because the scope is the instruction. In your example here, the pointer of the store is already in the specific address space. That will not change no matter whether the branch is taken or not. |
20e76a5 to
f9d7257
Compare
30c83b8 to
7da66b7
Compare
f9d7257 to
dd7eeb4
Compare
7da66b7 to
6356a15
Compare
dd7eeb4 to
1b06067
Compare
6356a15 to
02b52be
Compare
1b06067 to
f85ea41
Compare
02b52be to
cfce39f
Compare
f85ea41 to
081015e
Compare
cfce39f to
fb2ed73
Compare
081015e to
4f56200
Compare
fb2ed73 to
8a04c03
Compare
4f56200 to
f79d612
Compare
|
I unstacked from #108786 to unblock this since the ticket needs to be fixed promptly. |
f79d612 to
8066997
Compare
|
bump |
8066997 to
242582b
Compare
| ret void | ||
| } | ||
|
|
||
| define void @vec_kernel_argument_promotion_pattern_intra_procedure(<2 x ptr> %p, i32 %val) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@arsenm I added the test here but they don't make too much sense. The AA only works for load/store like instructions when a pointer is used as an operand. Apparently <2 x ptr> is not a valid operand for load/store like instructions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this pass is much more limited compared to InferAddressSpaces by only operating per use (and only for specific uses). Plus this will produce a lot more spurious casts
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes
I removed them.
this pass is much more limited compared to InferAddressSpaces
Yet. I can definitely add more handling. I already added atomic operations. The assumption is that only load/store-like instructions (including their atomic counterparts) can benefit from a more specific address space. What else can also benefit from it? Some memory-related intrinsics could be the next target.
Besides, this is inter-procedural address space inference.
by only operating per use
This is more like a top-to-bottom vs bottom-to-top approach I'd argue.
Plus this will produce a lot more spurious casts
I don't think so. The use has to be proven to be in the more specific address space and a change will be manifested; otherwise it simply does nothing. The reason we have patch here is, other patch just adds a pattern that InferAddressSpaces can recognize.
If the value to be analyzed comes directly from an addrspacecast, we take the source address space directly. This is to improve the case where, in `AMDGPUPromoteKernelArgumentsPass`, the kernel argument is promoted by inserting an addrspacecast directly from a generic pointer. However, during the analysis, the underlying object will be the generic pointer, instead of the addrspacecast, thus the inferred address space is the generic one, which is not ideal.
242582b to
685d9bf
Compare
…#108258) Currently `AAAddressSpace` relies on identifying the address spaces of all underlying objects. However, it might infer a sub-optimal address space when the underlying object is a function argument. In `AMDGPUPromoteKernelArgumentsPass`, the promotion of a pointer kernel argument is done by adding a series of `addrspacecast` instructions (as shown below), and hoping `InferAddressSpacePass` can pick it up and do the rewriting accordingly. Before promotion: ``` define amdgpu_kernel void @kernel(ptr %to_be_promoted) { %val = load i32, ptr %to_be_promoted ... ret void } ``` After promotion: ``` define amdgpu_kernel void @kernel(ptr %to_be_promoted) { %ptr.cast.0 = addrspacecast ptr %to_be_promoted to ptr addrspace(1) %ptr.cast.1 = addrspacecast ptr addrspace(1) %ptr.cast.0 to ptr ; all uses of %to_be_promoted will use %ptr.cast.1 %val = load i32, ptr %ptr.cast.1 ... ret void } ``` When `AAAddressSpace` analyzes the code after promotion, it will take `%to_be_promoted` as the underlying object of `%ptr.cast.1`, and use its address space (which is 0) as its final address space, thus simply doing nothing in `manifest`. The attributor framework will then eliminate the address space cast from 0 to 1 and back to 0, and replace `%ptr.cast.1` with `%to_be_promoted`, which basically reverts all changes made by `AMDGPUPromoteKernelArgumentsPass`. IMHO I'm not sure if `AMDGPUPromoteKernelArgumentsPass` promotes the argument in a proper way. To improve the handling of this case, this PR adds extra handling when iterating over all underlying objects. If an underlying object is a function argument, we have reached a terminal and can't deduce its underlying object any further. In this case, we check all uses of the argument. If they are all `addrspacecast` instructions and their destination address spaces are the same, we take that destination address space. Fixes: SWDEV-482640.

Currently
`AAAddressSpace` relies on identifying the address spaces of all underlying objects. However, it might infer a sub-optimal address space when the underlying object is a function argument. In `AMDGPUPromoteKernelArgumentsPass`, the promotion of a pointer kernel argument is done by adding a series of `addrspacecast` instructions (as shown below), and hoping `InferAddressSpacePass` can pick it up and do the rewriting accordingly.
After promotion:
When
`AAAddressSpace` analyzes the code after promotion, it will take `%to_be_promoted` as the underlying object of `%ptr.cast.1`, and use its address space (which is 0) as its final address space, thus simply doing nothing in `manifest`. The attributor framework will then eliminate the address space cast from 0 to 1 and back to 0, and replace `%ptr.cast.1` with `%to_be_promoted`, which basically reverts all changes made by `AMDGPUPromoteKernelArgumentsPass`. IMHO I'm not sure if
`AMDGPUPromoteKernelArgumentsPass` promotes the argument in a proper way. To improve the handling of this case, this PR adds extra handling when iterating over all underlying objects. If an underlying object is a function argument, we have reached a terminal and can't deduce its underlying object any further. In this case, we check all uses of the argument. If they are all `addrspacecast` instructions and their destination address spaces are the same, we take that destination address space. Fixes: SWDEV-482640.