-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize early
#114481
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
3ba88ce
1376596
826c291
ab6f5a2
f8705fb
ed870a8
f5751a5
026ed00
195decc
1a7abaf
9aed76c
246c22f
5a11720
7cf7558
6a77b8a
be414a8
dedc593
c634b4e
c7be46f
ed9f19f
d30cb95
dcfe7be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { | |
| } | ||
| break; | ||
| } | ||
| case Intrinsic::amdgcn_wavefrontsize: { | ||
| // TODO: this is a workaround for the pseudo-generic target one gets with no | ||
| // specified mcpu, which spoofs its wave size to 64; it should be removed. | ||
| if ((ST->getCPU().empty() || ST->getCPU().starts_with("generic")) && | ||
jhuber6 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| !ST->getFeatureString().contains("+wavefrontsize")) | ||
|
||
| break; | ||
| return IC.replaceInstUsesWith( | ||
| II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); | ||
| } | ||
jhuber6 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| case Intrinsic::amdgcn_wqm_vote: { | ||
| // wqm_vote is identity when the argument is constant. | ||
| if (!isa<Constant>(II.getArgOperand(0))) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
| ; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
|
|
||
| define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
| ; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { | ||
| ; OPT-NEXT: [[BB:.*:]] | ||
| ; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]] | ||
| ; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-NEXT: ret void | ||
| ; | ||
| ; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
| ; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { | ||
| ; OPT-W32-NEXT: [[BB:.*:]] | ||
| ; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W32-NEXT: ret void | ||
| ; | ||
| ; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
| ; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { | ||
| ; OPT-W64-NEXT: [[BB:.*:]] | ||
| ; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W64-NEXT: ret void | ||
| ; | ||
| bb: | ||
| %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
| store i32 %tmp, ptr addrspace(1) %arg, align 4 | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
| ; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { | ||
| ; OPT-NEXT: [[BB:.*:]] | ||
| ; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] | ||
| ; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 | ||
| ; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 | ||
| ; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-NEXT: ret void | ||
| ; | ||
| ; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
| ; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { | ||
| ; OPT-W32-NEXT: [[BB:.*:]] | ||
| ; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W32-NEXT: ret void | ||
| ; | ||
| ; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
| ; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { | ||
| ; OPT-W64-NEXT: [[BB:.*:]] | ||
| ; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W64-NEXT: ret void | ||
| ; | ||
| bb: | ||
| %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
| %tmp1 = icmp ugt i32 %tmp, 32 | ||
| %tmp2 = select i1 %tmp1, i32 2, i32 1 | ||
| store i32 %tmp2, ptr addrspace(1) %arg | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
| ; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { | ||
| ; OPT-NEXT: [[BB:.*:]] | ||
| ; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] | ||
| ; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 | ||
| ; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] | ||
| ; OPT: [[BB2]]: | ||
| ; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-NEXT: br label %[[BB3]] | ||
| ; OPT: [[BB3]]: | ||
| ; OPT-NEXT: ret void | ||
| ; | ||
| ; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
| ; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { | ||
| ; OPT-W32-NEXT: [[BB:.*:]] | ||
| ; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]] | ||
| ; OPT-W32: [[BB2]]: | ||
| ; OPT-W32-NEXT: br label %[[BB3]] | ||
| ; OPT-W32: [[BB3]]: | ||
| ; OPT-W32-NEXT: ret void | ||
| ; | ||
| ; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
| ; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { | ||
| ; OPT-W64-NEXT: [[BB:.*:]] | ||
| ; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]] | ||
| ; OPT-W64: [[BB2]]: | ||
| ; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W64-NEXT: br label %[[BB3]] | ||
| ; OPT-W64: [[BB3]]: | ||
| ; OPT-W64-NEXT: ret void | ||
| ; | ||
| bb: | ||
| %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
| %tmp1 = icmp ugt i32 %tmp, 32 | ||
| br i1 %tmp1, label %bb2, label %bb3 | ||
|
|
||
| bb2: ; preds = %bb | ||
| store i32 1, ptr addrspace(1) %arg, align 4 | ||
| br label %bb3 | ||
|
|
||
| bb3: ; preds = %bb2, %bb | ||
| ret void | ||
| } | ||
|
|
||
| declare i32 @llvm.amdgcn.wavefrontsize() #0 | ||
|
|
||
| attributes #0 = { nounwind readnone speculatable } |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A real solution would be two builds, but spoofing it as 64 works (likely unintentionally) because we don't do any w64-specific changes yet, and w64 can always be narrowed to w32 but not the other way around.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think that this interpretation is actually correct: if you rely on lockstep of a full wave and you optimise around wavesize, this will break in bad ways on wave32. The current `generic` is not particularly good, but we have to live with it for now, I guess.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We already do some light 64->32 folds that are only sort of correct.
Technically we could make exec_hi an allocatable scratch register in wave32, but what we do now bakes in an assumption that exec_hi must always be 0.
But yes, the only way to really avoid any possible edge cases (and support a future of machine-linked libraries) is to have totally separate builds.