-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize early
#114481
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
3ba88ce
1376596
826c291
ab6f5a2
f8705fb
ed870a8
f5751a5
026ed00
195decc
1a7abaf
9aed76c
246c22f
5a11720
7cf7558
6a77b8a
be414a8
dedc593
c634b4e
c7be46f
ed9f19f
d30cb95
dcfe7be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| //===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// | ||
| // | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
| // This file implements a pass that expands AMDGCN generic pseudo-intrinsics | ||
| // into target specific quantities / sequences. In this context, a | ||
| // pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a | ||
| // specific instruction, but rather is intended as a mechanism for abstractly | ||
| // conveying target specific info to a high-level language (HLL) / the front | ||
| // end (FE), without concretely impacting the AST. An example of such an | ||
| // intrinsic is amdgcn.wavefrontsize. This pass should run as early as | ||
| // possible / immediately after Clang CodeGen, so that the optimisation | ||
| // pipeline and the back end (BE) operate with concrete target data. | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #include "AMDGPU.h" | ||
| #include "AMDGPUTargetMachine.h" | ||
| #include "GCNSubtarget.h" | ||
|
|
||
| #include "llvm/IR/Constants.h" | ||
| #include "llvm/IR/Function.h" | ||
| #include "llvm/IR/IntrinsicsAMDGPU.h" | ||
| #include "llvm/IR/Module.h" | ||
| #include "llvm/Pass.h" | ||
|
|
||
| using namespace llvm; | ||
|
|
||
| /// Returns true when \p ST has no CPU name or the "generic" CPU name. Such a | ||
| /// subtarget is not a concrete target, so its wave size must not be folded. | ||
| static bool isGenericSubtarget(const GCNSubtarget &ST) { | ||
|   return ST.getCPU().empty() || ST.getCPU() == "generic"; | ||
| } | ||
|
|
||
| /// Replaces the result of each llvm.amdgcn.wavefrontsize call with the | ||
| /// constant wave size of the subtarget of the function containing that call. | ||
| /// | ||
| /// Calls located in functions whose subtarget is not a concrete target are | ||
| /// left untouched. The replaced calls themselves are not erased here; they | ||
| /// simply lose all their uses. | ||
| /// | ||
| /// \returns PreservedAnalyses::none() if at least one call was folded, | ||
| /// PreservedAnalyses::all() otherwise. | ||
| PreservedAnalyses | ||
| AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { | ||
|   Function *WaveSize = | ||
|       Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize); | ||
|   if (!WaveSize || WaveSize->hasZeroLiveUses()) | ||
|     return PreservedAnalyses::all(); | ||
|
|
||
|   bool Changed = false; | ||
|   for (User *U : WaveSize->users()) { | ||
|     // Only an instruction has an enclosing function whose subtarget can be | ||
|     // queried. | ||
|     auto *Call = dyn_cast<Instruction>(U); | ||
|     if (!Call) | ||
|       continue; | ||
|
|
||
|     // Query the subtarget of the calling function rather than of an arbitrary | ||
|     // function of the module: the first function may be a declaration, and | ||
|     // functions in one module may be compiled for different wave sizes. | ||
|     const auto &ST = TM.getSubtarget<GCNSubtarget>(*Call->getFunction()); | ||
|
|
||
|     // This is not a concrete target, we should not fold early. | ||
|     if (isGenericSubtarget(ST)) | ||
|       continue; | ||
|
|
||
|     // This rewrites the uses of the call, not the uses of the intrinsic | ||
|     // declaration, so the user list being iterated is not modified. | ||
|     Call->replaceAllUsesWith( | ||
|         ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize())); | ||
|     Changed = true; | ||
|   } | ||
|
|
||
|   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { | |
| } | ||
| break; | ||
| } | ||
| case Intrinsic::amdgcn_wavefrontsize: { | ||
| // TODO: this is a workaround for the pseudo-generic target one gets with no | ||
| // specified mcpu, which spoofs its wave size to 64; it should be removed. | ||
|
||
| if ((ST->getCPU().empty() || ST->getCPU() == "generic") && | ||
|
||
| !ST->getFeatureString().contains("+wavefrontsize")) | ||
|
||
| break; | ||
| return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), | ||
| ST->getWavefrontSize())); | ||
rampitec marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
jhuber6 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| case Intrinsic::amdgcn_wqm_vote: { | ||
| // wqm_vote is identity when the argument is constant. | ||
| if (!isa<Constant>(II.getArgOperand(0))) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
AlexVlx marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s | ||
|
|
@@ -6,48 +7,78 @@ | |
|
|
||
| ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s | ||
jhuber6 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s | ||
| ; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
|
||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
| ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
|
|
||
| ; GCN-LABEL: {{^}}fold_wavefrontsize: | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
|
|
||
| ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 | ||
| ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64 | ||
| ; GCN: store_{{dword|b32}} v{{.+}}, [[V]] | ||
|
|
||
| ; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() | ||
| ; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4 | ||
| ; OPT-NEXT: ret void | ||
|
|
||
| define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
| ; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
| ; OPT-NEXT: [[BB:.*:]] | ||
| ; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]] | ||
| ; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-NEXT: ret void | ||
| ; | ||
| ; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
| ; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
| ; OPT-W64-NEXT: [[BB:.*:]] | ||
| ; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W64-NEXT: ret void | ||
| ; | ||
| ; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
| ; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
| ; OPT-W32-NEXT: [[BB:.*:]] | ||
| ; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W32-NEXT: ret void | ||
| ; | ||
| bb: | ||
| %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
| store i32 %tmp, ptr addrspace(1) %arg, align 4 | ||
| ret void | ||
| } | ||
|
|
||
| ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize: | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
|
|
||
| ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}} | ||
| ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}} | ||
| ; GCN-NOT: cndmask | ||
| ; GCN: store_{{dword|b32}} v{{.+}}, [[V]] | ||
|
|
||
| ; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() | ||
| ; OPT: %tmp1 = icmp ugt i32 %tmp, 32 | ||
| ; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1 | ||
| ; OPT: store i32 %tmp2, ptr addrspace(1) %arg | ||
| ; OPT-NEXT: ret void | ||
|
|
||
| define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
| ; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
| ; OPT-NEXT: [[BB:.*:]] | ||
| ; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] | ||
| ; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 | ||
| ; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 | ||
| ; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-NEXT: ret void | ||
| ; | ||
| ; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
| ; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
| ; OPT-W64-NEXT: [[BB:.*:]] | ||
| ; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W64-NEXT: ret void | ||
| ; | ||
| ; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
| ; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
| ; OPT-W32-NEXT: [[BB:.*:]] | ||
| ; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W32-NEXT: ret void | ||
| ; | ||
| bb: | ||
| %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
| %tmp1 = icmp ugt i32 %tmp, 32 | ||
|
|
@@ -57,15 +88,31 @@ bb: | |
| } | ||
|
|
||
| ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
|
|
||
| ; OPT: bb: | ||
| ; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() | ||
| ; OPT: %tmp1 = icmp ugt i32 %tmp, 32 | ||
| ; OPT: bb3: | ||
| ; OPT-NEXT: ret void | ||
|
|
||
| define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
| ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
| ; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
| ; OPT-NEXT: [[BB:.*:]] | ||
| ; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] | ||
| ; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 | ||
| ; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] | ||
| ; OPT: [[BB2]]: | ||
| ; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-NEXT: br label %[[BB3]] | ||
| ; OPT: [[BB3]]: | ||
| ; OPT-NEXT: ret void | ||
| ; | ||
| ; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
| ; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
| ; OPT-W64-NEXT: [[BB:.*:]] | ||
| ; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
| ; OPT-W64-NEXT: ret void | ||
| ; | ||
| ; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
| ; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { | ||
| ; OPT-W32-NEXT: [[BB:.*:]] | ||
| ; OPT-W32-NEXT: ret void | ||
| ; | ||
| bb: | ||
| %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
| %tmp1 = icmp ugt i32 %tmp, 32 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pass isn't needed now?