
Commit eda3535

[AMD] Do not pipeline via AsyncCopy for unsupported vec sizes (#7676)
This PR makes `canBeConvertedToAsyncLoad` more general and strict so that it only allows cases where there is a supported direct-to-lds `vecSize` smaller than or equal to the `vecSize` derived from contiguity. This catches cases where we load less than 32 bits, which was already rejected before this PR. Additionally, it catches cases where we cannot lower the `vecSize` to a supported size. In such cases we also cannot use `ttg.async_copy_global_to_local`, since we cannot split the contiguous elements owned by a thread into multiple load instructions; e.g. fp64 with `vecSize == 1` does not work with `ttg.async_copy_global_to_local` on GFX9.
1 parent 239e17f commit eda3535
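
To make the new gating concrete, here is a small, self-contained C++ sketch. `DirectToLdsStub`, `fitVecSize`, and the supported widths are assumptions for illustration only (gfx942 being limited to 32-bit direct-to-lds loads comes from the commit message; 128-bit support on gfx950 is inferred from the new test); the real query is `TargetInfo::supportsDirectToLdsLoadBitWidth`, and the halving loop mirrors the `fitToValidDirectToLdsVecSize` helper added by this commit.

```cpp
#include <cstdio>

// Stand-in for triton::AMD::TargetInfo, for illustration only: gfx942 offers
// 32-bit direct-to-lds loads, while gfx950 additionally offers 128-bit ones
// (wide enough for two packed fp64 values).
struct DirectToLdsStub {
  bool isGfx950;
  bool supportsDirectToLdsLoadBitWidth(unsigned bits) const {
    return bits == 32 || (isGfx950 && bits == 128);
  }
};

// Mirrors fitToValidDirectToLdsVecSize: halve the vector size until the load
// width is supported by the direct-to-lds path; return 0 if nothing fits.
unsigned fitVecSize(unsigned maxVecSize, unsigned elemBitwidth,
                    const DirectToLdsStub &target) {
  while (maxVecSize > 0 &&
         !target.supportsDirectToLdsLoadBitWidth(maxVecSize * elemBitwidth))
    maxVecSize /= 2;
  return maxVecSize;
}

int main() {
  DirectToLdsStub gfx942{false}, gfx950{true};
  // fp64 with vecSize == 1 on gfx942: 64 bits is unsupported and halving
  // reaches 0, so pipelining falls back to registers instead of
  // ttg.async_copy_global_to_local.
  std::printf("gfx942 fp64 vecSize=1 -> %u\n", fitVecSize(1, 64, gfx942));
  // fp64 with two contiguous elements per thread on gfx950: a 128-bit
  // direct-to-lds load exists, so AsyncCopy stays enabled with vecSize == 2.
  std::printf("gfx950 fp64 vecSize=2 -> %u\n", fitVecSize(2, 64, gfx950));
  return 0;
}
```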

5 files changed: +99 additions, −20 deletions


test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 59 additions & 0 deletions
@@ -738,3 +738,62 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#AL = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#C = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
+#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth=2}>
+#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth=2}>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // Verify that we do not get AsyncCopies because we cannot lower it on gfx942 since we only have 32bit wide loads to lds
+  // COMMON-LABEL: @reject_fp64_pipelining_with_async_copy_gfx942
+  // ASYNC-NOT: ttg.async_copy_global_to_local
+  tt.func @reject_fp64_pipelining_with_async_copy_gfx942(
+    %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL> {tt.divisibility = 16 : i32, tt.contiguity = 16 : i32},
+    %B : tensor<32x128xf64, #B>, %lb: i32, %ub: i32, %step: i32) -> tensor<128x128xf64, #C> {
+    %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf64, #C>
+    %loop = scf.for %iv = %lb to %ub step %step iter_args(%prev_c = %c_init) -> (tensor<128x128xf64, #C>) : i32 {
+      %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL>
+      %a = ttg.convert_layout %a_ : tensor<128x32xf64, #AL> -> tensor<128x32xf64, #A>
+      %c = tt.dot %a, %B, %prev_c : tensor<128x32xf64, #A> * tensor<32x128xf64, #B> -> tensor<128x128xf64, #C>
+      scf.yield %c : tensor<128x128xf64, #C>
+    }
+    tt.return %loop: tensor<128x128xf64, #C>
+  }
+}
+
+// -----
+
+#AL = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#BL = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#C = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
+#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth=2}>
+#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth=2}>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // On GFX950 we can use AsyncCopy if sizePerThread >= 2 and it's contiguous because we can load 2 fp64 with one direct to lds instruction
+  // COMMON-LABEL: @pipeline_fp64_with_async_copy_gfx950
+  // ASYNC: ttg.async_copy_global_to_local
+  // ASYNC: tt.load
+  // ASYNC: ttg.async_copy_global_to_local
+  // ASYNC: tt.load
+  tt.func @pipeline_fp64_with_async_copy_gfx950(
+    %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL> {tt.divisibility = 16 : i32, tt.contiguity = 16 : i32},
+    %b_ptr : tensor<32x128x!tt.ptr<f64>, #BL> {tt.divisibility = 16 : i32, tt.contiguity = 2 : i32},
+    %lb: i32, %ub: i32, %step: i32) -> tensor<128x128xf64, #C> {
+    %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf64, #C>
+    %loop = scf.for %iv = %lb to %ub step %step iter_args(%prev_c = %c_init) -> (tensor<128x128xf64, #C>) : i32 {
+      %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL>
+      %a = ttg.convert_layout %a_ : tensor<128x32xf64, #AL> -> tensor<128x32xf64, #A>
+      %b_ = tt.load %b_ptr : tensor<32x128x!tt.ptr<f64>, #BL>
+      %b = ttg.convert_layout %b_ : tensor<32x128xf64, #BL> -> tensor<32x128xf64, #B>
+      %c = tt.dot %a, %b, %prev_c : tensor<128x32xf64, #A> * tensor<32x128xf64, #B> -> tensor<128x128xf64, #C>
+      scf.yield %c : tensor<128x128xf64, #C>
+    }
+    tt.return %loop: tensor<128x128xf64, #C>
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.cpp

Lines changed: 11 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "AsyncUtility.h"
 
 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
+#include "TargetInfo.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 namespace mlir::triton::AMD {
@@ -128,4 +129,14 @@ void addLocalLoadNoAliasScope(LLVM::AliasAnalysisOpInterface llLoadOp) {
   llLoadOp.setAliasScopes(aliasScopes);
 }
 
+unsigned
+fitToValidDirectToLdsVecSize(unsigned maxVecSize, unsigned elemBitwidth,
+                             const triton::AMD::TargetInfo &targetInfo) {
+  while (maxVecSize > 0 && !targetInfo.supportsDirectToLdsLoadBitWidth(
+                               maxVecSize * elemBitwidth)) {
+    maxVecSize /= 2;
+  }
+  return maxVecSize;
+}
+
 } // namespace mlir::triton::AMD

third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h

Lines changed: 8 additions & 0 deletions
@@ -7,6 +7,8 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 namespace mlir::triton::AMD {
+class TargetInfo;
+
 // Annotates LocalLoadOps with ttg.amdgpu.syncedByAsyncWait=true if they are
 // synced by an AsyncWait.
 void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod);
@@ -39,6 +41,12 @@ void addLocalLoadNoAliasScope(LLVM::AliasAnalysisOpInterface llLoadOp);
 // Attaches the "AsyncCopies" alias scope to llLoadDirectToLdsOp
 void addAsyncCopyAliasScope(LLVM::AliasAnalysisOpInterface llLoadDirectToLdsOp);
 
+// Finds the largest supported vecSize smaller than maxVecSize. Returns 0 if
+// there is none
+unsigned
+fitToValidDirectToLdsVecSize(unsigned maxVecSize, unsigned elemBitwidth,
+                             const triton::AMD::TargetInfo &targetInfo);
+
 } // namespace mlir::triton::AMD
 
 #endif
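
For reference, here is a condensed, hypothetical view of how the two transforms below consume this helper. The wrapper name `canUseDirectToLds` and the variable names are for illustration only; this is a sketch of the contract, not code from the commit.

```cpp
#include "amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h"
#include "amd/lib/TritonAMDGPUToLLVM/TargetInfo.h"

// Hypothetical wrapper illustrating the contract: a result of 0 means no
// supported direct-to-lds width fits within the contiguous elements owned by
// a thread, so callers must bail out (register pipelining in StreamPipeline,
// a match failure in CoalesceAsyncCopy).
static bool canUseDirectToLds(unsigned vecSize, unsigned elemBitWidth,
                              const mlir::triton::AMD::TargetInfo &targetInfo) {
  return mlir::triton::AMD::fitToValidDirectToLdsVecSize(
             vecSize, elemBitWidth, targetInfo) != 0;
}
```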

third_party/amd/lib/TritonAMDGPUTransforms/CoalesceAsyncCopy.cpp

Lines changed: 5 additions & 13 deletions
@@ -1,5 +1,6 @@
 #include "TritonAMDGPUToLLVM/TargetUtils.h"
 #include "TritonAMDGPUTransforms/Passes.h"
+#include "amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h"
 #include "amd/lib/TritonAMDGPUToLLVM/Utility.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "third_party/amd/include/Analysis/AxisInfoExt.h"
@@ -22,9 +23,8 @@ namespace {
 
 // On gfx9 global and buffer loads directly to shared memory need to write
 // coalesced. This pattern converts the layout of the src, mask and other to
-// ensure the owned data per thread is contigious and does no exceed the
-// supported load vector size. The swizzle pattern is ignored here and is
-// handled when lowering to LLVMIR
+// ensure the owned data per thread is contiguous and does no exceed the
+// supported load vector size.
 struct CoalesceAsyncCopyWrites
     : public OpRewritePattern<ttg::AsyncCopyGlobalToLocalOp> {
   CoalesceAsyncCopyWrites(const triton::AMD::TargetInfo &targetInfo,
@@ -49,12 +49,6 @@ struct CoalesceAsyncCopyWrites
       return rewriter.notifyMatchFailure(copyOp,
                                          "src encoding must be #blocked");
 
-    auto sharedEnc =
-        dyn_cast<ttg::SwizzledSharedEncodingAttr>(dstTy.getEncoding());
-    if (!sharedEnc)
-      return rewriter.notifyMatchFailure(
-          copyOp, "destination encoding must be #SwizzledShared");
-
     // We start from the precomputed contiguity we got from AxisAnalysis.
     unsigned loadContig = 0;
@@ -77,10 +71,8 @@ struct CoalesceAsyncCopyWrites
 
     // Select the largest supported load width equal or smaller than loadContig
     auto elemBitWidth = dstTy.getElementTypeBitWidth();
-    while (loadContig > 0 && !targetInfo.supportsDirectToLdsLoadBitWidth(
-                                 loadContig * elemBitWidth)) {
-      loadContig /= 2;
-    }
+    loadContig =
+        fitToValidDirectToLdsVecSize(loadContig, elemBitWidth, targetInfo);
 
     if (loadContig == 0) {
       return rewriter.notifyMatchFailure(

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 16 additions & 7 deletions
@@ -1,4 +1,5 @@
 #include "TritonAMDGPUTransforms/Passes.h"
+#include "amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h"
 #include "amd/lib/TritonAMDGPUToLLVM/TargetInfo.h"
 #include "third_party/amd/include/Analysis/AxisInfoExt.h"
 #include "triton/Analysis/AxisInfo.h"
@@ -280,7 +281,8 @@ getSharedEncIfAllUsersAreDotEnc(Value loadedValue) {
 
 bool canBeConvertedToAsyncLoad(unsigned numBuffers, tt::LoadOp loadOp,
                                Value alloc,
-                               tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
+                               tt::ModuleAxisInfoAnalysis &axisInfoAnalysis,
+                               const tt::AMD::TargetInfo &targetInfo) {
   // If we have a single buffer we would require another barrier after the
   // local_reads so instead we fall back to pipeline with registers
   // Removing this check will create incorrect IR, see
@@ -289,7 +291,9 @@ bool canBeConvertedToAsyncLoad(unsigned numBuffers, tt::LoadOp loadOp,
     return false;
 
   // Compute the final vecSize we can use for the combination of sourceEncoding
-  // and sharedEncoding. We can only use AsyncCopy if the width is >= 32 bit
+  // and sharedEncoding. We can only use AsyncCopy if the target supports the
+  // requested or a smaller vecSize because we cannot stride when loading
+  // directly to lds
   auto srcTy = cast<RankedTensorType>(loadOp.getPtr().getType());
   auto dstTy = cast<ttg::MemDescType>(alloc.getType());
   auto regLayout = triton::gpu::toLinearLayout(srcTy);
@@ -298,9 +302,11 @@ bool canBeConvertedToAsyncLoad(unsigned numBuffers, tt::LoadOp loadOp,
   auto sharedLayout =
       triton::gpu::toLinearLayout(srcShape, dstTy.getEncoding(), srcShape);
   auto regToSharedLayout = regLayout.invertAndCompose(sharedLayout);
-  unsigned loadContig = regToSharedLayout.getNumConsecutiveInOut();
-  unsigned width = loadContig * dstTy.getElementTypeBitWidth();
-  if (width < 32)
+
+  unsigned vecSize = regToSharedLayout.getNumConsecutiveInOut();
+  unsigned elemBitWidth = dstTy.getElementTypeBitWidth();
+
+  if (fitToValidDirectToLdsVecSize(vecSize, elemBitWidth, targetInfo) == 0)
     return false;
 
   // Checks whether the global pointer's contiguity and mask alignment allows
@@ -354,10 +360,13 @@ createStreamOps(const LoadToInfoMap &loadToInfo, scf::ForOp &forOp,
     Value alloc = triton::createAlloc(forOp, ty, loadOp->getLoc(),
                                       info.sharedEncoding, numBuffers);
     assert(alloc && "Failed to create alloc for the async load.");
+    auto arch = getAMDArch(loadOp->getParentOfType<ModuleOp>());
+    triton::AMD::TargetInfo targetInfo(arch ? arch->str() : "");
 
     // Replace the old load with multi-buffered loads
-    if (useAsyncCopy && canBeConvertedToAsyncLoad(numBuffers, loadOp, alloc,
-                                                  axisInfoAnalysis)) {
+    if (useAsyncCopy &&
+        canBeConvertedToAsyncLoad(numBuffers, loadOp, alloc, axisInfoAnalysis,
+                                  targetInfo)) {
       loadToStreamOp[loadOp] = createAsyncCopy(loadOp, alloc, extractIdx);
     } else {
       loadToStreamOp[loadOp] = createStreamCopy(loadOp, alloc, extractIdx);
