
Commit 85e1eb3

[AMD] Retire local prefetch schedule hint variant (#7395)
This variant came out of earlier experiments. We plan to implement a better approach later.
1 parent: 3b27971

File tree

16 files changed: +17 −525 lines changed

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 0 additions & 5 deletions
@@ -98,11 +98,6 @@ class TargetInfoBase {
   virtual bool supportStMatrix() const { return false; }
   virtual bool isCuda() const { return false; }

-  // Annotate target specific information to local store operations during
-  // lowering to LLVM.
-  virtual void localStoreOpAnnotation(triton::gpu::LocalStoreOp op,
-                                      size_t localStoreOpCount,
-                                      Type type) const {}
   // Annotate target specific information to local load operations during
   // lowering to LLVM. `llLoadOp` is the generated LLVM load op.
   virtual void localLoadOpAnnotation(triton::gpu::LocalLoadOp localLoadOp,

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 6 additions & 5 deletions
@@ -548,11 +548,12 @@ SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
                                            Location loc, RewriterBase &rewriter,
                                            const TargetInfoBase &target);

-void storeDistributedToShared(
-    triton::gpu::MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy,
-    ArrayRef<Value> srcVals, const SharedMemoryObject &smemObj, Location loc,
-    RewriterBase &rewriter, const TargetInfoBase &target,
-    std::pair<size_t, Type> *const llvmOpCount = nullptr);
+void storeDistributedToShared(triton::gpu::MemDescType dstTy,
+                              RankedTensorType srcTy, Type elemLlvmTy,
+                              ArrayRef<Value> srcVals,
+                              const SharedMemoryObject &smemObj, Location loc,
+                              RewriterBase &rewriter,
+                              const TargetInfoBase &target);

 // Close cousin of lowerLdStMatrix in MemoryOpToLLVM.cpp
 // We might want to merge them at some point, but having to support
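
For illustration, a minimal sketch (not part of this commit) of a caller under the new signature. The helper name is hypothetical, and all argument values are assumed to be produced the same way as at the call site in MemoryOpToLLVM.cpp below:

// Hypothetical helper, sketch only: the trailing
// `std::pair<size_t, Type> *llvmOpCount` out-parameter is gone, so callers
// simply drop that argument.
static void emitLocalStore(triton::gpu::MemDescType dstTy,
                           RankedTensorType srcTy, Type elemLlvmTy,
                           ArrayRef<Value> srcVals,
                           const SharedMemoryObject &smemObj, Location loc,
                           RewriterBase &rewriter,
                           const TargetInfoBase &target) {
  storeDistributedToShared(dstTy, srcTy, elemLlvmTy, srcVals, smemObj, loc,
                           rewriter, target);
}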

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 8 additions & 11 deletions
@@ -15,18 +15,19 @@ using namespace mlir::triton::gpu;
 // blocked -> shared.
 // Swizzling in shared memory to avoid bank conflict. Normally used for
 // A/B operands of dots.
-void lowerDistributedToShared(
-    Location loc, Value src, Value dst, Value adaptorSrc,
-    const SharedMemoryObject &smemObj, const LLVMTypeConverter *typeConverter,
-    ConversionPatternRewriter &rewriter, const TargetInfoBase &targetInfo,
-    std::pair<size_t, Type> *const llvmOpCount = nullptr) {
+void lowerDistributedToShared(Location loc, Value src, Value dst,
+                              Value adaptorSrc,
+                              const SharedMemoryObject &smemObj,
+                              const LLVMTypeConverter *typeConverter,
+                              ConversionPatternRewriter &rewriter,
+                              const TargetInfoBase &targetInfo) {
   auto srcTy = cast<RankedTensorType>(src.getType());
   auto dstTy = cast<MemDescType>(dst.getType());
   auto elemTy = typeConverter->convertType(srcTy.getElementType());

   auto inVals = unpackLLElements(loc, adaptorSrc, rewriter);
   storeDistributedToShared(dstTy, srcTy, elemTy, inVals, smemObj, loc, rewriter,
-                           targetInfo, llvmOpCount);
+                           targetInfo);
 }

 LogicalResult lowerLocalStore(Location loc, MLIRContext *ctx, Value regVal,
@@ -245,20 +246,16 @@ struct LocalStoreOpConversion
     auto smemObj = LLVM::getSharedMemoryObjectFromStruct(loc, adaptor.getDst(),
                                                          llvmElemTy, rewriter);
     auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
-    std::pair<size_t, Type> llvmOpCount;
     if (targetInfo.isCuda()) {
       if (failed(lowerLocalStore(loc, ctx, regVal, memDescTy, smemObj, inVals,
                                  typeConverter, rewriter, targetInfo))) {
         return failure();
       }
     } else {
       lowerDistributedToShared(loc, regVal, memDescVal, adaptor.getSrc(),
-                               smemObj, typeConverter, rewriter, targetInfo,
-                               &llvmOpCount);
+                               smemObj, typeConverter, rewriter, targetInfo);
     }

-    targetInfo.localStoreOpAnnotation(op, llvmOpCount.first,
-                                      llvmOpCount.second);
     rewriter.eraseOp(op);
     return success();
   }

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 1 addition & 6 deletions
@@ -791,8 +791,7 @@ void storeDistributedToShared(triton::gpu::MemDescType dstTy,
                               ArrayRef<Value> srcVals,
                               const SharedMemoryObject &smemObj, Location loc,
                               RewriterBase &rewriter,
-                              const TargetInfoBase &target,
-                              std::pair<size_t, Type> *const llvmOpCount) {
+                              const TargetInfoBase &target) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   bool success = emitTransferBetweenRegistersAndShared(
       srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj, loc,
@@ -807,10 +806,6 @@ void storeDistributedToShared(triton::gpu::MemDescType dstTy,
       b.store(vec, vecAddr)
           .setAlignment(vecTy.getNumElements() *
                         elemLlvmTy.getIntOrFloatBitWidth() / 8);
-      if (llvmOpCount) {
-        ++(llvmOpCount->first);
-        llvmOpCount->second = vecTy;
-      }
   });

   if (!success)
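
Reconstructed from the hunk above, the per-vector store callback inside storeDistributedToShared now reduces to the aligned store alone (the lambda parameter names are an assumption; the body is as shown in the diff):

// Sketch of the remaining callback body: each vectorized store carries its
// natural alignment in bytes, and no llvmOpCount bookkeeping is left.
auto perVectorStore = [&](VectorType vecTy, Value vec, Value vecAddr) {
  b.store(vec, vecAddr)
      .setAlignment(vecTy.getNumElements() *
                    elemLlvmTy.getIntOrFloatBitWidth() / 8);
};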

test/TritonGPU/amd/amd-instruction-sched.mlir

Lines changed: 0 additions & 96 deletions
@@ -1,13 +1,7 @@
-// RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=1" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS1
-// RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS2
-// RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -triton-amdgpu-lower-insert-instruction-sched-hints="arch=gfx942 num_stages=2" -debug-only="lower-insert-instruction-sched-hints" -verify-diagnostics 2>&1 | FileCheck %s -check-prefix=USE_LOCAL_PREFETCH_GLOBAL_LOAD
 // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=1" | FileCheck %s -check-prefix=LABELING_PS_1
 // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" | FileCheck %s -check-prefix=LABELING_PS_2

 module {
-  // INSTR_COUNT_NS1-LABEL: @test_dot_op
-  // INSTR_COUNT_NS2-LABEL: @test_dot_op
-  // USE_LOCAL_PREFETCH_GLOBAL_LOAD: @test_dot_op
   // LABELING_PS_1-LABEL: @test_dot_op
   // LABELING_PS_2-LABEL: @test_dot_op
   tt.func @test_dot_op(%lb : index, %ub : index, %step : index,
@@ -40,96 +34,6 @@ module {
     %a = tt.load %a_ptr : tensor<128x32x!tt.ptr<f16>>
     %b = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr<f16>>

-    // INSTR_COUNT_NS1: amdgpu.instruction_sched_hint
-    // INSTR_COUNT_NS1-SAME: isBufferLoadsAEnabled = false
-    // INSTR_COUNT_NS1-SAME: isBufferLoadsBEnabled = false
-    // INSTR_COUNT_NS1-SAME: numDsReadsA = #amdgpu.InstCounter<8, vector<4xf16>>
-    // INSTR_COUNT_NS1-SAME: numDsReadsB = #amdgpu.InstCounter<32, vector<1xf16>>
-    // INSTR_COUNT_NS1-SAME: numDsWritesA = #amdgpu.InstCounter<0, none>
-    // INSTR_COUNT_NS1-SAME: numDsWritesB = #amdgpu.InstCounter<0, none>
-    // INSTR_COUNT_NS1-SAME: numGlobalLoadsA = #amdgpu.InstCounter<4, vector<4xf16>>
-    // INSTR_COUNT_NS1-SAME: numGlobalLoadsB = #amdgpu.InstCounter<4, vector<4xf16>>
-    // INSTR_COUNT_NS1-SAME: numMMAs = #amdgpu.InstCounter<16, tensor<32x32x8xf16>>
-
-    // INSTR_COUNT_NS2: amdgpu.instruction_sched_hint
-    // INSTR_COUNT_NS2-SAME: isBufferLoadsAEnabled = false
-    // INSTR_COUNT_NS2-SAME: isBufferLoadsBEnabled = false
-    // INSTR_COUNT_NS2-SAME: numDsReadsA = #amdgpu.InstCounter<8, vector<4xf16>>
-    // INSTR_COUNT_NS2-SAME: numDsReadsB = #amdgpu.InstCounter<32, vector<1xf16>>
-    // INSTR_COUNT_NS2-SAME: numDsWritesA = #amdgpu.InstCounter<4, vector<4xf16>>
-    // INSTR_COUNT_NS2-SAME: numDsWritesB = #amdgpu.InstCounter<4, vector<4xf16>>
-    // INSTR_COUNT_NS2-SAME: numGlobalLoadsA = #amdgpu.InstCounter<4, vector<4xf16>>
-    // INSTR_COUNT_NS2-SAME: numGlobalLoadsB = #amdgpu.InstCounter<4, vector<4xf16>>
-    // INSTR_COUNT_NS2-SAME: numMMAs = #amdgpu.InstCounter<16, tensor<32x32x8xf16>>
-
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.barrier [[SCHED_GUARD:.+]]
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE:512]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA:8]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ:32]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ:256]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0
-    // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.barrier [[SCHED_GUARD]]
-
-
     // LABELING_PS_1: scf.for
     // LABELING_PS_1: %[[REG0_OP0:.+]] = tt.load {{.*}} {OpIdx = #amdgpu.OpIdx<0>}
     // LABELING_PS_1: %[[REG0_OP1:.+]] = tt.load {{.*}} {OpIdx = #amdgpu.OpIdx<1>}

third_party/amd/backend/compiler.py

Lines changed: 0 additions & 8 deletions
@@ -59,10 +59,6 @@ class HIPOptions:
     #
     # Current experimental scheduling variants:
    #
-    # local-prefetch: implements instruction scheduling similar to the one from the ROCm Composable
-    #                 Kernel library. Note, this variant requires the use of buffer load/store ops
-    #                 and a special software pipelining style - i.e., 1x LDS and 1x register
-    #                 prefetch buffers for each GEMM tile.
     # attention: enables a bunch of optimizations for attention kernels, including:
     #            - iglp 2 and sched.barrier around it
     #            - sink-insts-to-avoid-spills flag to avoid register spills
@@ -237,10 +233,6 @@ def make_ttgir(mod, metadata, options):
     local_prefetch = knobs.amd.local_prefetch
     use_async_copy = knobs.amd.use_async_copy

-    # The `local-prefetch` scheduling variant requires turning on buffer ops.
-    if options.schedule_hint == "local-prefetch":
-        global_prefetch = local_prefetch = 1
-
     amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy)
     if use_async_copy:
         amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 1 addition & 22 deletions
@@ -254,28 +254,7 @@ def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
     interleave for better instruction level parallelism.
   }];

-  let arguments = (ins
-    TritonAMDGPU_SchedHintVariantAttr:$variant,
-    TritonAMDGPU_InstCounter:$numDsReadsA,
-    TritonAMDGPU_InstCounter:$numDsReadsB,
-    TritonAMDGPU_InstCounter:$numDsWritesA,
-    TritonAMDGPU_InstCounter:$numDsWritesB,
-    TritonAMDGPU_InstCounter:$numGlobalLoadsA,
-    TritonAMDGPU_InstCounter:$numGlobalLoadsB,
-    BoolAttr:$isBufferLoadsAEnabled,
-    BoolAttr:$isBufferLoadsBEnabled,
-    TritonAMDGPU_InstCounter:$numMMAs
-  );
-
-  let builders = [
-    OpBuilder<(ins "amdgpu::SchedHint":$variant), [{
-      auto ctx = $_state.getContext();
-      auto noneType = NoneType::get(ctx);
-      auto emptyAttr = amdgpu::InstCounterAttr::get(ctx, 0, noneType);
-      build($_builder, $_state, variant, emptyAttr, emptyAttr, emptyAttr, emptyAttr,
-            emptyAttr, emptyAttr, false, false, emptyAttr);
-    }]>
-  ];
+  let arguments = (ins TritonAMDGPU_SchedHintVariantAttr:$variant);

   let assemblyFormat = [{ attr-dict }];
 }
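
With the counters and the custom builder retired, creating the hint only requires the variant attribute. A hypothetical sketch using a default TableGen-generated builder (the exact generated signature and the attribute's C++ class name are assumptions):

// Sketch only, not from this commit: after this change the op carries a
// single `variant` attribute; the per-instruction counters (numDsReadsA,
// numMMAs, ...) and the buffer-load flags no longer exist.
void insertSchedHint(OpBuilder &builder, Location loc,
                     amdgpu::SchedHintVariantAttr variant) {
  builder.create<amdgpu::InstructionSchedHint>(loc, variant);
}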

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp

Lines changed: 0 additions & 4 deletions
@@ -400,10 +400,6 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,

   for (auto op : tensor.getUsers()) {
     if (auto localLoadOp = llvm::dyn_cast<triton::gpu::LocalLoadOp>(op)) {
-      const size_t numDsReadsCount =
-          repB * numRepNonK * numRepK * loadsPerThread;
-      setNumGeneratedDsReads(localLoadOp, numDsReadsCount, loadVecTy);
-
       for (auto llLoad : llLoads) {
         AMD::addLocalLoadNoAliasScope(localLoadOp, llLoad);
       }

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp

Lines changed: 0 additions & 8 deletions
@@ -249,14 +249,6 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
     }
   }

-  for (auto op : tensor.getUsers()) {
-    if (auto localLoadOp = llvm::dyn_cast<triton::gpu::LocalLoadOp>(op)) {
-      const size_t numDsReadsCount =
-          repB * numRepNonK * numRepK * loadsPerThread;
-      setNumGeneratedDsReads(localLoadOp, numDsReadsCount, loadVecTy);
-    }
-  }
-
   MLIRContext *ctx = wmmaLayout.getContext();
   Type structTy = LLVM::LLVMStructType::getLiteral(
       ctx, SmallVector<Type>(loadedValues.size(), loadedValues[0].getType()));

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 0 additions & 4 deletions
@@ -237,10 +237,6 @@ struct DotOpMFMAConversionHelper {
         ctx, SmallVector<Type>(fc.size(), dstElemTy));
     Value res = packLLElements(loc, typeConverter, fc, rewriter, structTy);

-    setNumGeneratedMMAs(op, mmaCount, maybeMfmaIntrinsic->mDim,
-                        maybeMfmaIntrinsic->nDim, maybeMfmaIntrinsic->kDim,
-                        elemtTy);
-
     rewriter.replaceOp(op, res);
   }
