
Commit 6c196eb

Merge commit '882a02e806af3d156843a272b8501ac666fe4d02'
2 parents: 9b25ea1 + 882a02e

21 files changed: +198 -27 lines

lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp

Lines changed: 3 additions & 3 deletions

@@ -142,7 +142,7 @@ static Interval<int> getLiveIntervals(Value value, Liveness &liveness,
 }
 
 static void updateMap(MemoryBitMap &memoryMap, Interval<int> liveInterval,
-                      std::map<int, TMemChunk> &intervalLiverangeEnd) {
+                      std::multimap<int, TMemChunk> &intervalLiverangeEnd) {
   int start = liveInterval.start();
   // Add any dead liverange to the list of free intervals.
   for (auto it = intervalLiverangeEnd.begin();
@@ -247,7 +247,7 @@ allocateTMem(Operation *parentOp,
   int totalMemorySize = 0;
   MemoryBitMap memoryMap;
   Liveness liveness(parentOp);
-  std::map<int, TMemChunk> intervalLiverangeEnd;
+  std::multimap<int, TMemChunk> intervalLiverangeEnd;
   DenseMap<TMEMAllocOp, TMemChunk> allocChunks;
   // Implement a linear scan first fit algorithm. We expect that fragmentation
   // won't be a problem, if it is this should be revisited.
@@ -283,7 +283,7 @@ allocateTMem(Operation *parentOp,
     allocChunks.insert({alloc, chunkAllocated});
     // currently naively constraint allocs based on the first one we find.
     rowIdConstraints.addConstraints(alloc, chunkAllocated.startRow);
-    intervalLiverangeEnd[liveInterval.end()] = chunkAllocated;
+    intervalLiverangeEnd.insert({liveInterval.end(), chunkAllocated});
     int colOffset = chunkAllocated.startCol;
     int rowOffset = chunkAllocated.startRow * 16;

test/TritonGPU/amd/accelerate-amd-matmul-chain-dot.mlir

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=16' | FileCheck %s --check-prefixes MFMA16,CHECK
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32' | FileCheck %s --check-prefixes MFMA32,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16" | FileCheck %s --check-prefixes MFMA16,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32" | FileCheck %s --check-prefixes MFMA32,CHECK
 
 // Check the warpsPerCTA parameter of #mma layout of the two dot's.
 // The 1st dot always has warpsPerCTA = [4, 1].
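
This and all of the remaining test updates below are the same mechanical fix:
pass options in RUN lines switch from single quotes to double quotes. The
commit does not state a motivation, but a plausible one (an assumption, not
confirmed by the patch) is shell portability: cmd.exe on Windows does not treat
single quotes as quoting characters, so a single-quoted option string
containing a space is split into separate arguments, while double quotes group
the string on both POSIX shells and cmd.exe. A hypothetical illustration, not
from the patch:

  --some-pass='opt-a=1 opt-b=2'   under cmd.exe this splits into the two
                                  arguments --some-pass='opt-a=1 and opt-b=2'
  --some-pass="opt-a=1 opt-b=2"   stays a single argument on both shells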

test/TritonGPU/amd/accelerate-amd-matmul-fma.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942" | FileCheck %s
 
 // CHECK: fma_dot_fp16_fp16
 // CHECK: %[[D:.*]] = tt.dot {{.*}} : tensor<2x64xf16, {{.*}}> * tensor<64x64xf16, {{.*}}> -> tensor<2x64xf16, {{.*}}>

test/TritonGPU/amd/accelerate-amd-matmul-mfma-gfx950.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx950 matrix-instruction-size=0' | FileCheck %s --check-prefixes CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx950 matrix-instruction-size=0" | FileCheck %s --check-prefixes CHECK
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>

test/TritonGPU/amd/accelerate-amd-matmul-mfma.mlir

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=0' | FileCheck %s --check-prefixes MFMA0,CHECK
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=16' | FileCheck %s --check-prefixes MFMA16,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=0" | FileCheck %s --check-prefixes MFMA0,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16" | FileCheck %s --check-prefixes MFMA16,CHECK
 
 #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 4], order = [1, 0]}>
 // CHECK-LABEL: mfma_dot_fp8e5m2

test/TritonGPU/amd/accelerate-amd-matmul-wmma-gen1.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx1100 matrix-instruction-size=0' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx1100 matrix-instruction-size=0" | FileCheck %s
 
 // CHECK: #[[DOT_OP_PARENT:.+]] = #ttg.blocked<{{.*}}>
 // CHECK: #[[WMMA_0:.+]] = #ttg.amd_wmma<{version = 1, isTranspose = false, warpsPerCTA = [1, 4]}>

test/TritonGPU/amd/accelerate-amd-matmul-wmma-gen2.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx1200 matrix-instruction-size=0' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx1200 matrix-instruction-size=0" | FileCheck %s
 
 // CHECK: #[[DOT_OP_PARENT:.+]] = #ttg.blocked<{{.*}}>
 // CHECK: #[[WMMA_0:.+]] = #ttg.amd_wmma<{version = 2, isTranspose = false, warpsPerCTA = [1, 4]}>

test/TritonGPU/amd/amd-block-pingpong.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong='num-stages=2' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=2" | FileCheck %s
 
 //CHECK-LABEL: pingpong_small
 //CHECK: ttg.local_load

test/TritonGPU/amd/amd-conditional-barrier.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s
+// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s
 
 module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
   tt.func @conditional_barrier() {

test/TritonGPU/amd/amd-convert-buffer-ops-range-analysis.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
-// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect --tritonamdgpu-convert-buffer-ops='arch-generation-name=gfx942' | FileCheck %s
+// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx942" | FileCheck %s
 
 // CHECK-LABEL: tt.func @conversion1(
 // CHECK-SAME: %[[VAL_0:.*]]: !tt.ptr<f32>) -> tensor<1024xf32> {
