Commit 14a85e7
[BE] Fix for colliding tmem allocation boundaries (#6318)
It may happen that two tmem allocations share the same liverange end boundary (for example, when both liveranges end at a block boundary). This case was not handled properly in the tmem allocation pass, causing tmem overallocation.
1 parent 5ce3754 commit 14a85e7
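
The diff below replaces std::map with std::multimap for the container keyed by live-range end point. With std::map, two allocations whose live ranges end at the same point collide on the key: the second insertion overwrites the first chunk, which is then never returned to the free list, so its space cannot be reused. A minimal C++ sketch of that difference, using a hypothetical Chunk struct in place of the pass's TMemChunk:

#include <cassert>
#include <map>

// Hypothetical stand-in for the pass's TMemChunk bookkeeping.
struct Chunk {
  int startRow;
  int startCol;
};

int main() {
  // Two chunks whose live ranges both end at point 42 (e.g. a block boundary).
  std::map<int, Chunk> byEnd;
  byEnd[42] = Chunk{0, 0};
  byEnd[42] = Chunk{0, 64};              // overwrites the first chunk; it is never freed
  assert(byEnd.size() == 1);

  std::multimap<int, Chunk> byEndMulti;
  byEndMulti.insert({42, Chunk{0, 0}});
  byEndMulti.insert({42, Chunk{0, 64}}); // both chunks are kept and can be freed later
  assert(byEndMulti.size() == 2);
  return 0;
}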

File tree

2 files changed (+48, -4 lines)

lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp

Lines changed: 3 additions & 3 deletions
@@ -142,7 +142,7 @@ static Interval<int> getLiveIntervals(Value value, Liveness &liveness,
 }
 
 static void updateMap(MemoryBitMap &memoryMap, Interval<int> liveInterval,
-                      std::map<int, TMemChunk> &intervalLiverangeEnd) {
+                      std::multimap<int, TMemChunk> &intervalLiverangeEnd) {
   int start = liveInterval.start();
   // Add any dead liverange to the list of free intervals.
   for (auto it = intervalLiverangeEnd.begin();
@@ -247,7 +247,7 @@ allocateTMem(Operation *parentOp,
   int totalMemorySize = 0;
   MemoryBitMap memoryMap;
   Liveness liveness(parentOp);
-  std::map<int, TMemChunk> intervalLiverangeEnd;
+  std::multimap<int, TMemChunk> intervalLiverangeEnd;
   DenseMap<TMEMAllocOp, TMemChunk> allocChunks;
   // Implement a linear scan first fit algorithm. We expect that fragmentation
   // won't be a problem, if it is this should be revisited.
@@ -283,7 +283,7 @@ allocateTMem(Operation *parentOp,
   allocChunks.insert({alloc, chunkAllocated});
   // currently naively constraint allocs based on the first one we find.
   rowIdConstraints.addConstraints(alloc, chunkAllocated.startRow);
-  intervalLiverangeEnd[liveInterval.end()] = chunkAllocated;
+  intervalLiverangeEnd.insert({liveInterval.end(), chunkAllocated});
   int colOffset = chunkAllocated.startCol;
   int rowOffset = chunkAllocated.startRow * 16;
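
For context on how these entries are consumed: the loop in updateMap walks intervalLiverangeEnd from the smallest end point and frees every chunk whose live range ended before the new interval starts; with a multimap, several chunks sharing the same end point are all released. A simplified, hypothetical sketch of that pattern (releaseDeadChunks and the plain free list are illustrative, not the pass's actual MemoryBitMap interface):

#include <map>
#include <vector>

// Hypothetical chunk descriptor; the real pass uses TMemChunk.
struct Chunk {
  int startRow;
  int startCol;
};

// Illustrative "free dead live ranges" step: release every chunk whose live
// range ended before `start`. Because the container is a multimap, multiple
// chunks sharing the same end point are all released, not just one.
static void releaseDeadChunks(std::multimap<int, Chunk> &intervalLiverangeEnd,
                              int start, std::vector<Chunk> &freeChunks) {
  for (auto it = intervalLiverangeEnd.begin();
       it != intervalLiverangeEnd.end();) {
    if (it->first >= start)
      break;                            // keys are ordered; later entries are still live
    freeChunks.push_back(it->second);   // return the chunk to the free pool
    it = intervalLiverangeEnd.erase(it);
  }
}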

test/TritonNvidiaGPU/test_tensor_memory_allocation.mlir

Lines changed: 45 additions & 1 deletion
@@ -62,7 +62,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 #tmem1 = #ttng.tensor_memory_encoding<blockM = 64, blockN = 128, unpacked = true>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65536 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
 // CHECK: ttg.tensor_memory_size = 512
-// CHECK: alloc_tensor_memory
+// CHECK: alloc_tensor_memory_re_use
 tt.func public @alloc_tensor_memory_re_use(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>) {
   %true = arith.constant true
   %c1 = arith.constant 1 : i32
@@ -113,6 +113,50 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 64, unpacked = true>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65536 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+// CHECK: ttg.tensor_memory_size = 128
+// CHECK: alloc_tensor_memory_re_use_liverange_end_collision
+tt.func public @alloc_tensor_memory_re_use_liverange_end_collision(
+    %arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>,
+    %lb: index, %ub: index, %step: index) {
+  %true = arith.constant true
+  %c1 = arith.constant 1 : i32
+  %c0 = arith.constant 0 : i32
+  %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked>
+  %cst0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked>
+  %cst1 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked>
+  %cst2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked>
+
+  // CHECK: ttng.tmem_alloc %{{.+}} {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32}
+  %a = ttng.tmem_alloc %cst0 : (tensor<128x64xf32, #blocked>) -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+
+  // CHECK: ttng.tmem_alloc %{{.+}} {tensor_memory_col_offset = 64 : i32, tensor_memory_row_offset = 0 : i32}
+  %b = ttng.tmem_alloc %cst : (tensor<128x64xf32, #blocked>) -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+
+  scf.for %i = %lb to %ub step %step {
+    ttng.tmem_store %cst2, %a, %true : tensor<128x64xf32, #blocked> -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+    ttng.tmem_store %cst2, %b, %true : tensor<128x64xf32, #blocked> -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+    scf.yield
+  }
+  // Liveranges of both allocations end at the same time, at the boundary of the loop. Make sure we can handle this case.
+
+  // CHECK: ttng.tmem_alloc %{{.+}} {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32}
+  %c = ttng.tmem_alloc %cst0 : (tensor<128x64xf32, #blocked>) -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+
+  // CHECK: ttng.tmem_alloc %{{.+}} {tensor_memory_col_offset = 64 : i32, tensor_memory_row_offset = 0 : i32}
+  %d = ttng.tmem_alloc %cst : (tensor<128x64xf32, #blocked>) -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+
+  ttng.tmem_store %cst2, %c, %true : tensor<128x64xf32, #blocked> -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+  ttng.tmem_store %cst2, %d, %true : tensor<128x64xf32, #blocked> -> !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory, mutable>
+
+  tt.return
+}
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [2, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true, CTASplitM = 2>
 #tmem1 = #ttng.tensor_memory_encoding<blockM = 128, blockN = 64, unpacked = true, CTASplitN = 2>
