
Commit 6c196eb

Merge commit '882a02e806af3d156843a272b8501ac666fe4d02'
2 parents: 9b25ea1 + 882a02e

21 files changed: +198 -27 lines

lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp

Lines changed: 3 additions & 3 deletions

@@ -142,7 +142,7 @@ static Interval<int> getLiveIntervals(Value value, Liveness &liveness,
 }
 
 static void updateMap(MemoryBitMap &memoryMap, Interval<int> liveInterval,
-                      std::map<int, TMemChunk> &intervalLiverangeEnd) {
+                      std::multimap<int, TMemChunk> &intervalLiverangeEnd) {
   int start = liveInterval.start();
   // Add any dead liverange to the list of free intervals.
   for (auto it = intervalLiverangeEnd.begin();
@@ -247,7 +247,7 @@ allocateTMem(Operation *parentOp,
   int totalMemorySize = 0;
   MemoryBitMap memoryMap;
   Liveness liveness(parentOp);
-  std::map<int, TMemChunk> intervalLiverangeEnd;
+  std::multimap<int, TMemChunk> intervalLiverangeEnd;
   DenseMap<TMEMAllocOp, TMemChunk> allocChunks;
   // Implement a linear scan first fit algorithm. We expect that fragmentation
   // won't be a problem, if it is this should be revisited.
@@ -283,7 +283,7 @@ allocateTMem(Operation *parentOp,
     allocChunks.insert({alloc, chunkAllocated});
     // currently naively constraint allocs based on the first one we find.
     rowIdConstraints.addConstraints(alloc, chunkAllocated.startRow);
-    intervalLiverangeEnd[liveInterval.end()] = chunkAllocated;
+    intervalLiverangeEnd.insert({liveInterval.end(), chunkAllocated});
     int colOffset = chunkAllocated.startCol;
     int rowOffset = chunkAllocated.startRow * 16;

test/TritonGPU/amd/accelerate-amd-matmul-chain-dot.mlir

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=16' | FileCheck %s --check-prefixes MFMA16,CHECK
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32' | FileCheck %s --check-prefixes MFMA32,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16" | FileCheck %s --check-prefixes MFMA16,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32" | FileCheck %s --check-prefixes MFMA32,CHECK
 
 // Check the warpsPerCTA parameter of #mma layout of the two dot's.
 // The 1st dot always has warpsPerCTA = [4, 1].
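
This and all of the remaining test updates below are the same mechanical fix:
pass options in RUN lines switch from single quotes to double quotes. The
commit does not state a motivation, but a plausible one (an assumption, not
confirmed by the patch) is shell portability: cmd.exe on Windows does not treat
single quotes as quoting characters, so a single-quoted option string
containing a space is split into separate arguments, while double quotes group
the string on both POSIX shells and cmd.exe. A hypothetical illustration, not
from the patch:

  --some-pass='opt-a=1 opt-b=2'   under cmd.exe this splits into the two
                                  arguments --some-pass='opt-a=1 and opt-b=2'
  --some-pass="opt-a=1 opt-b=2"   stays a single argument on both shells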

test/TritonGPU/amd/accelerate-amd-matmul-fma.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942" | FileCheck %s
 
 // CHECK: fma_dot_fp16_fp16
 // CHECK: %[[D:.*]] = tt.dot {{.*}} : tensor<2x64xf16, {{.*}}> * tensor<64x64xf16, {{.*}}> -> tensor<2x64xf16, {{.*}}>

test/TritonGPU/amd/accelerate-amd-matmul-mfma-gfx950.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx950 matrix-instruction-size=0' | FileCheck %s --check-prefixes CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx950 matrix-instruction-size=0" | FileCheck %s --check-prefixes CHECK
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>

test/TritonGPU/amd/accelerate-amd-matmul-mfma.mlir

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=0' | FileCheck %s --check-prefixes MFMA0,CHECK
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=16' | FileCheck %s --check-prefixes MFMA16,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=0" | FileCheck %s --check-prefixes MFMA0,CHECK
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16" | FileCheck %s --check-prefixes MFMA16,CHECK
 
 #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 4], order = [1, 0]}>
 // CHECK-LABEL: mfma_dot_fp8e5m2

test/TritonGPU/amd/accelerate-amd-matmul-wmma-gen1.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx1100 matrix-instruction-size=0' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx1100 matrix-instruction-size=0" | FileCheck %s
 
 // CHECK: #[[DOT_OP_PARENT:.+]] = #ttg.blocked<{{.*}}>
 // CHECK: #[[WMMA_0:.+]] = #ttg.amd_wmma<{version = 1, isTranspose = false, warpsPerCTA = [1, 4]}>

test/TritonGPU/amd/accelerate-amd-matmul-wmma-gen2.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul='arch-generation-name=gfx1200 matrix-instruction-size=0' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-accelerate-matmul="arch-generation-name=gfx1200 matrix-instruction-size=0" | FileCheck %s
 
 // CHECK: #[[DOT_OP_PARENT:.+]] = #ttg.blocked<{{.*}}>
 // CHECK: #[[WMMA_0:.+]] = #ttg.amd_wmma<{version = 2, isTranspose = false, warpsPerCTA = [1, 4]}>

test/TritonGPU/amd/amd-block-pingpong.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong='num-stages=2' | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=2" | FileCheck %s
 
 //CHECK-LABEL: pingpong_small
 //CHECK: ttg.local_load

test/TritonGPU/amd/amd-conditional-barrier.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s
+// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s
 
 module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
   tt.func @conditional_barrier() {

test/TritonGPU/amd/amd-convert-buffer-ops-range-analysis.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
-// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect --tritonamdgpu-convert-buffer-ops='arch-generation-name=gfx942' | FileCheck %s
+// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect --tritonamdgpu-convert-buffer-ops="arch-generation-name=gfx942" | FileCheck %s
 
 // CHECK-LABEL: tt.func @conversion1(
 // CHECK-SAME: %[[VAL_0:.*]]: !tt.ptr<f32>) -> tensor<1024xf32> {
