Commit 98d1896

Merge commit '9b750186115b04267de6bc10d38476557bad0a53'
2 parents d59f085 + 9b75018 commit 98d1896

31 files changed: +1038 additions, -255 deletions


lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 14 additions & 0 deletions
@@ -179,11 +179,25 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
             "ttg.total-num-warps"))
       numWarps = totalNumWarps.getInt();
 
+    int numCTAs = 1;
+    if (auto module = funcOp->getParentOfType<ModuleOp>()) {
+      if (auto moduleAttr =
+              module->getAttrOfType<IntegerAttr>(triton::gpu::AttrNumCTAsName))
+        numCTAs = moduleAttr.getInt();
+    }
+
     // Set `nvvm.maxnreg` if it was specified on the module.
     if (Attribute maxnregAttr =
             funcOp.getParentOp()->getAttr(triton::gpu::AttrMaxRegistersName))
       newFuncOp->setAttr(NVVM::NVVMDialect::getMaxnregAttrName(), maxnregAttr);
 
+    // Do we want to do this for nCTAs == 1 whenever sm >= 90?
+    if (numCTAs > 1) {
+      // Request a specific number of CTAs per cluster in the generated PTX.
+      newFuncOp->setAttr(NVVM::NVVMDialect::getClusterDimAttrName(),
+                         rewriter.getDenseI32ArrayAttr(numCTAs));
+    }
+
     // Set an attribute for reqntidx, it could be used in latter LLVM codegen
     // for `nvvm.annotation` metadata.
     newFuncOp->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),
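Note: with this change the cluster shape reaches the NVVM level through the `ttg.num-ctas` module attribute rather than the Python-side `cluster_dims` metadata removed from compiler.py below. The following Python sketch is only illustrative of where that attribute comes from: a standard Triton launch requesting CTA clusters via `num_ctas`. The kernel and sizes are made up, and it assumes an sm_90+ GPU where `num_ctas > 1` is accepted.

import torch
import triton
import triton.language as tl


@triton.jit
def copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK: tl.constexpr):
    # Plain elementwise copy; the launch below is the interesting part.
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)


src = torch.randn(1 << 20, device="cuda")
dst = torch.empty_like(src)
grid = (triton.cdiv(src.numel(), 1024), )
# num_ctas=2 becomes the "ttg.num-ctas" attribute on the TTGIR module;
# FuncOpConversion then attaches nvvm.cluster_dim so the cluster size is
# encoded in the generated PTX.
copy_kernel[grid](src, dst, src.numel(), BLOCK=1024, num_ctas=2)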

python/test/gluon/test_frontend.py

Lines changed: 15 additions & 0 deletions
@@ -3006,6 +3006,12 @@ def print_num_warps():
     print("num_warps", num_warps)
 
 
+@gluon.jit
+def print_num_ctas():
+    num_ctas: ttgl.constexpr = ttgl.num_ctas()
+    print("num_ctas", num_ctas)
+
+
 @filecheck_test
 @gluon.jit
 def test_get_num_warps():
@@ -3030,6 +3036,15 @@ def test_get_num_warps():
     ], [1, 2, 8], [24, 24, 24])
 
 
+@filecheck_test
+@gluon.jit
+def test_num_ctas():
+    # CHECK-LABEL: test_num_ctas
+    # CHECK: tt.func private @{{.*}}print_num_ctas
+    # CHECK-NEXT: arith.constant 1 : i32
+    print_num_ctas()
+
+
 def test_mismatch_shape_and_layout_rank():
 
     @gluon.jit

python/triton/compiler/compiler.py

Lines changed: 0 additions & 13 deletions
@@ -294,18 +294,6 @@ def compile(src, target=None, options=None, _env_vars=None):
 
     metadata["cache_dir"] = fn_cache_manager.cache_dir
     metadata["triton_version"] = __version__
-    cluster_dims = getattr(options, "cluster_dims", None)
-    if cluster_dims is None:
-        num_ctas = getattr(options, "num_ctas", None)
-        if num_ctas is None:
-            num_ctas = 1
-        cluster_dims = (num_ctas, 1, 1)
-    if not isinstance(cluster_dims, (list, tuple)):
-        cluster_dims = (cluster_dims, )
-    cluster_dims = tuple(cluster_dims)
-    if len(cluster_dims) < 3:
-        cluster_dims = cluster_dims + (1, ) * (3 - len(cluster_dims))
-    metadata["cluster_dims"] = cluster_dims
     # run compilation pipeline and populate metadata
     stages = dict()
     backend.add_stages(stages, options, src.language)
@@ -432,7 +420,6 @@ def __init__(self, src, metadata_group, hash):
         from collections import namedtuple
         metadata_path = next((Path(p) for c, p in metadata_group.items() if c.endswith(".json")))
         metadata = json.loads(metadata_path.read_text())
-        metadata['cluster_dims'] = tuple(metadata['cluster_dims'])
         # JSON serialization dumps the target as a dict. Restore it to a GPUTarget.
         target = metadata['target']
         metadata['target'] = GPUTarget(target['backend'], target['arch'], target['warp_size'])

python/triton/experimental/gluon/language/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@
     full,
     gather,
     num_warps,
+    num_ctas,
     histogram,
     inline_asm_elementwise,
     join,

python/triton/experimental/gluon/language/_core.py

Lines changed: 9 additions & 0 deletions
@@ -74,6 +74,7 @@
     "static_range",
     "tuple",
    "tuple_type",
+    "num_ctas",
 ]
 
 T = TypeVar("T")
@@ -525,6 +526,14 @@ def num_warps(_semantic=None, _generator=None):
     return _semantic.num_warps(_generator)
 
 
+@builtin
+def num_ctas(_semantic=None):
+    """
+    Returns the number of CTAs in the current kernel
+    """
+    return _semantic.num_ctas()
+
+
 @builtin
 def thread_barrier(_semantic=None):
     """

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 3 additions & 0 deletions
@@ -551,6 +551,9 @@ def warp_specialize(self, functions_and_args, worker_num_warps: Sequence[int], w
             return
         return tuple(unflatten_ir_values(mlir_results, [r.type for r in default_results]))
 
+    def num_ctas(self):
+        return ttgl.constexpr(self.builder.options.num_ctas)
+
     def num_warps(self, generator):
         if generator.caller_context is not None:
             assert isinstance(generator.caller_context, GluonCallerContext)
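Taken together with the `__init__.py` and `_core.py` changes above, the new builtin is exposed as `ttgl.num_ctas()` and folds to a `ttgl.constexpr` taken from the compile options, mirroring `ttgl.num_warps()`. A minimal illustrative sketch of using it from a Gluon kernel (the kernel name is made up; imports follow the usual Gluon test style):

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def report_launch_shape():
    # Both values are compile-time constants resolved from the kernel's
    # compile options, not read from a register at runtime.
    num_warps: ttgl.constexpr = ttgl.num_warps()
    num_ctas: ttgl.constexpr = ttgl.num_ctas()
    print("num_warps", num_warps)
    print("num_ctas", num_ctas)

Because the result is a constexpr, it can be used to specialize code paths per cluster size at compile time with no runtime overhead.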

test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 25 additions & 0 deletions
@@ -2645,3 +2645,28 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+// We had a bug where DotOp lowering treated any input where shape[1] == 1 as an
+// outer product and rejected it. This was incorrect in 3D tensors, since
+// the dimension to look at would have been shape[2].
+
+#mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [32, 1, 1], instrShape = [1, 16, 8]}>
+#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>
+#dot_operand_b = #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: batched_dot_3d
+  tt.func public @batched_dot_3d(
+    %arg0: tensor<32x1x32xf16, #dot_operand_a>,
+    %arg1: tensor<32x32x32xf16, #dot_operand_b>
+  ) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x1x32xf32, #mma>
+    // CHECK: llvm.inline_asm
+    // CHECK-SAME: mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32
+    %result = tt.dot %arg0, %arg1, %cst, inputPrecision = tf32 :
+      tensor<32x1x32xf16, #dot_operand_a> * tensor<32x32x32xf16, #dot_operand_b> -> tensor<32x1x32xf32, #mma>
+    tt.return
+  }
+}

test/Conversion/tritongpu_to_llvm_hopper.mlir

Lines changed: 30 additions & 0 deletions
@@ -1,5 +1,17 @@
 // RUN: triton-opt %s -split-input-file --allocate-shared-memory-nv='compute-capability=90 ptx-version=81' --convert-triton-gpu-to-llvm='compute-capability=90 ptx-version=81' | FileCheck %s
 
+module attributes {"ttg.num-ctas" = 4 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK-LABEL: @test_cluster_attr
+  // CHECK: nvvm.cluster_dim = array<i32: 4>
+  // CHECK: nvvm.kernel = 1 : ui1
+  // CHECK: nvvm.reqntid = array<i32: 128>
+  tt.func @test_cluster_attr(%lb : index, %A : !tt.ptr<f16>) {
+    tt.return
+  }
+}
+
+// -----
+
 #mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 256, 32]}>
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = false, elementBitWidth = 8}>
 #shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 32, transposed = true, elementBitWidth = 8}>
@@ -74,6 +86,24 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 
 // -----
 
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [16, 2], instrShape = [16, 256, 16]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: @warp_group_dot_bf16_32_warps
+  tt.func @warp_group_dot_bf16_32_warps(
+    %a: !ttg.memdesc<256x128xbf16, #shared, #smem>,
+    %b: !ttg.memdesc<128x512xbf16, #shared, #smem>,
+    %acc: tensor<256x512xf32, #mma>) {
+    %res = ttng.warp_group_dot %a, %b, %acc {inputPrecision = 0 : i32, isAsync = true} :
+      !ttg.memdesc<256x128xbf16, #shared, #smem> * !ttg.memdesc<128x512xbf16, #shared, #smem> -> tensor<256x512xf32, #mma>
+    // CHECK: nvgpu.wgmma {{.*}} k = 16 : i32, layoutA = 1 : i32, layoutB = 1 : i32, m = 64 : i32, n = 256 : i32}
+    tt.return
+  }
+}
+
+// -----
+
 #mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 64, 16]}>
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
 #shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>

test/NVWS/aref-tmem-insertion.mlir

Lines changed: 6 additions & 4 deletions
@@ -558,15 +558,17 @@ module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
     // CHECK-NEXT: aref.create
    // CHECK-NEXT: aref.put.enter
     %result, %token = ttng.tmem_alloc : () -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
-    scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 : i32 {
-      %0 = ttg.local_alloc %arg1 : (tensor<128x64xf16, #blocked1>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
+    %5 = scf.for %arg3 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg4 = %token) -> (!ttg.async.token) : i32 {
+      %0 = ttg.local_alloc %arg1 {ttg.partition = array<i32: 0>} : (tensor<128x64xf16, #blocked1>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
       %1 = tt.descriptor_load %arg2[%arg3, %arg3] {ttg.partition = array<i32: 2>} : !tt.tensordesc<tensor<64x128xf16, #shared>> -> tensor<64x128xf16, #blocked1>
       %2 = arith.addf %1, %1 {ttg.partition = array<i32: 0>} : tensor<64x128xf16, #blocked1>
       %3 = ttg.local_alloc %2 {ttg.partition = array<i32: 0>} : (tensor<64x128xf16, #blocked1>) -> !ttg.memdesc<64x128xf16, #shared, #smem>
       // CHECK: aref.buffer
-      %4 = ttng.tc_gen5_mma %0, %3, %result[%token], %true, %true {ttg.partition = array<i32: 1>} : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
-    } {tt.num_stages = 2 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32], ttg.warp_specialize.tag = 18 : i32}
+      %4 = ttng.tc_gen5_mma %0, %3, %result[%arg4], %true, %true {ttg.partition = array<i32: 1>} : !ttg.memdesc<128x64xf16, #shared, #smem>, !ttg.memdesc<64x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+      scf.yield %4 : !ttg.async.token
+    } {ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 1>], tt.num_stages = 2 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32], ttg.warp_specialize.tag = 18 : i32}
     // CHECK: aref.put.exit
+    ttng.tmem_load %result[%5] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
     tt.return
   }

test/NVWS/assign_stage_phase.mlir

Lines changed: 52 additions & 0 deletions
@@ -674,3 +674,55 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
+module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
+  // CHECK-LABEL: @for_loop_control_operand_ppg
+  tt.func @for_loop_control_operand_ppg(%lb: i32, %ub: i32, %step: i32, %ptr0: !tt.ptr<i32>) {
+    %true = arith.constant true
+    %arefBuf = ttng.tmem_alloc : () -> !ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+    %aref = nvws.aref.create %arefBuf : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>
+    %_0, %tok = nvws.aref.put.enter %aref : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+    // CHECK: put.enter
+    // CHECK-NEXT: [[RET:%.*]]:5 = scf.for
+    %tok0 = scf.for %iv0 = %lb to %ub step %step iter_args(%tok1 = %tok) -> (!ttg.async.token) : i32 {
+      // CHECK-NEXT: tt.addptr {{.*}} {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: tt.load {{.*}} {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: "lb1"({{.*}}) {ttg.partition = array<i32: 0, 1, 2>}
+      // CHECK-NEXT: "step1"({{.*}}) {ttg.partition = array<i32: 0, 1, 2>}
+      %ptrub = tt.addptr %ptr0, %iv0 {ttg.partition = array<i32: 1, 2>} : !tt.ptr<i32>, i32
+      %ub1 = tt.load %ptrub {ttg.partition = array<i32: 1, 2>} : !tt.ptr<i32>
+      %lb1 = "lb1"(%iv0) {ttg.partition = array<i32: 1, 2>} : (i32) -> i32
+      %step1 = "step1"(%iv0) {ttg.partition = array<i32: 1, 2>} : (i32) -> i32
+      // CHECK-NEXT: [[RET1:%.*]]:3 = scf.for
+      %tok5 = scf.for %iv = %lb1 to %ub1 step %step1 iter_args(%tok2 = %tok1) -> (!ttg.async.token) : i32 {
+        %sA = "load1"(%iv) {ttg.partition = array<i32: 1>} : (i32) -> !ttg.memdesc<128x64xf32, #shared, #smem>
+        %sB = "load2"(%iv) {ttg.partition = array<i32: 1>} : (i32) -> !ttg.memdesc<64x128xf32, #shared, #smem>
+        %buf = nvws.aref.buffer %aref, %tok2 {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+        ttng.tc_gen5_mma %sA, %sB, %buf, %true, %true {ttg.partition = array<i32: 2>} : !ttg.memdesc<128x64xf32, #shared, #smem>, !ttg.memdesc<64x128xf32, #shared, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+        scf.yield {ttg.partition = array<i32: 1, 2>} %tok2 : !ttg.async.token
+      } {ttg.partition = array<i32: 1, 2>, ttg.partition.outputs = [array<i32: 2>]}
+      // CHECK: scf.yield
+      // CHECK-NEXT: {ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 2>, array<i32: 0, 2>, array<i32: 2>]}
+      // CHECK-NEXT: nvws.aref.put.exit {{.*}}[[[RET1]]#1]
+      nvws.aref.put.exit %aref, %tok5 [#nvws.async_op<tc5mma>] {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+      %_1, %token_2 = nvws.aref.get.enter %aref {ttg.partition = array<i32: 1>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+      nvws.aref.get.exit %aref, %token_2 [#nvws.async_op<none>] {ttg.partition = array<i32: 1>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+      %buf1, %tok6 = nvws.aref.put.enter %aref {ttg.partition = array<i32: 2>} : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+      // CHECK: aref.put.enter
+      // CHECK-NEXT: scf.yield
+      scf.yield {ttg.partition = array<i32: 1, 2>} %tok6 : !ttg.async.token
+      // CHECK-NEXT: {tt.warp_specialize, ttg.partition = array<i32: 0, 1, 2>, ttg.partition.outputs = [array<i32: 2>, array<i32: 0, 2>, array<i32: 2>, array<i32: 0, 1>, array<i32: 0, 1>]}
+    } {tt.warp_specialize, ttg.partition = array<i32: 1, 2>, ttg.partition.outputs = [array<i32: 2>]}
+    // CHECK-NEXT: aref.put.exit {{.*}}[[[RET]]#1]
+    nvws.aref.put.exit %aref, %tok0 [#nvws.async_op<tc5mma>] : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+    %_2, %token_2 = nvws.aref.get.enter %aref : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+    nvws.aref.get.exit %aref, %token_2 [#nvws.async_op<none>] : <[!ttg.memdesc<1x128x128xf32, #tmem, #ttng.tensor_memory, mutable>]>, !ttg.async.token
+    tt.return
+  }
+}
