
Commit fbe22ae

Merge commit 'ff5c1e77ce8064501d9f260f5a14de195d74425f'

2 parents: 6f41a1d + ff5c1e7

43 files changed: +999 additions, -989 deletions


Makefile

Lines changed: 2 additions & 1 deletion

@@ -73,8 +73,9 @@ test-interpret: all
 
 .PHONY: test-proton
 test-proton: all
-	$(PYTEST) --tb=short -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s -n 8 third_party/proton/test --ignore=third_party/proton/test/test_override.py -k "not test_overhead"
 	$(PYTEST) --tb=short -s third_party/proton/test/test_override.py
+	$(PYTEST) --tb=short -s third_party/proton/test/test_instrumentation.py::test_overhead
 
 .PHONY: test-python
 test-python: test-unit test-regression test-interpret test-proton
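
Note: the change deselects test_overhead from the parallel run and re-runs it serially afterwards, presumably because an overhead measurement is timing-sensitive under -n 8. A sketch of the same two-phase invocation via pytest's Python API (paths as in the Makefile; the parallel phase requires pytest-xdist):

import pytest

# Phase 1: parallel run, with the overhead test deselected via -k.
pytest.main([
    "--tb=short", "-s", "-n", "8",
    "third_party/proton/test",
    "--ignore=third_party/proton/test/test_override.py",
    "-k", "not test_overhead",
])

# Phase 2: run the deselected test alone, so concurrent workers
# cannot perturb its measurement.
pytest.main([
    "--tb=short", "-s",
    "third_party/proton/test/test_instrumentation.py::test_overhead",
])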

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 24 additions & 5 deletions

@@ -2806,15 +2806,34 @@ struct TritonGPUInferLayoutInterface
         mlir::dyn_cast<triton::gpu::DotOperandEncodingAttr>(operandEncodingB);
     if (!aEncoding && !bEncoding)
       return mlir::success();
-    auto mmaAEncoding =
-        mlir::dyn_cast_or_null<NvidiaMmaEncodingAttr>(aEncoding.getParent());
-    if (mmaAEncoding && mmaAEncoding.isHopper())
-      return success();
-    // Verify that the encodings are valid.
     if (!aEncoding || !bEncoding)
       return op->emitError("mismatching encoding between A and B operands");
+    // Verify that the encodings are valid.
     if (aEncoding.getKWidth() != bEncoding.getKWidth())
       return op->emitError("mismatching kWidth between A and B operands");
+
+    // Check if we have already selected an MMA version for Nvidia. If so,
+    // validate that the encodings are correct and compatible.
+    auto mmaAEncoding =
+        dyn_cast_or_null<NvidiaMmaEncodingAttr>(aEncoding.getParent());
+    auto mmaBEncoding =
+        dyn_cast_or_null<NvidiaMmaEncodingAttr>(bEncoding.getParent());
+    auto dotOp = cast<DotOp>(op);
+    auto resEnc = dotOp.getResult().getType().getEncoding();
+    auto mmaResEncoding = dyn_cast<NvidiaMmaEncodingAttr>(resEnc);
+    if (mmaAEncoding || mmaBEncoding || mmaResEncoding) {
+      // Check that they are all set and have the same version.
+      if (!mmaAEncoding || !mmaBEncoding || !mmaResEncoding)
+        return op->emitError("mismatching MMA encoding");
+      if (mmaAEncoding.getVersionMajor() != mmaBEncoding.getVersionMajor() ||
+          mmaAEncoding.getVersionMajor() != mmaResEncoding.getVersionMajor()) {
+        return op->emitError("mismatched MMA version.");
+      }
+      // Verify that the operands are supported on the selected MMA version.
+      if (!supportMMA(dotOp, mmaResEncoding.getVersionMajor()))
+        return op->emitError("unsupported MMA version");
+    }
     return success();
   }
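
Note: the rule the new verifier enforces is all-or-none with a single agreed version across A, B, and the result. A Python restatement for readability (a sketch; attribute names are simplified, the C++ above is authoritative):

def verify_mma_encodings(mma_a, mma_b, mma_res):
    # Each argument is the NvidiaMmaEncodingAttr attached to A, B, or the
    # result, or None if that value carries no MMA encoding yet.
    if not (mma_a or mma_b or mma_res):
        return  # No MMA version selected; nothing to validate.
    if not (mma_a and mma_b and mma_res):
        raise ValueError("mismatching MMA encoding")  # all-or-none
    if not (mma_a.version_major == mma_b.version_major == mma_res.version_major):
        raise ValueError("mismatched MMA version")
    # The C++ additionally calls supportMMA(dotOp, version) to confirm the
    # operand types are legal on the selected MMA version.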

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 4 additions & 95 deletions

@@ -470,93 +470,6 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   return combineCtaCgaWithShape(tileLayout, getCTALayout(), shape);
 }
 
-std::optional<LinearLayout>
-chooseLLDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
-                       int32_t elemBitWidth, unsigned instBitWidth,
-                       unsigned numLanesInShuffleGroup) {
-  using BaseTy = std::vector<std::vector<int32_t>>;
-  // This function will derive the layout for the ds_read_tr instruction
-  // based on the input layout (LL/DotLayout/...)
-  // The ds_read_tr instruction works on instBitWidth per lane and in groups of
-  // numLanesInShuffleGroup lanes.
-
-  // In this example we look at ds_read_b64_tr (instBitWidth = 64) and
-  // numLanesInShuffleGroup = 16 with 64 lanes per warp. Using M-continuous
-  // 16-bit input tensor A as an example. Each lane will load 4 consecutive
-  // elements (64-bit in total) along M. There are 4 consecutive lanes in total
-  // along M. Then the loaded elements are exchanged within the MxK=16x4 "base
-  // unit".
-  //        K0  K1  K2  K3
-  //       +---+---+---+---+
-  //  M0   |   |   |   |   |   M0,  K[0-3]: T0
-  //  M1   | T | T | T | T |   M1,  K[0-3]: T1
-  //  M2   | 0 | 4 | 8 |12 |   M2,  K[0-3]: T2
-  //  M3   |   |   |   |   |   M3,  K[0-3]: T3
-  //       +---+---+---+---+
-  //  M4   |   |   |   |   |   M4,  K[0-3]: T4
-  //  M5   | T | T | T | T |   M5,  K[0-3]: T5
-  //  M6   | 1 | 5 | 9 |13 |   M6,  K[0-3]: T6
-  //  M7   |   |   |   |   |   M7,  K[0-3]: T7
-  //       +---+---+---+---+        ==>
-  //  M8   |   |   |   |   |   M8,  K[0-3]: T8
-  //  M9   | T | T | T | T |   M9,  K[0-3]: T9
-  //  M10  | 2 | 6 |10 |14 |   M10, K[0-3]: T10
-  //  M11  |   |   |   |   |   M11, K[0-3]: T11
-  //       +---+---+---+---+
-  //  M12  |   |   |   |   |   M12, K[0-3]: T12
-  //  M13  | T | T | T | T |   M13, K[0-3]: T13
-  //  M14  | 3 | 7 |11 |15 |   M14, K[0-3]: T14
-  //  M15  |   |   |   |   |   M15, K[0-3]: T15
-  //       +---+---+---+---+
-
-  // Given the layout represented by `enc` and shape, we can derive the layout
-  // that ds_read_b64_tr need to have in order to perform a vectorized load of
-  // the elements. This can be done by rearranging the inner 4x16 element base
-  // unit in the LL by rearranging the first numReg register bases and the
-  // first numLane lane bases.
-  auto rotatePrefixes = [](BaseTy &regBase, std::size_t numReg,
-                           BaseTy &laneBase, std::size_t numLane) {
-    // Concatenate prefixes of the two vectors. Lane first and then regs.
-    // C D E F | A B
-    // Then copy over numReg to the regBase and numLane to laneBase
-    // C D | E F A B
-    BaseTy baseUnit(laneBase.begin(), laneBase.begin() + numLane);
-    llvm::append_range(
-        baseUnit, llvm::make_range(regBase.begin(), regBase.begin() + numReg));
-
-    std::copy(baseUnit.begin(), baseUnit.begin() + numReg, regBase.begin());
-    std::copy(baseUnit.begin() + numReg, baseUnit.end(), laneBase.begin());
-  };
-
-  auto ctx = enc.getContext();
-  assert(elemBitWidth == 8 || elemBitWidth == 16);
-  // Get how many reg bases and tile bases the ds_read_tr tile spans
-  unsigned numRegBases = llvm::Log2_32(instBitWidth / elemBitWidth);
-  unsigned numLaneBases = llvm::Log2_32(numLanesInShuffleGroup);
-
-  auto ldsTransLayout = triton::gpu::toLinearLayout(shape, enc);
-  auto bases = ldsTransLayout.getBases();
-  auto kRegister = S("register");
-  auto kLane = S("lane");
-
-  // Make sure that we have enough register bases to rotate, otherwise we
-  // can't return a valid ds_read_tr layout
-  if (ldsTransLayout.getInDimSizeLog2(kRegister) < numRegBases) {
-    return std::nullopt;
-  }
-  // We should always have enough lanes
-  assert(ldsTransLayout.getInDimSizeLog2(kLane) >= numLaneBases);
-  rotatePrefixes(bases[kRegister], numRegBases, bases[kLane], numLaneBases);
-  // Scale types double the elements for a total of 16 vgpr (still only 16
-  // elements contiguous). Need to adjust the lane basis to reflect that
-  if (elemBitWidth == 8 && numLanesInShuffleGroup == 8) {
-    assert(ldsTransLayout.getInDimSizeLog2(kLane) >= (numLaneBases + 1));
-    std::swap(bases[kLane][numLaneBases - 1], bases[kLane][numLaneBases]);
-  }
-
-  return LinearLayout(bases, ldsTransLayout.getOutDims(), false);
-}
-
 std::optional<LinearLayout>
 chooseDotDsReadTrLayout(DotOperandEncodingAttr dotMfmaLayout,
                         ArrayRef<int64_t> shape, int32_t elemBitWidth,

@@ -1461,14 +1374,10 @@ std::optional<LinearLayout>
 chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
                      int32_t elemBitWidth, unsigned instBitWidth,
                      unsigned numLanesInShuffleGroup) {
-  if (elemBitWidth == 4) {
-    auto dot = cast<DotOperandEncodingAttr>(enc);
-    return chooseDotDsReadTrLayout(dot, shape, elemBitWidth, instBitWidth,
-                                   numLanesInShuffleGroup);
-  } else {
-    return chooseLLDsReadTrLayout(enc, shape, elemBitWidth, instBitWidth,
-                                  numLanesInShuffleGroup);
-  }
+  assert(elemBitWidth == 4);
+  auto dot = cast<DotOperandEncodingAttr>(enc);
+  return chooseDotDsReadTrLayout(dot, shape, elemBitWidth, instBitWidth,
+                                 numLanesInShuffleGroup);
 }
 
 LinearLayout chooseScaledWmmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
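
Note: the removed rotatePrefixes helper is the core of the deleted path. A standalone Python sketch of what it did, using plain lists for the base vectors (illustrative, not live code):

def rotate_prefixes(reg_base, num_reg, lane_base, num_lane):
    # Concatenate the prefixes, lanes first, then registers:
    #   regs=[A, B], lanes=[C, D, E, F]  ->  base_unit=[C, D, E, F, A, B]
    base_unit = lane_base[:num_lane] + reg_base[:num_reg]
    # Redistribute: first num_reg entries back to regs, the rest to lanes:
    #   regs=[C, D], lanes=[E, F, A, B]
    reg_base[:num_reg] = base_unit[:num_reg]
    lane_base[:num_lane] = base_unit[num_reg:]

regs = [["A"], ["B"]]
lanes = [["C"], ["D"], ["E"], ["F"]]
rotate_prefixes(regs, 2, lanes, 4)
assert regs == [["C"], ["D"]]
assert lanes == [["E"], ["F"], ["A"], ["B"]]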

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 6 additions & 6 deletions

@@ -25,6 +25,7 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/Support/LLVM.h"
+#include "triton/Analysis/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"

@@ -91,13 +92,12 @@ LogicalResult WarpGroupDotOp::verify() {
   if (retShapePerCTA[1] % 8 != 0)
     return emitOpError("WGMMA result N dimension must be divisible by 8");
 
-  auto aElemTy = getA().getType().getElementType();
-  if (!(llvm::isa<Float8E5M2Type, Float8E4M3FNType>(aElemTy) ||
-        aElemTy.isInteger(8) || aElemTy.isF16() || aElemTy.isBF16() ||
-        aElemTy.isF32()))
-    return emitOpError("WGMMA result element type must be F16, BF16, F32, "
-                       "F8E5M2, F8E4M3FN, or integer type");
+  // Verify MMA version is supported for operands.
+  int mmaVersion = nvmmaEnc.getVersionMajor();
+  if (!supportMMA(getA(), mmaVersion) || !supportMMA(getB(), mmaVersion))
+    return emitOpError("unsupported MMA version for the given operands");
 
+  auto aElemTy = getA().getType().getElementType();
   if (getMaxNumImpreciseAcc() < 32 &&
       (llvm::isa<Float8E5M2Type, Float8E4M3FNType>(aElemTy)) &&
       resTy.getElementType().isF32()) {

python/src/gluon_ir.cc

Lines changed: 17 additions & 0 deletions

@@ -105,6 +105,8 @@ struct GluonLayouts {
   py::handle DistributedLinearLayout;
   py::handle DotOperandLayout;
   py::handle NVMMADistributedLayout;
+  py::handle TensorMemoryScalesLayout;
+  py::handle TensorMemoryLayout;
   py::handle NVMMASharedLayout;
   py::handle SwizzledSharedLayout;
   py::handle SharedLinearLayout;

@@ -120,6 +122,8 @@ struct GluonLayouts {
       py::module::import("triton.experimental.gluon.language.amd._layouts");
   auto intelLayouts =
       py::module::import("triton.experimental.gluon.language.intel._layouts");
+  auto blackwellLayouts = py::module::import(
+      "triton.experimental.gluon.language.nvidia.blackwell");
   AutoLayout = py::object(layouts.attr("AutoLayout")).release();
   BlockedLayout = py::object(layouts.attr("BlockedLayout")).release();
   SliceLayout = py::object(layouts.attr("SliceLayout")).release();

@@ -128,6 +132,10 @@ struct GluonLayouts {
   DotOperandLayout = py::object(layouts.attr("DotOperandLayout")).release();
   NVMMADistributedLayout =
       py::object(layouts.attr("NVMMADistributedLayout")).release();
+  TensorMemoryScalesLayout =
+      py::object(blackwellLayouts.attr("TensorMemoryScalesLayout")).release();
+  TensorMemoryLayout =
+      py::object(blackwellLayouts.attr("TensorMemoryLayout")).release();
   NVMMASharedLayout = py::object(layouts.attr("NVMMASharedLayout")).release();
   SwizzledSharedLayout =
       py::object(layouts.attr("SwizzledSharedLayout")).release();

@@ -268,6 +276,15 @@ py::object layoutToGluon(Attribute layout) {
         intelDpas.getExecutionSize(), intelDpas.getOpsPerChannel(),
         toStdVector(intelDpas.getWarpsPerCTA()),
         toStdVector(intelDpas.getRepCluster()), intelDpas.getThreadsPerWarp());
+  } else if (auto tmemScales =
+                 dyn_cast<ttng::TensorMemoryScalesEncodingAttr>(layout)) {
+    return layouts.TensorMemoryScalesLayout(std::vector<unsigned>{
+        tmemScales.getCTASplitM(), tmemScales.getCTASplitN()});
+  } else if (auto tmem = dyn_cast<ttng::TensorMemoryEncodingAttr>(layout)) {
+    return layouts.TensorMemoryLayout(
+        std::vector<unsigned>{tmem.getBlockM(), tmem.getBlockN()},
+        tmem.getColStride(),
+        std::vector<unsigned>{tmem.getCTASplitM(), tmem.getCTASplitN()});
   }
 
   throw py::value_error("Unhandled encoding encountered");
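
Note: with these bindings, layoutToGluon can map tensor-memory encodings back to the Python layout classes. A hypothetical construction showing the positional argument order implied by the C++ calls above (block, col_stride, cta_split_num); the concrete values and the use of tuples are assumptions:

from triton.experimental.gluon.language.nvidia.blackwell import (
    TensorMemoryLayout,
    TensorMemoryScalesLayout,
)

# Argument order mirrors the C++ calls: (block, col_stride, cta_split_num).
tmem = TensorMemoryLayout((128, 128), 1, (1, 1))
# TensorMemoryScalesLayout takes only cta_split_num.
scales = TensorMemoryScalesLayout((1, 1))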

python/test/unit/tools/test_triton_to_gluon.py

Lines changed: 28 additions & 9 deletions

@@ -13,7 +13,7 @@
 
 
 def convert_kernel(kernel, kernel_name, tmp_path):
-    converted = convert_triton_to_gluon(kernel)
+    converted = convert_triton_to_gluon([kernel])
 
     # Write converted kernel to a file so @gluon.jit can retrieve source
     mod_path = tmp_path / "converted_kernel.py"

@@ -52,7 +52,7 @@ def test_simple_kernel(tmp_path):
     ref = torch.empty_like(x)
     add_kernel[grid](x, y, ref, n, BLOCK)
 
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
 
 
 @triton.jit

@@ -85,7 +85,7 @@ def test_triton_to_gluon_dot_minimal(tmp_path):
 
     ref = torch.empty_like(c)
     matmul_tile_kernel[grid](a, b, ref, M, N, K, num_warps=8)
-    torch.testing.assert_close(c, ref)
+    torch.testing.assert_close(c, ref, atol=0, rtol=0)
 
 
 @triton.jit

@@ -153,7 +153,7 @@ def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K,
     ref = torch.empty_like(output)
     matmul_kernel[grid](a, b, ref, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), output.stride(0),
                         output.stride(1), BLOCK_M, BLOCK_N, BLOCK_K)
-    torch.testing.assert_close(output, ref)
+    torch.testing.assert_close(output, ref, atol=0, rtol=0)
 
 
 @triton.jit

@@ -177,7 +177,7 @@ def test_triton_to_gluon_descriptor_roundtrip(tmp_path):
     y_ref = torch.zeros((M, N), device="cuda", dtype=torch.float16)
     desc_ref = TensorDescriptor(y_ref, y_ref.shape, y_ref.stride(), block_shape)
     descriptor_store_kernel[grid](desc_ref, M, N, 1.0)
-    torch.testing.assert_close(y, y_ref)
+    torch.testing.assert_close(y, y_ref, atol=0, rtol=0)
 
 
 @triton.jit

@@ -204,7 +204,7 @@ def test_triton_to_gluon_descriptor_load_roundtrip(tmp_path):
     y_ref = torch.zeros((M, N), device="cuda", dtype=torch.float16)
     desc_ref = TensorDescriptor(y_ref, y_ref.shape, y_ref.stride(), block_shape)
     descriptor_copy_kernel[grid](in_desc, desc_ref, M, N)
-    torch.testing.assert_close(y, y_ref)
+    torch.testing.assert_close(y, y_ref, atol=0, rtol=0)
 
 
 @triton.jit

@@ -232,7 +232,7 @@ def test_triton_reshape_trans(tmp_path):
     kernel[grid](x, y, out, n, BLOCK)
     ref = torch.empty_like(x)
     reshape_trans_kernel[grid](x, y, ref, n, BLOCK)
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
 
 
 BLOCK_SPLIT = tl.constexpr(256)

@@ -262,7 +262,7 @@ def test_split(tmp_path):
     kernel[grid](x, out)
     ref = torch.empty_like(x[:n])
     split_kernel[grid](x, ref)
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
 
 
 @triton.jit

@@ -281,4 +281,23 @@ def test_reduce_to_scalar(tmp_path):
     kernel[grid](out)
     ref = torch.empty_like(out)
     reduce_to_scalar_kernel[grid](ref)
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
+
+
+@triton.jit
+def num_threads_kernel(out_ptr):
+    num_threads: tl.constexpr = tl.extra.cuda.num_threads()
+    offs = tl.arange(0, num_threads)
+    tl.store(out_ptr + offs, 1)
+
+
+@pytest.mark.skipif(not (is_blackwell()), reason="Requires Blackwell")
+def test_num_threads(tmp_path):
+    kernel = convert_kernel(num_threads_kernel, "num_threads_kernel", tmp_path)
+
+    num_threads = 256
+    out = torch.empty(num_threads, dtype=torch.int32, device="cuda")
+    kernel[(1, )](out, num_warps=num_threads // 32)
+    ref = torch.empty_like(out)
+    num_threads_kernel[(1, )](ref, num_warps=num_threads // 32)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 3 additions & 0 deletions

@@ -9,6 +9,9 @@ def _realize_cta_layout(layout, rank):
     ctas_per_cga = layout.ctas_per_cga or [1] * rank
     cta_split_num = layout.cta_split_num or [1] * rank
     cta_order = layout.cta_order or list(reversed(range(rank)))
+    # Canonicalize CTA order to [n,n-1,...,0] if CTAsPerCGA is [1...1]. This matches logic in C++.
+    if all(num_cta == 1 for num_cta in ctas_per_cga):
+        cta_order = list(range(rank - 1, -1, -1))
     object.__setattr__(layout, "ctas_per_cga", ctas_per_cga)
     object.__setattr__(layout, "cta_split_num", cta_split_num)
     object.__setattr__(layout, "cta_order", cta_order)

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 5 additions & 0 deletions

@@ -416,6 +416,11 @@ def _check_same_layout(xs):
         _check(all(l == l0 for l in layouts[1:]),
                lambda: f"Expected inputs to have matching layouts, but got: {layouts}")
 
+    def _store_legacy(self, ptr, val, mask, boundary_check, cache, eviction):
+        if ptr.type.is_block() and not val.type.is_block():
+            val = self.splat(val, ptr.type.get_block_shapes(), ptr.type.layout)
+        return super()._store_legacy(ptr, val, mask, boundary_check, cache, eviction)
+
     def associative_scan(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn,
                          reverse: bool) -> Tuple[TensorTy, ...]:
         shape = inputs[0].type.shape
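
Note: the override makes a scalar stored through a block of pointers splat implicitly to the block's shape and layout. A sketch of the user-visible effect; the kernel, module aliases, and layout argument are illustrative assumptions, not code from this commit:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl

@gluon.jit
def fill_kernel(out_ptr, BLOCK: ttgl.constexpr, layout: ttgl.constexpr):
    offs = ttgl.arange(0, BLOCK, layout=layout)
    # The scalar 1.0 is splatted to the pointers' block shape and layout
    # by the _store_legacy override before the actual store.
    ttgl.store(out_ptr + offs, 1.0)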

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 6 additions & 0 deletions

@@ -68,6 +68,9 @@ def mangle(self) -> str:
         cta_split_str = (f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}" if self.cta_split_num else "")
         return f"TL{block_str}{stride_str}{cta_split_str}TL"
 
+    def __hash__(self):
+        return hash((self.block, self.col_stride, self.cta_split_num))
+
 
 @dataclass(frozen=True, eq=True)
 class TensorMemoryScalesLayout:

@@ -91,6 +94,9 @@ def mangle(self) -> str:
         cta_split_str = f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}" if self.cta_split_num else ""
         return f"TLS{cta_split_str}TLS"
 
+    def __hash__(self):
+        return hash(self.cta_split_num)
+
 
 @constexpr_function
 def get_tmem_reg_layout(
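
Note: the explicit __hash__ methods keep these layout classes usable as dict keys and set members alongside their eq=True equality. A small hypothetical usage (constructor arguments follow the same assumption as the gluon_ir.cc note above):

from triton.experimental.gluon.language.nvidia.blackwell import TensorMemoryLayout

a = TensorMemoryLayout((128, 64), 1, (1, 1))
b = TensorMemoryLayout((128, 64), 1, (1, 1))

# Equal layouts hash identically, so they collide as cache keys.
cache = {a: "compiled"}
assert b in cache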

python/triton/experimental/gluon/language/nvidia/hopper/tma.py

Lines changed: 1 addition & 0 deletions

@@ -116,6 +116,7 @@ def make_tensor_descriptor(
     _semantic=None,
 ) -> tensor_descriptor:
     padding_option = _unwrap_if_constexpr(padding_option)
+    block_shape = _unwrap_if_constexpr(block_shape)
 
     ndim = len(shape)
     if not (1 <= ndim <= 5):
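
Note: the added unwrap matters when make_tensor_descriptor is reached from JIT-ed code, where block_shape can arrive wrapped in constexpr; unwrapping restores the plain list for the shape checks that follow. A minimal sketch of the helper's behavior (a local mirror of _unwrap_if_constexpr, assuming the constexpr wrapper from triton.language.core):

from triton.language.core import constexpr

def unwrap_if_constexpr(v):
    # Same idea as the helper used above: peel a constexpr wrapper if present.
    return v.value if isinstance(v, constexpr) else v

block_shape = constexpr([64, 64])
assert unwrap_if_constexpr(block_shape) == [64, 64]
assert unwrap_if_constexpr([64, 64]) == [64, 64]  # plain values pass through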
