
Commit 22b5b2c

Merge OpenAI Triton commit b116579 (#5575)
This PR changes the Triton base from fa5f79a to b116579 (Nov 17). Pass rate: 95.41% -> 95.42%.
2 parents 3fc0945 + 06517f5 · commit 22b5b2c

File tree

33 files changed: +2646 / -180 lines

.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ jobs:
   integration-tests-amd:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 45
-    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' }}
+    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' || matrix.runner[0] == 'gfx950' }}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
     auto funcTy = funcOp.getFunctionType();
     auto amendedInputTy = llvm::to_vector<4>(funcTy.getInputs());
     bool isKernel = triton::isKernel(funcOp);
-    if (isKernel) {
+    if (isKernel && targetInfo.isCuda()) {
       for (auto i : llvm::seq(amendedInputTy.size())) {
         if (isa<TensorDescType>(amendedInputTy[i])) {
           funcOp.setArgAttr(i, "tt.nv_tma_desc",

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 3 additions & 2 deletions
@@ -887,9 +887,10 @@ LogicalResult getConvertBackwardSlice(
     queue.pop_back();
     if (!isa<RankedTensorType>(currentValue.getType()))
       continue;
-    // Skip propagating through for op results for now.
+    // Skip propagating through for op/while op results for now.
     // TODO: enable this based on needs.
-    if (currentValue.getDefiningOp<scf::ForOp>())
+    if (currentValue.getDefiningOp<scf::ForOp>() ||
+        currentValue.getDefiningOp<scf::WhileOp>())
       return failure();
     if (failed(updateLayout(currentValue, encoding)))
       return failure();

python/src/ir.cc

Lines changed: 35 additions & 12 deletions
@@ -212,19 +212,42 @@ py::list getTensorDescMetadata(ModuleOp &mod) {
 
     auto blockType = descTy.getBlockType();
     auto encoding = blockType.getEncoding();
-    auto mmaEncoding = dyn_cast<ttg::NVMMASharedEncodingAttr>(encoding);
-    auto swizzle = ttng::getTMASwizzleMode(nullptr, descTy);
-    auto elemType = ttng::getTMAElementType(nullptr, descTy);
-    assert(swizzle.has_value());
-    assert(elemType.has_value());
-    auto blockSize = ttng::getTMABlockShape(blockType, /*packedSize=*/false);
+
     py::dict metadata;
-    metadata["swizzle"] = *swizzle;
-    metadata["elem_size"] = descTy.getBlockType().getElementTypeBitWidth() / 8;
-    metadata["elem_type"] = *elemType;
-    metadata["block_size"] =
-        std::vector<int>(blockSize.begin(), blockSize.end());
-    metadata["fp4_padded"] = mmaEncoding && mmaEncoding.getFp4Padded();
+    if (isa<ttg::NVMMASharedEncodingAttr>(encoding)) {
+      auto mmaEncoding = dyn_cast<ttg::NVMMASharedEncodingAttr>(encoding);
+      auto swizzle = ttng::getTMASwizzleMode(nullptr, descTy);
+      auto elemType = ttng::getTMAElementType(nullptr, descTy);
+      assert(swizzle.has_value());
+      assert(elemType.has_value());
+      auto blockSize = ttng::getTMABlockShape(blockType, /*packedSize=*/false);
+      metadata["swizzle"] = *swizzle;
+      metadata["elem_size"] =
+          descTy.getBlockType().getElementTypeBitWidth() / 8;
+      metadata["elem_type"] = *elemType;
+      metadata["block_size"] =
+          std::vector<int>(blockSize.begin(), blockSize.end());
+      metadata["fp4_padded"] = mmaEncoding && mmaEncoding.getFp4Padded();
+    } else {
+      auto blockShape = blockType.getShape();
+      metadata["block_size"] =
+          std::vector<int>(blockShape.begin(), blockShape.end());
+      metadata["elem_bits"] = blockType.getElementTypeBitWidth();
+
+      if (auto paddedEnc = dyn_cast<ttg::PaddedSharedEncodingAttr>(encoding)) {
+        py::list intervalPaddingPairs;
+        for (auto [interval, padding] : llvm::zip_equal(
+                 paddedEnc.getIntervals(), paddedEnc.getPaddings())) {
+          py::list pair;
+          pair.append(interval);
+          pair.append(padding);
+          intervalPaddingPairs.append(pair);
+        }
+        metadata["interval_padding_pairs"] = intervalPaddingPairs;
+
+        auto blockShape = blockType.getShape();
+      }
+    }
     result.append(std::move(metadata));
   }
   return result;
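For orientation, here is a rough Python sketch of the two metadata shapes the updated getTensorDescMetadata can now emit. Only the dictionary keys come from the diff above; the values are made-up examples.

# Hypothetical example values; only the keys are taken from the ir.cc diff above.
nvmma_descriptor_metadata = {      # NVMMASharedEncodingAttr (TMA) path
    "swizzle": 3,                  # *swizzle
    "elem_size": 2,                # element size in bytes
    "elem_type": 6,                # *elemType
    "block_size": [64, 64],
    "fp4_padded": False,
}
other_descriptor_metadata = {      # non-NVMMA path, e.g. a padded shared encoding
    "block_size": [16, 64],
    "elem_bits": 16,
    # emitted only when the encoding is a PaddedSharedEncodingAttr:
    "interval_padding_pairs": [[32, 4]],
}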

python/src/specialize.cc

Lines changed: 18 additions & 6 deletions
@@ -39,7 +39,8 @@ static bool init_called = false;
 static PyObject *constexpr_cls = nullptr;
 static PyObject *jit_callable_cls = nullptr;
 static PyObject *tensor_descriptor_cls = nullptr;
-static PyObject *gluon_tensor_descriptor_cls = nullptr;
+static PyObject *nvidia_tensor_descriptor_cls = nullptr;
+static PyObject *amd_tensor_descriptor_cls = nullptr;
 static PyObject *canonicalize_dtype_fn = nullptr;
 static PyObject *canonicalize_ptr_dtype_fn = nullptr;
 static PyObject *torch_tensor_cls = nullptr;
@@ -123,8 +124,10 @@ bool init_globals() noexcept try {
   jit_callable_cls = import_from("triton.runtime.jit", "JITCallable");
   tensor_descriptor_cls =
       import_from("triton.tools.tensor_descriptor", "TensorDescriptor");
-  gluon_tensor_descriptor_cls = import_from(
+  nvidia_tensor_descriptor_cls = import_from(
       "triton.experimental.gluon.nvidia.hopper", "TensorDescriptor");
+  amd_tensor_descriptor_cls =
+      import_from("triton.experimental.gluon.amd.gfx1250", "TensorDescriptor");
 
   auto m_canonicalize = py::module_::import("triton._utils");
   canonicalize_dtype_fn = import_from("triton._utils", "canonicalize_dtype");
@@ -442,9 +445,13 @@ void init_type_handler_cache() {
         handle_tensor_descriptor;
   }
   // GluonTensorDescriptor
-  if (gluon_tensor_descriptor_cls &&
-      PyType_Check(gluon_tensor_descriptor_cls)) {
-    type_handler_cache[(PyTypeObject *)gluon_tensor_descriptor_cls] =
+  if (nvidia_tensor_descriptor_cls &&
+      PyType_Check(nvidia_tensor_descriptor_cls)) {
+    type_handler_cache[(PyTypeObject *)nvidia_tensor_descriptor_cls] =
+        handle_gluon_tensor_descriptor;
+  }
+  if (amd_tensor_descriptor_cls && PyType_Check(amd_tensor_descriptor_cls)) {
+    type_handler_cache[(PyTypeObject *)amd_tensor_descriptor_cls] =
         handle_gluon_tensor_descriptor;
   }
   // constexpr
@@ -491,7 +498,12 @@ std::pair<py::object, py::object> specialize_arg(PyObject *backend,
                                      align);
   }
 
-  if (PyObject_IsInstance(arg, gluon_tensor_descriptor_cls)) {
+  if (PyObject_IsInstance(arg, nvidia_tensor_descriptor_cls)) {
+    return handle_gluon_tensor_descriptor(backend, arg, is_const,
+                                          specialize_value, align);
+  }
+
+  if (PyObject_IsInstance(arg, amd_tensor_descriptor_cls)) {
     return handle_gluon_tensor_descriptor(backend, arg, is_const,
                                           specialize_value, align);
   }
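A minimal usage sketch of the two descriptor classes the specializer now recognizes. The import paths come from the init_globals() change above; everything else is illustrative. Both classes are dispatched to the same handle_gluon_tensor_descriptor handler, so either can be passed directly as a @gluon.jit kernel argument.

# Import paths as registered in init_globals() above; the AMD path assumes a
# Triton build that ships the gfx1250 Gluon module.
from triton.experimental.gluon.nvidia.hopper import TensorDescriptor as NvTensorDescriptor
from triton.experimental.gluon.amd.gfx1250 import TensorDescriptor as AmdTensorDescriptor

# Either descriptor type goes through the same specialization path when used
# as a kernel argument, e.g. kernel[grid](desc, ...).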

python/test/gluon/test_core.py

Lines changed: 2 additions & 2 deletions
@@ -1583,7 +1583,7 @@ def kernel(out_ptr, M: ttgl.constexpr, N: ttgl.constexpr, K: ttgl.constexpr, a,
     assert "ttng.tc_gen5_mma_scaled" in ttgir
 
 
-@pytest.mark.skipif(not is_ampere_or_newer(), reason="Requires Ampere or newer")
+@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere or newer", run=False)
 def test_coalesced_layout():
 
     @gluon.jit
@@ -1628,7 +1628,7 @@ def kernel(in_ptr, out_ptr, #
     torch.testing.assert_close(output, ref)
 
 
-@pytest.mark.skipif(not is_ampere_or_newer(), reason="Requires Ampere or newer")
+@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere or newer", run=False)
 def test_convert_auto_layout_to_coalesced_layout():
 
     @gluon.jit
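The marker change above swaps skipif for xfail with run=False: with skipif the tests are reported as skipped, while xfail(..., run=False) reports them as expected failures without executing the body at all. A small illustrative snippet (not part of the diff):

import pytest

# Illustrative only: when the condition is true, pytest reports this test as
# XFAIL and never executes the body because run=False.
@pytest.mark.xfail(True, reason="Requires Ampere or newer", run=False)
def test_never_executed_on_old_hardware():
    raise RuntimeError("unreachable while the xfail condition holds")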

python/test/gluon/test_frontend.py

Lines changed: 36 additions & 0 deletions
@@ -3140,6 +3140,42 @@ def test_amd_tdm_load(target):
     """)
 
 
+@gluon.jit
+def amd_host_tdm_load_kernel(desc):
+    buffer = ttgl.allocate_shared_memory(desc.dtype, shape=desc.block_shape, layout=desc.layout)
+    ttgl.amd.gfx1250.tdm.async_load(desc, offsets=[0, 2], dest=buffer)
+
+    ttgl.amd.gfx1250.tdm.async_wait(0)
+    buffer.load(layout=ttgl.BlockedLayout([1, 8], [4, 8], [4, 1], [1, 0]))
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_GFX1250])
+def test_amd_host_tdm_load(target):
+
+    ptr = MockTensor(ttgl.float16, shape=(32, 128))
+    layout = ttgl.PaddedSharedLayout.with_identity_for([[32, 4]], [16, 64], [1, 0])
+    desc = gluon.amd.gfx1250.TensorDescriptor.from_tensor(ptr, block_shape=(16, 64), layout=layout)
+    module = run_parser(amd_host_tdm_load_kernel, *make_args(desc), target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.padded_shared<[32:+4] {order = [1, 0], shape = [16, 64]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @amd_host_tdm_load_kernel(%arg0: !tt.tensordesc<tensor<16x64xf16, #shared>>, %arg1: i32, %arg2: i32, %arg3: i64, %arg4: i64) attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable>
+    %c0_i32 = arith.constant 0 : i32
+    %c2_i32 = arith.constant 2 : i32
+    %true = arith.constant true
+    %1 = amdg.async_tdm_copy_global_to_local %arg0[%c0_i32, %c2_i32] into %0, %true : !tt.tensordesc<tensor<16x64xf16, #shared>> -> !ttg.memdesc<16x64xf16, #shared, #smem, mutable>
+    %2 = amdg.async_tdm_wait {num = 0 : i32}
+    %3 = ttg.local_load %0 : !ttg.memdesc<16x64xf16, #shared, #smem, mutable> -> tensor<16x64xf16, #blocked>
+    tt.return
+  }
+}
+""")
+
+
 @gluon.jit
 def amd_tdm_store_kernel(ptr):
     SHARED_LAYOUT: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, [1, 0])

python/test/unit/tools/test_aot.py

Lines changed: 32 additions & 0 deletions
@@ -66,6 +66,26 @@ def kernel(C, A, B, M, N, K,
     tl.store(c_ptrs, c)
 """
 
+gluon_kernel_src = """
+from triton.experimental import gluon
+from triton.experimental.gluon import language as gl
+
+@gluon.jit
+def kernel(
+    C, A, B, M, N, K,
+    stride_cm, stride_cn,
+    stride_am, stride_ak,
+    stride_bk, stride_bn,
+    BLOCK_M: gl.constexpr,
+    BLOCK_N: gl.constexpr,
+    BLOCK_K: gl.constexpr
+):
+    layout: gl.constexpr = gl.BlockedLayout(size_per_thread=[1], threads_per_warp=[64], warps_per_cta=[1], order=[0])
+    offs = gl.arange(0, 64, layout=layout)
+    a = gl.load(A + offs)
+    gl.store(B + offs, a)
+"""
+
 test_utils_src = """
 #include <cuda.h>
 #include <stdio.h>
@@ -672,3 +692,15 @@ def test_ttgir_to_spv():
     assert "OpCapability Kernel" in spv
     assert "LocalSize 128 1 1" in spv
     assert "SubgroupSize 32" in spv
+
+
+def test_gluon_kernel():
+    if not is_hip():
+        pytest.xfail("Gluon kernel is only supported on HIP")
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        dtype = "fp16"
+        BM, BN, BK = 16, 16, 16
+
+        kernel_path = write_triton_kernels(tmp_dir, gluon_kernel_src, kernel_utils_src)
+        compile_aot_kernel_no_specialization(tmp_dir, kernel_path, dtype, BM, BN, BK)
+        check_hasco_binary_str(tmp_dir, dtype)
Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 from . import nvidia
+from . import amd
 from ._runtime import constexpr_function, jit
 from triton.language.core import must_use_result
 
-__all__ = ["constexpr_function", "jit", "must_use_result", "nvidia"]
+__all__ = ["constexpr_function", "jit", "must_use_result", "nvidia", "amd"]
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from . import gfx1250
+
+__all__ = ["gfx1250"]
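Taken together, the package re-export changes above make the new AMD namespace reachable from the top-level Gluon package. A minimal sketch, assuming a Triton build that includes these modules:

# Minimal sketch: the re-exports above expose gluon.amd alongside gluon.nvidia.
from triton.experimental import gluon

desc_cls = gluon.amd.gfx1250.TensorDescriptor  # same class used in test_frontend.py above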
