Test support for tensor_descriptors in gluon, initial load/store/prefetch operations

mieshkiwrk · mieshkiwrk · commit acb526d4de39 · 2025-12-01T07:58:02.000Z
diff --git a/python/src/gluon_ir.cc b/python/src/gluon_ir.cc
@@ -31,6 +31,7 @@ namespace ttg = triton::gpu;
 namespace ttng = triton::nvidia_gpu;
 namespace gluon = mlir::triton::gluon;
 namespace ttag = mlir::triton::amdgpu;
+namespace ttgi = mlir::triton::gpu::intel;
 
 // Helper to check if an MLIR type or attribute has a verifier method.
 template <typename AttrOrType>
@@ -897,7 +898,72 @@ void init_gluon_ir(py::module &&m) {
       .def("create_lds_barrier_arrive",
            [](GluonOpBuilder &self, Value memDesc, int count) -> Value {
              return self.create<ttag::ArriveBarrierOp>(memDesc, count);
-           });
+           })
+      // Prefetches the block addressed by `tensorDesc` at `offsets`.
+      // Temporary bridge: ttgi::PrefetchOp is fed a tt.make_tensor_ptr built
+      // here, so the descriptor must come straight from a MakeTensorDescOp;
+      // std::runtime_error is thrown otherwise. Returns the created prefetch
+      // operation so the caller can set attributes on it.
+      .def("create_prefetch",
+           [](GluonOpBuilder &self, Value tensorDesc, std::vector<Value> &offsets,
+              bool isVolatile) {
+             auto descOp = tensorDesc.getDefiningOp<triton::MakeTensorDescOp>();
+             if (!descOp) {
+               throw std::runtime_error("Expected tensor descriptor from MakeTensorDescOp");
+             }
+
+             // tt.make_tensor_ptr requires i64 shape operands, while the
+             // descriptor carries i32 ones; without the sign-extension:
+             //   error: 'tt.make_tensor_ptr' op operand #1 must be
+             //   variadic of 64-bit signless integer, but got 'i32'
+             auto i64Type = self.getBuilder().getI64Type();
+             SmallVector<Value> i64Shape;
+             for (Value dim : descOp.getShape()) {
+               i64Shape.push_back(self.create<arith::ExtSIOp>(i64Type, dim));
+             }
+
+             // The block (tile) shape comes from the descriptor's type;
+             // MakeTensorPtrOp takes it as int32 values.
+             auto descType = cast<triton::TensorDescType>(tensorDesc.getType());
+             auto blockType = cast<RankedTensorType>(descType.getBlockType());
+             std::vector<int32_t> blockShape;
+             blockShape.reserve(blockType.getRank());
+             for (int64_t dim : blockType.getShape()) {
+               blockShape.push_back(static_cast<int32_t>(dim));
+             }
+
+             // Row-major order for every rank, fastest-varying dimension first:
+             // {rank-1, ..., 1, 0}, which is {1, 0} in the 2D case.
+             // NOTE(review): assumes descriptors of any rank are row-major.
+             std::vector<int32_t> order;
+             order.reserve(blockShape.size());
+             for (size_t i = blockShape.size(); i > 0; --i) {
+               order.push_back(static_cast<int32_t>(i - 1));
+             }
+
+             auto tensorPtrOp = self.create<mlir::triton::MakeTensorPtrOp>(
+                 descOp.getBase(), /*shape*/ i64Shape, descOp.getStrides(),
+                 offsets, /*tensor_shape*/ blockShape, order);
+
+             // No mask; cache modifier and eviction policy are fixed for now.
+             auto op = self.create<ttgi::PrefetchOp>(
+                 /*base*/ tensorPtrOp.getResult(), /*mask*/ Value(),
+                 tt::CacheModifier::NONE, tt::EvictionPolicy::NORMAL, isVolatile);
+             return op.getOperation();
+           })
+      // Example for passing block_ptr
+      // .def("create_prefetch",
+      //      [](GluonOpBuilder &self, Value ptr, //, py::object mask,
+      //         //triton::CacheModifier cache, triton::EvictionPolicy evict,
+      //         bool isVolatile) {
+      //        //auto c = triton::CacheModifier();
+      //        //Value maskVal = mask.is_none() ? Value() : mask.cast<Value>();
+      //        Value maskVal = Value();
+      //
+      //        self.create<ttgi::PrefetchOp>(
+      //            ptr, maskVal, tt::CacheModifier::NONE, tt::EvictionPolicy::NORMAL, isVolatile);
+      //      })
+      ;
 
   m.def(
       "compute_tmem_reg_layout",
diff --git a/python/src/ir.cc b/python/src/ir.cc
@@ -648,7 +648,11 @@ void init_triton_ir(py::module &&m) {
              if (!ret)
                return py::none();
              return py::str(ret.getValue().str());
-           });
+           })
+      // Sets attribute `name` on the operation; exposed so Python code can
+      // tag ops it has just built (e.g. with "ttig.block_io").
+      .def("set_attr", [](Operation &self, const std::string &name,
+                          Attribute &attr) { self.setAttr(name, attr); });
 
   // dynamic_attr is used to transfer ownership of the MLIR context to the
   // module
@@ -1530,8 +1534,9 @@ void init_triton_ir(py::module &&m) {
            })
       .def("create_descriptor_store",
            [](TritonOpBuilder &self, Value desc, Value value,
-              std::vector<Value> &indices) -> void {
-             self.create<DescriptorStoreOp>(desc, value, indices);
+              std::vector<Value> &indices) -> Operation * {
+             // Return the created op so Python callers can set attributes on it.
+             return self.create<DescriptorStoreOp>(desc, value, indices).getOperation();
            })
       .def("create_descriptor_reduce",
            [](TritonOpBuilder &self, DescriptorReduceKind kind, Value desc,
diff --git a/python/triton/experimental/gluon/language/__init__.py b/python/triton/experimental/gluon/language/__init__.py
@@ -50,7 +50,6 @@
     device_assert,
     device_print,
     dot_fma,
-    xpu_dot_fma,
     expand_dims,
     full,
     fp4_to_fp,
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -590,9 +590,3 @@ def dot_fma(a, b, acc, _semantic=None):
 
     handle = _semantic.dot(a, b, acc, input_precision=None, max_num_imprecise_acc=None, out_dtype=acc.dtype).handle
     return tensor(handle, acc.type)
-
-
-@builtin
-def xpu_dot_fma(a, b, acc, _semantic=None):
-    handle = _semantic.dot(a, b, acc, input_precision=None, max_num_imprecise_acc=None, out_dtype=acc.dtype).handle
-    return tensor(handle, acc.type)
diff --git a/python/triton/experimental/gluon/language/intel/xpu/xe.py b/python/triton/experimental/gluon/language/intel/xpu/xe.py
@@ -0,0 +1,202 @@
+from __future__ import annotations
+
+from typing import List, Tuple, Sequence
+from dataclasses import dataclass
+import triton.language.core as tl_core
+
+import triton.experimental.gluon.language._core as ttgl
+from triton.experimental.gluon.language._layouts import DotOperandLayout
+from triton.experimental.gluon.language.intel._layouts import IntelDPASLayout
+from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
+from triton.language.core import ir, constexpr, tensor_descriptor_base, block_type, tensor, tuple
+
+
+# load_tensor_descriptor = builtin(tl_core.load_tensor_descriptor)
+# store_tensor_descriptor = builtin(tl_core.store_tensor_descriptor)
+
+
+__all__ = ["make_tensor_descriptor", "dot_fma"]
+
+
+
+class tensor_descriptor(tensor_descriptor_base):
+    """A descriptor representing a tensor in global memory.
+
+    Besides the IR handle it keeps the global `shape` and `strides` (as tuples
+    of tensors) and the layout given at construction.
+    """
+
+    def __init__(self, handle, shape: List[tensor], strides: List[tensor], block_type: block_type, layout):
+        """Not called by user code."""
+        # IR handle
+        super().__init__(handle, block_type)
+        # Global shape and strides
+        self.shape = tuple(shape)
+        self.strides = tuple(strides)
+        self.layout = layout
+
+        self.type = tensor_descriptor_type(
+            block_type,
+            shape_type=self.shape.type,
+            strides_type=self.strides.type,
+            layout=self.layout,
+        )
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        """Append the descriptor handle, then the shape and stride handles."""
+        handles.append(self.handle)
+        self.shape._flatten_ir(handles)
+        self.strides._flatten_ir(handles)
+
+    # TODO: MaterializeBlockPointers.cpp
+    # Add 2d_block_io parameter + validation to set proper attribute
+    # Validation: (?)
+    #   > 2 dims
+    #   > stride 16 bytes aligned
+    #   and others
+    @builtin
+    def load(self, offsets: Sequence[constexpr | tensor], is_2d_block=False, _semantic=None) -> tensor:
+        """Load the block at `offsets`; `is_2d_block` sets `ttig.block_io` to "row_major"."""
+        op = _semantic.descriptor_load(self, offsets, "", "")
+
+        if is_2d_block:
+            # TODO: proper handling like below test example
+            # Option to set row/column major and other params
+            attr = _semantic.builder.get_string_attr("row_major")
+            op.handle.set_attr("ttig.block_io", attr)
+
+        return op
+
+    @builtin
+    def store(self, offsets: Sequence[constexpr | tensor], value: tensor, is_2d_block=False, _semantic=None) -> tensor:
+        """Store `value` to the block at `offsets`; `is_2d_block` sets `ttig.block_io` to "row_major"."""
+        op = _semantic.descriptor_store(self, value, offsets)
+
+        if is_2d_block:
+            attr = _semantic.builder.get_string_attr("row_major")
+            op.handle.set_attr("ttig.block_io", attr)
+
+        return op
+
+    @builtin
+    def prefetch(self, offsets: Sequence[constexpr | tensor], mask=None, cache=None, evict=None, is_volatile=False, is_2d_block=False, _semantic=None):
+        """Prefetch the block at `offsets`; `mask`, `cache` and `evict` are not forwarded yet."""
+        # TODO: forward mask/cache/evict to ttig.prefetch. For the str -> enum
+        # conversion see python/triton/language/semantic.py, TritonSemantic.load:
+        #   cache = self._str_to_load_cache_modifier(cache_modifier)
+        #   eviction = self._str_to_eviction_policy(eviction_policy)
+
+        # Go through the semantic helper so int/constexpr offsets are turned
+        # into IR values rather than handed to the builder as-is.
+        offset_handles = _semantic._convert_to_ir_values(offsets, require_i64=False)
+        op = _semantic.builder.create_prefetch(self.handle, offset_handles, _unwrap_if_constexpr(is_volatile))
+
+        if is_2d_block:
+            attr = _semantic.builder.get_string_attr("row_major")
+            op.set_attr("ttig.block_io", attr)
+
+        return op
+
+
+
+@dataclass(eq=True)
+class tensor_descriptor_type(ttgl.base_type):
+    """Type of a `tensor_descriptor`: block type, shape/strides tuple types and layout."""
+
+    block_type: ttgl.block_type
+    shape_type: ttgl.tuple_type
+    strides_type: ttgl.tuple_type
+    layout: IntelDPASLayout
+
+    def __str__(self) -> str:
+        return f"tensor_descriptor<{self.block_type}, {self.layout}>"
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor, int]:
+        # Handle order mirrors tensor_descriptor._flatten_ir: descriptor, shape, strides.
+        desc_handle = handles[cursor]
+        shape, cursor = self.shape_type._unflatten_ir(handles, cursor + 1)
+        strides, cursor = self.strides_type._unflatten_ir(handles, cursor)
+        return tensor_descriptor(desc_handle, shape, strides, self.block_type, self.layout), cursor
+
+    def _to_ir(self, builder: ir.builder) -> ir.type:
+        """Build the IR descriptor type from the block type, element signedness and layout."""
+        elem_signed = self.block_type.element_ty.is_int_signed()
+        block_ir = self.block_type.to_ir(builder)
+        layout_ir = self.layout._to_ir(builder)
+        return builder.get_tensor_descriptor_layout_type(block_ir, elem_signed, layout_ir)
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        out.append(self._to_ir(builder))
+        self.shape_type._flatten_ir_types(builder, out)
+        self.strides_type._flatten_ir_types(builder, out)
+
+    def mangle(self) -> str:
+        # "TD" + the block/shape/strides/layout manglings joined by "_" + "TD"
+        parts = (self.block_type, self.shape_type, self.strides_type, self.layout)
+        return "TD" + "_".join(p.mangle() for p in parts) + "TD"
+
+
+@builtin
+def make_tensor_descriptor(ptr: ttgl.tensor, shape: List[int], strides: List[int],
+                          block_shape: List[int], layout: IntelDPASLayout,
+                          _semantic=None) -> tensor_descriptor:
+    """Create a `tensor_descriptor` for the global tensor behind `ptr`.
+
+    :param ptr: pointer tensor; its element type becomes the block element type.
+    :param shape: global shape, wrapped as int32 tensors.
+    :param strides: global strides, wrapped as int64 tensors.
+    :param block_shape: shape of the block the descriptor addresses.
+    :param layout: layout attached to the block type and the descriptor type.
+    :return: the descriptor wrapping the created IR handle.
+
+    The padding option is fixed to "zero".
+    """
+    layout = _unwrap_if_constexpr(layout)
+
+    # Materialize shape and strides as IR values (strides as i64).
+    shape_handles = _semantic._convert_to_ir_values(shape, require_i64=False)
+    stride_handles = _semantic._convert_to_ir_values(strides, require_i64=True)
+
+    # Python-side views of the same values: tuples of tensors plus their types.
+    shape_type = ttgl.tuple_type([ttgl.int32] * len(shape))
+    strides_type = ttgl.tuple_type([ttgl.int64] * len(strides))
+    shape_tuple = ttgl.tuple([ttgl.tensor(h, ttgl.int32) for h in shape_handles], shape_type)
+    strides_tuple = ttgl.tuple([ttgl.tensor(h, ttgl.int64) for h in stride_handles], strides_type)
+
+    block_type = ttgl.block_type(ptr.type.element_ty, block_shape)
+    # TODO: workaround for the dot_fma layout assertions - layout for block_type
+    # is not implemented yet (see gluon/language/_core.py:19), so set it here.
+    block_type.layout = layout
+
+    desc_type = tensor_descriptor_type(block_type, shape_type, strides_type, layout)
+
+    # Emit the descriptor op.
+    padding = _semantic._str_to_padding_option("zero")
+    desc_handle = _semantic.builder.create_make_tensor_descriptor(
+        desc_type._to_ir(_semantic.builder),
+        ptr.handle,
+        shape_handles,
+        stride_handles,
+        padding,
+    )
+
+    return tensor_descriptor(desc_handle, shape_tuple, strides_tuple, block_type, layout)
+
+@builtin
+def dot_fma(a, b, acc, _semantic=None):
+    """Emit `_semantic.dot(a, b, acc)`; `acc` needs an IntelDPASLayout, `a`/`b` matching DotOperandLayouts."""
+    assert isinstance(a, tensor), "a must be a tensor"
+    assert isinstance(b, tensor), "b must be a tensor"
+    assert isinstance(acc, tensor), "acc must be a tensor"
+    mma_layout = acc.type.layout
+    assert isinstance(mma_layout, IntelDPASLayout), "acc must have an IntelDPASLayout"
+    assert isinstance(a.type.layout, DotOperandLayout), "a must have a DotOperandLayout"
+    assert isinstance(b.type.layout, DotOperandLayout), "b must have a DotOperandLayout"
+    assert a.type.layout.parent == mma_layout, "a's parent layout must be the same as acc's layout"
+    assert b.type.layout.parent == mma_layout, "b's parent layout must be the same as acc's layout"
+    assert a.type.layout.operand_index == 0, "a's operand index must be 0"
+    assert b.type.layout.operand_index == 1, "b's operand index must be 1"
+
+    handle = _semantic.dot(a, b, acc, input_precision=None, max_num_imprecise_acc=None, out_dtype=acc.dtype).handle
+    return tensor(handle, acc.type)
+
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -29,7 +29,7 @@ class XPUOptions:
     num_ctas: int = 1
     num_stages: int = 2
     cluster_dims: tuple = (1, 1, 1)
-    warp_size: int = 32
+    warp_size: int = 16 #32 # TODO:[mdziado]
     optimize_epilogue: bool = False
     enable_fp_fusion: bool = True
     launch_cooperative_grid: bool = False
@@ -311,6 +311,11 @@ def gluon_to_ttgir(self, src, metadata, options):
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
 
+        # TODO: support tensor descriptors
+        # This is W/A to convert them into block_pointers
+        intel.passes.ttir.add_convert_tdesc_to_block_pointer(pm)
+        passes.ttir.add_rewrite_tensor_descriptor_to_pointer(pm)
+
         passes.gluon.add_inliner(pm)
         passes.gluon.add_resolve_auto_encodings(pm)
         passes.common.add_sccp(pm)
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp
@@ -5,6 +5,8 @@
 #include "mlir/IR/Verifier.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
@@ -15,6 +17,7 @@
 
 using namespace mlir;
 namespace tt = mlir::triton;
+namespace ttgi = mlir::triton::gpu::intel;
 
 namespace mlir::triton::intel {
 #define GEN_PASS_DEF_TRITONINTELTENSORDESCTOBLOCKPOINTER
@@ -265,18 +268,28 @@ struct TritonIntelTensorDescToBlockPointer
     for (size_t i = 0; i < tensorType.getRank(); ++i)
       boundaryCheck.push_back(i);
 
+    Attribute blockIOAttr =
+        op->getAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName());
+
     constexpr bool isLoad = std::is_same_v<OpTy, tt::DescriptorLoadOp>;
     if constexpr (isLoad) {
       auto loadOp = builder.createOrFold<tt::LoadOp>(
           loc, ptr, boundaryCheck,
           /*padding*/ std::nullopt, op.getCache(), op.getEvict(),
           /*volatile*/ false);
+      if (blockIOAttr) {
+          auto* loadOpInst = loadOp.getDefiningOp();
+          loadOpInst->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(), blockIOAttr);
+      }
       LLVM_DEBUG(llvm::dbgs().indent(2) << loadOp << "\n");
       op.replaceAllUsesWith(loadOp);
     } else {
       [[maybe_unused]] auto storeOp = builder.createOrFold<tt::StoreOp>(
           loc, ptr, op.getSrc(), boundaryCheck, tt::CacheModifier::NONE,
           tt::EvictionPolicy::NORMAL);
+      if (blockIOAttr) {
+          storeOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(), blockIOAttr);
+      }
       LLVM_DEBUG(llvm::dbgs().indent(2) << storeOp << "\n");
     }