Skip to content

Commit 584f273

Browse files
anmyachevmgornyatalmanMogballzhuhan0
authored
Merge OpenAI Triton commit 6fedb78 (#4037)
This PR changes the Triton base from 9451f8f to 6fedb78 (Apr 24). Pass rate: 88.73% Please do not squash and merge this PR. --------- Signed-off-by: Anatoly Myachev <[email protected]> Co-authored-by: Michał Górny <[email protected]> Co-authored-by: Andrey Talman <[email protected]> Co-authored-by: Jeff Niu <[email protected]> Co-authored-by: Han Zhu <[email protected]> Co-authored-by: Lei Zhang <[email protected]> Co-authored-by: peterbell10 <[email protected]> Co-authored-by: Jingning Tang <[email protected]> Co-authored-by: Jingning Tang <[email protected]> Co-authored-by: Dan Zimmerman <[email protected]>
2 parents b06a9fe + a7b48c8 commit 584f273

File tree

31 files changed

+710
-165
lines changed

31 files changed

+710
-165
lines changed

bench/bench/bench_mlp.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
9898
# -- benchmark --
9999
fpath = Path(f"logs/{name}/{batch}-{dim1}-{dim2}-{n_expts_tot}-{n_expts_act}-{x_dtype}-{w_dtype}.hatchet")
100100
fpath.parent.mkdir(parents=True, exist_ok=True)
101-
x_dtype = {"bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
101+
x_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
102102
# special treatment of fp8_e4m3 on AMD CDNA3 because it uses fp8_e4m3fnuz
103103
if x_dtype == torch.float8_e4m3fn and get_cdna_version() == 3:
104104
x_dtype = torch.float8_e4m3fnuz
@@ -140,17 +140,29 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
140140
min_time = max(min_time_flops, min_time_bytes)
141141
util = min_time / tot_time
142142
else:
143-
util = "N/A"
143+
util = 0.0
144144
tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
145145
tbps = tot_bytes / tot_time * 1e-3
146+
print(f"Utilization: {util:.0%}; {tflops:>6.1f} TFLOPs, {tbps:.1f} TB/s")
146147

147148
return util, tflops, tbps
148149

149150

150151
if __name__ == "__main__":
151152
has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or get_cdna_version() == 4
152-
qxdtype = "fp8" if has_native_mx4 else "bf16"
153-
print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
154-
print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
155-
print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4"))
156-
print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=1, name="llama4"))
153+
if SPECS is None:
154+
print("Current GPU has no specs provided, utilization is N/A")
155+
if has_native_mx4:
156+
bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense")
157+
bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "mx4", TP=1, EP=1, name="dense")
158+
bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4")
159+
bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "mx4", TP=4, EP=1, name="llama4")
160+
else:
161+
# bf16/fp16 x fp8 is skipped because matmul_ogs requires x and w have the
162+
# same type when not doing an mxfp operation
163+
bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense")
164+
bench_mlp(8192, 8192, 8192, 1, 1, "fp16", "mx4", TP=1, EP=1, name="dense")
165+
bench_mlp(8192, 8192, 8192, 1, 1, "bf16", "mx4", TP=1, EP=1, name="dense")
166+
bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4")
167+
bench_mlp(2048, 5120, 8192, 128, 4, "bf16", "mx4", TP=4, EP=1, name="llama4")
168+
bench_mlp(2048, 5120, 8192, 128, 4, "fp16", "mx4", TP=4, EP=1, name="llama4")

include/triton/Dialect/Triton/IR/OpInterfaces.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define TRITON_IR_OP_INTERFACES_H_
33

44
#include "mlir/IR/OpDefinition.h"
5+
#include "triton/Dialect/Triton/IR/Types.h"
56

67
namespace mlir {
78

include/triton/Dialect/Triton/IR/TritonAttrDefs.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,21 @@ def TT_AtomicRMWAttr : I32EnumAttr<
6767
let cppNamespace = "::mlir::triton";
6868
}
6969

70+
def TT_DescriptorReduceKindAttr : I32EnumAttr<
71+
"DescriptorReduceKind", "",
72+
[
73+
I32EnumAttrCase<"ADD", 1, "add">,
74+
I32EnumAttrCase<"MIN", 2, "min">,
75+
I32EnumAttrCase<"MAX", 3, "max">,
76+
I32EnumAttrCase<"INC", 4, "inc">,
77+
I32EnumAttrCase<"DEC", 5, "dec">,
78+
I32EnumAttrCase<"AND", 6, "and">,
79+
I32EnumAttrCase<"OR", 7, "or">,
80+
I32EnumAttrCase<"XOR", 8, "xor">,
81+
]> {
82+
let cppNamespace = "::mlir::triton";
83+
}
84+
7085
def TT_MemSyncScopeAttr : I32EnumAttr<
7186
"MemSyncScope", "",
7287
[

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,31 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
7575
let verify = [{ return ::mlir::triton::impl::verifyDotOpInterface($_op); }];
7676
}
7777

78+
def TT_DescriptorOpInterface : OpInterface<"DescriptorOpInterface"> {
79+
let description = [{
80+
Common interface to get the descriptor argument from an operation on tensor descriptors.
81+
}];
82+
83+
let methods = [
84+
InterfaceMethod<
85+
/*desc=*/"Get the descriptor",
86+
/*retType=*/"::mlir::TypedValue<mlir::triton::TensorDescType>",
87+
/*methodName=*/"getDesc",
88+
/*args=*/(ins)>,
89+
];
90+
}
91+
92+
def TT_DescriptorStoreLikeOpInterface : OpInterface<"DescriptorStoreLikeOpInterface", [TT_DescriptorOpInterface]> {
93+
let cppNamespace = "::mlir::triton";
94+
95+
let methods = [
96+
InterfaceMethod<
97+
/*desc=*/"Get Source tensor",
98+
/*retType=*/"::mlir::TypedValue<mlir::RankedTensorType>",
99+
/*methodName=*/"getSrc",
100+
/*args=*/(ins)>,
101+
];
102+
}
103+
78104

79105
#endif // TRITON_OP_INTERFACES

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,7 +1019,7 @@ def TT_MakeTensorDescOp : TT_Op<"make_tensor_descriptor", [
10191019
let assemblyFormat = "$base `,` `[` $shape `]` `,` `[` $strides `]` attr-dict `:` type($base) `,` type($result)";
10201020

10211021
let builders = [
1022-
OpBuilder<(ins "Value":$base, "ValueRange":$shape, "ValueRange":$strides, "ArrayRef<int32_t>":$blockShape)>
1022+
OpBuilder<(ins "Value":$base, "ValueRange":$shape, "ValueRange":$strides, "ArrayRef<int32_t>":$blockShape, "bool":$isSignedInteger)>
10231023
];
10241024

10251025
let extraClassDeclaration = [{
@@ -1259,7 +1259,7 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
12591259
}
12601260

12611261

1262-
def TT_DescriptorLoadOp : TT_Op<"descriptor_load"> {
1262+
def TT_DescriptorLoadOp : TT_Op<"descriptor_load", [TT_DescriptorOpInterface]> {
12631263
let summary = "Load from descriptor";
12641264
let description = [{
12651265
This operation will be lowered to Nvidia TMA load operation on targets supporting it.
@@ -1287,7 +1287,7 @@ def TT_DescriptorLoadOp : TT_Op<"descriptor_load"> {
12871287
let hasVerifier = 1;
12881288
}
12891289

1290-
def TT_DescriptorStoreOp : TT_Op<"descriptor_store"> {
1290+
def TT_DescriptorStoreOp : TT_Op<"descriptor_store", [TT_DescriptorStoreLikeOpInterface]> {
12911291
let summary = "store value based on descriptor";
12921292
let description = [{
12931293
This operation will be lowered to Nvidia TMA store operation on targets supporting it.
@@ -1304,11 +1304,30 @@ def TT_DescriptorStoreOp : TT_Op<"descriptor_store"> {
13041304
$desc `[` $indices `]` `,` $src
13051305
attr-dict `:` qualified(type($desc)) `,` type($src)
13061306
}];
1307-
13081307
let hasVerifier = 1;
13091308
}
13101309

1311-
def TT_DescriptorGatherOp : TT_Op<"descriptor_gather"> {
1310+
def TT_DescriptorReduceOp : TT_Op<"descriptor_reduce", [TT_DescriptorStoreLikeOpInterface]> {
1311+
let summary = "performs a reducing store operation based on a descriptor";
1312+
let description = [{
1313+
This operation will be lowered to Nvidia TMA store operation on targets supporting it.
1314+
`desc` is a tensor descriptor object.
1315+
The shape and types of `src` must match the descriptor, otherwise the result is undefined.
1316+
}];
1317+
let arguments = (ins
1318+
TT_DescriptorReduceKindAttr:$kind,
1319+
Arg<TT_TensorDescType, "", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>:$desc,
1320+
TT_Tensor:$src,
1321+
Variadic<I32>:$indices
1322+
);
1323+
1324+
let assemblyFormat = [{
1325+
$kind `,` $desc `[` $indices `]` `,` $src
1326+
attr-dict `:` qualified(type($desc)) `,` type($src)
1327+
}];
1328+
}
1329+
1330+
def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [TT_DescriptorOpInterface]> {
13121331
let summary = "gather multiple rows from a descriptor into a single tensor";
13131332
let description = [{
13141333
The `tt.descriptor_gather` op will be lowered to NVIDIA TMA
@@ -1341,7 +1360,7 @@ def TT_DescriptorGatherOp : TT_Op<"descriptor_gather"> {
13411360
}];
13421361
}
13431362

1344-
def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter"> {
1363+
def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter", [TT_DescriptorStoreLikeOpInterface]> {
13451364
let summary = "scatter multiple rows to a descriptor from a single tensor";
13461365
let description = [{
13471366
The `tt.descriptor_scatter` op will be lowered to NVIDIA TMA

include/triton/Dialect/Triton/IR/TritonTypes.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,28 @@ def TT_TensorDescType : TritonTypeDef<"TensorDesc", "tensordesc", []> {
102102

103103
let parameters = (ins "RankedTensorType":$blockType);
104104
let assemblyFormat = "`<` $blockType `>`";
105+
106+
let builders = [
107+
TypeBuilder<(ins "RankedTensorType":$blockType, "bool":$isSigned), [{
108+
if (auto intTy = llvm::dyn_cast<IntegerType>(blockType.getElementType())) {
109+
auto sem = isSigned ? IntegerType::Signed : IntegerType::Unsigned;
110+
auto elemTy = IntegerType::get($_ctxt, intTy.getWidth(), sem);
111+
blockType = RankedTensorType::get(blockType.getShape(), elemTy);
112+
}
113+
return Base::get($_ctxt, blockType);
114+
}]>,
115+
];
116+
let extraClassDeclaration = [{
117+
RankedTensorType getSignlessBlockType() const {
118+
auto resTy = getBlockType();
119+
if (auto intTy = llvm::dyn_cast<IntegerType>(resTy.getElementType())) {
120+
auto width = resTy.getElementTypeBitWidth();
121+
auto signlessTy = IntegerType::get(getContext(), width);
122+
resTy = RankedTensorType::get(resTy.getShape(), signlessTy);
123+
}
124+
return resTy;
125+
}
126+
}];
105127
}
106128

107129
#endif

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global">
316316
}];
317317

318318
let arguments = (ins
319-
Arg<TT_PtrType, "", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>:$desc_ptr,
319+
Arg<TT_PtrType, "", [MemWrite<GlobalMemory>]>:$desc_ptr,
320320
Variadic<I32>:$coord,
321321
Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src
322322
);
@@ -327,6 +327,29 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global">
327327
}];
328328
}
329329

330+
def TTNG_AsyncTMAReduceOp : TTNG_Op<"async_tma_reduce", [MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>]> {
331+
let summary = "reduce result in gmem based on a TMA descriptor";
332+
333+
let description = [{
334+
This operation copies data from local memory to global memory
335+
asynchronously, and atomically performs the specified reduction kind.
336+
Atomicity is at the granularity of individual elements, and only relaxed
337+
semantics are implied.
338+
}];
339+
340+
let arguments = (ins
341+
TT_DescriptorReduceKindAttr:$kind,
342+
Arg<TT_PtrType, "", [MemRead<GlobalMemory>]>:$desc_ptr,
343+
Variadic<I32>:$coord,
344+
Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src
345+
);
346+
347+
let assemblyFormat = [{
348+
$kind `,` $desc_ptr `[` $coord `]` $src
349+
attr-dict `:` qualified(type($desc_ptr)) `,` qualified(type($src))
350+
}];
351+
}
352+
330353
def TTNG_AsyncTMAGatherOp : TTNG_Op<"async_tma_gather"> {
331354
let summary = "gather data based on descriptor from global memory to local memory asynchronously";
332355

@@ -365,7 +388,7 @@ def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter"> {
365388
}];
366389

367390
let arguments = (ins
368-
Arg<TT_PtrType, "", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>:$desc_ptr,
391+
Arg<TT_PtrType, "", [MemWrite<GlobalMemory>]>:$desc_ptr,
369392
RankedTensorOf<[I32]>:$x_offsets,
370393
I32:$y_offset,
371394
Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,7 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
620620
GenericOpPattern<triton::AtomicRMWOp>, GenericOpPattern<ReturnOp>,
621621
GenericOpPattern<triton::DescriptorLoadOp>,
622622
GenericOpPattern<triton::DescriptorStoreOp>,
623+
GenericOpPattern<triton::DescriptorReduceOp>,
623624
GenericOpPattern<triton::ExperimentalTensormapCreateOp>,
624625
GenericOpPattern<triton::ExperimentalTensormapFenceproxyAcquireOp>,
625626
// this assumes the right layout will be set later for dot scaled.

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -969,16 +969,17 @@ OpFoldResult AdvanceOp::fold(FoldAdaptor adaptor) {
969969
//-- MakeTensorDescOp --
970970
void MakeTensorDescOp::build(OpBuilder &builder, OperationState &state,
971971
Value base, ValueRange shape, ValueRange strides,
972-
ArrayRef<int32_t> blockShape) {
972+
ArrayRef<int32_t> blockShape,
973+
bool isSignedInteger) {
973974
auto ptrTy = dyn_cast<triton::PointerType>(base.getType());
974975
if (!ptrTy) {
975976
llvm::report_fatal_error("Expected pointer type");
976977
}
977978
auto elemTy = ptrTy.getPointeeType();
978-
979979
SmallVector<int64_t> blockShape64(blockShape);
980980
auto blockTy = RankedTensorType::get(blockShape64, elemTy);
981-
auto descTy = TensorDescType::get(builder.getContext(), blockTy);
981+
auto descTy =
982+
TensorDescType::get(builder.getContext(), blockTy, isSignedInteger);
982983
return build(builder, state, descTy, base, shape, strides);
983984
}
984985

@@ -1333,21 +1334,23 @@ static LogicalResult verifyGatherScatterOp(Operation *op,
13331334
}
13341335

13351336
LogicalResult DescriptorGatherOp::verify() {
1336-
return verifyGatherScatterOp(*this, getDesc().getType().getBlockType(),
1337+
return verifyGatherScatterOp(*this,
1338+
getDesc().getType().getSignlessBlockType(),
13371339
getResult().getType(), getXOffsets().getType());
13381340
}
13391341

13401342
// -- DescriptorScatterOp --
13411343
LogicalResult DescriptorScatterOp::verify() {
1342-
return verifyGatherScatterOp(*this, getDesc().getType().getBlockType(),
1344+
return verifyGatherScatterOp(*this,
1345+
getDesc().getType().getSignlessBlockType(),
13431346
getSrc().getType(), getXOffsets().getType());
13441347
}
13451348

13461349
// -- DescriptorLoadOp --
13471350
static LogicalResult verifyDescriptorLoadStoreType(Operation *op,
13481351
TensorDescType desc,
13491352
RankedTensorType tensor) {
1350-
RankedTensorType block = desc.getBlockType();
1353+
RankedTensorType block = desc.getSignlessBlockType();
13511354
ArrayRef<int64_t> blockShape = block.getShape();
13521355
ArrayRef<int64_t> tensorShape = tensor.getShape();
13531356
if (blockShape.size() > tensorShape.size()) {

lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,8 @@ static SmallVector<TMAStore> getTMAStores(scf::ForOp forOp) {
1818
SmallVector<TMAStore> tmaStores;
1919

2020
forOp.getBody()->walk<mlir::WalkOrder::PreOrder>([&](Operation *op) {
21-
if (auto storeOp = dyn_cast<tt::DescriptorStoreOp>(op)) {
21+
if (auto storeOp = dyn_cast<tt::DescriptorStoreLikeOpInterface>(op)) {
2222
tmaStores.push_back({storeOp, storeOp.getDesc(), storeOp.getSrc()});
23-
} else if (auto scatterOp = dyn_cast<tt::DescriptorScatterOp>(op)) {
24-
tmaStores.push_back({scatterOp, scatterOp.getDesc(), scatterOp.getSrc()});
25-
2623
// Don't walk into nested loops.
2724
} else if (isa<scf::ForOp>(op)) {
2825
return WalkResult::skip();
@@ -77,6 +74,13 @@ static void createTMAAsyncCopy(scf::ForOp forOp, const TMAStore &store,
7774
storeOp.getIndices());
7875
builder.create<ttng::AsyncTMACopyLocalToGlobalOp>(
7976
loc, tmaPtr, storeOp.getIndices(), alloc);
77+
} else if (auto reduceOp = dyn_cast<tt::DescriptorReduceOp>(store.op)) {
78+
auto indices = ttng::translateTMAIndices(
79+
builder, reduceOp.getLoc(),
80+
reduceOp.getDesc().getType().getBlockType().getEncoding(),
81+
reduceOp.getIndices());
82+
builder.create<ttng::AsyncTMAReduceOp>(loc, reduceOp.getKind(), tmaPtr,
83+
reduceOp.getIndices(), alloc);
8084
} else {
8185
auto scatterOp = cast<tt::DescriptorScatterOp>(store.op);
8286
builder.create<ttng::AsyncTMAScatterOp>(

0 commit comments

Comments
 (0)