@@ -167,21 +167,22 @@ def SharedEncodingTrait : AttrInterface<"SharedEncodingTrait"> {
   ];
 }

-def SwizzledSharedEncodingAttr :
-    TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
+def SwizzledSharedEncodingAttr
+    : TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding",
+                     [SharedEncodingTrait, LayoutEncodingTrait]> {
   let mnemonic = "swizzled_shared";

   let description = [{
 An encoding for tensors whose elements may be simultaneously accessed by
-different cuda threads in the programs, via shared memory. In other words,
+different GPU threads in the programs, via shared memory. In other words,
 for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.

 In order to avoid shared memory bank conflicts, elements may be swizzled.
 Here are some examples. In all cases, the input tensor is [0, 1, ..., n-1].

 1. Basic swizzling

-  #shared <{vec=1, perPhase=1, maxPhase=4, order=[1,0]}>
+  #ttg.swizzled_shared <{vec=1, perPhase=1, maxPhase=4, order=[1,0]}>
   [ 0,  1,  2,  3], // xor with 0
   [ 5,  4,  7,  6], // xor with 1
   [10, 11,  8,  9], // xor with 2
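As the next hunk's context line notes, example 1 swizzles row r by xor'ing its
column indices with r, i.e. out[r][c^r]. A minimal Python sketch of that rule,
illustrative only and not part of the patch:

```python
# Example 1 (vec=1, perPhase=1, maxPhase=4): row r stores its input row with
# columns xor'ed by r, so out[r][c ^ r] = in[r][c].
rows = cols = max_phase = 4
out = [[0] * cols for _ in range(rows)]
for r in range(rows):
    for c in range(cols):
        out[r][c ^ (r % max_phase)] = r * cols + c  # input is [0, 1, ..., n-1]
# out == [[0, 1, 2, 3], [5, 4, 7, 6], [10, 11, 8, 9], [15, 14, 13, 12]]
```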
@@ -192,7 +193,7 @@ out[r][c^r]).

 2. Multiple rows per phase

-  #shared <{vec=1, perPhase=2, maxPhase=4, order=[1,0]}>
+  #ttg.swizzled_shared <{vec=1, perPhase=2, maxPhase=4, order=[1,0]}>
   [ 0,  1,  2,  3], // phase 0 (xor with 0)
   [ 4,  5,  6,  7],
   [ 9,  8, 11, 10], // phase 1 (xor with 1)
@@ -203,7 +204,7 @@ means that pairs of 2 rows get the same swizzling.

 3. Max-phase applied

-  #shared <{vec=1, perPhase=1, maxPhase=2, order=[1,0]}>
+  #ttg.swizzled_shared <{vec=1, perPhase=1, maxPhase=2, order=[1,0]}>
   [ 0,  1,  2,  3], // phase 0 (xor with 0)
   [ 5,  4,  7,  6], // phase 1 (xor with 1)
   [ 8,  9, 10, 11], // phase 0
@@ -218,7 +219,7 @@ effect of limiting the maximum value of the xor to m-1.

 4. Max-phase and per-phase

-  #shared <{vec=1, perPhase=2, maxPhase=2, order=[1,0]}>
+  #ttg.swizzled_shared <{vec=1, perPhase=2, maxPhase=2, order=[1,0]}>
   [ 0,  1,  2,  3], // phase 0 (xor with 0)
   [ 4,  5,  6,  7], // phase 0
   [ 9,  8, 11, 10], // phase 1 (xor with 1)
@@ -234,7 +235,7 @@ maximum value of maxPhase-1. In other words, elements of row r are xor'ed with

 5. Adding vec

-  #shared <{vec=2, perPhase=1, maxPhase=4, order=[1,0]}>
+  #ttg.swizzled_shared <{vec=2, perPhase=1, maxPhase=4, order=[1,0]}>
   [ 0,  1,  2,  3,  4,  5,  6,  7],
   [10, 11,  8,  9, 14, 15, 12, 13],
   [20, 21, 22, 23, 16, 17, 18, 19],
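All five examples follow one closed form: row r is assigned phase
(r / perPhase) % maxPhase, and columns are swizzled in groups of vec by
xor'ing the group index with that phase. A hypothetical helper, not part of
the patch, that reproduces each table above:

```python
# Reproduces the example tables from (vec, perPhase, maxPhase); the input
# tensor is [0, 1, ..., n-1] as in the examples.
def swizzle(rows, cols, vec, per_phase, max_phase):
    out = [[0] * cols for _ in range(rows)]
    for r in range(rows):
        phase = (r // per_phase) % max_phase
        for c in range(cols):
            group = c // vec  # columns move in groups of `vec`
            new_c = (group ^ phase) * vec + c % vec
            out[r][new_c] = r * cols + c
    return out

assert swizzle(4, 8, 2, 1, 4)[1] == [10, 11, 8, 9, 14, 15, 12, 13]  # example 5
assert swizzle(3, 4, 1, 2, 4)[2] == [9, 8, 11, 10]                  # example 2
```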
@@ -383,6 +384,88 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
   let genVerifyDecl = 1;
 }

+def PaddedSharedEncodingAttr
+    : TritonGPU_Attr<"PaddedSharedEncoding", "padded_shared_encoding",
+                     [SharedEncodingTrait, LayoutEncodingTrait]> {
+  let mnemonic = "padded_shared";
+
+  let description = [{
+An encoding for tensors whose elements may be simultaneously accessed by
+different GPU threads in the programs, via shared memory. In other words,
+for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.
+Compared to SwizzledSharedEncodingAttr, this encoding uses padding to avoid
+shared memory bank conflicts.
+
+Formally, given a layout:
+  padded_shared<[<interval_0>:+<pad_0>, <interval_1>:+<pad_1>, ...]>
+we insert a padding of `<pad_i>` elements after every `<interval_i>` elements.
+Multiple interval-padding pairs are supported to allow multi-tiered padding
+schemes; they compose additively. So for a 1-D tensor element at index i, the
+corresponding shared memory location index is
+  i + \sum_k floor(i / interval_k) * pad_k
+Every `<interval_i>` and `<pad_i>` must be a power of two.
+
+Some concrete examples, using `eM` to mean tensor elements and `pN` to mean
+padding:
+
+1. Single interval-padding pair:
+
+  #ttg.padded_shared<[2:+2]>
+  [e0, e1, p0, p1,
+   e2, e3, p2, p3,
+   ...]
+
+2. Two interval-padding pairs:
+
+  #ttg.padded_shared<[2:+1, 4:+2]>
+  [e0, e1, p0,
+   e2, e3, p1, p2, p3,
+   e4, e5, p4,
+   e6, e7, p5, p6, p7,
+   ...]
+
+In addition to interval-padding pairs, this encoding requires an `order` to
+specify the logical tensor dimensions from fastest- to slowest-varying. Like
+other encoding attributes, it may optionally carry CGA-level organization,
+for example:
+  #ttg.padded_shared<[2:+1, 4:+2] {
+    order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1],
+    CTAOrder = [0, 1]}>
+  }];
+
+  let parameters = (ins
+    ArrayRefParameter<"unsigned">:$intervals,
+    ArrayRefParameter<"unsigned">:$paddings,
+    // Order of logical tensor dimensions; fastest-varying first.
+    ArrayRefParameter<"unsigned">:$order,
+    "CTALayoutAttr":$CTALayout
+  );
+
+  let builders = [
+    AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
+                     "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout)>,
+  ];
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    unsigned getRank() const { return getOrder().size(); }
+    int32_t getAlignment() const { return 16; }
+
+    unsigned getMinInterval() const {
+      return *llvm::min_element(getIntervals());
+    }
+
+    // Returns the total number of elements including padding given the input
+    // tensor shape.
+    int64_t getPaddedSize(ArrayRef<int64_t> shape) const;
+
+    SmallVector<unsigned> getCTAsPerCGA() const;
+    SmallVector<unsigned> getCTAOrder() const;
+    SmallVector<unsigned> getCTASplitNum() const;
+  }];
+  let hasCustomAssemblyFormat = 1;
+  let genVerifyDecl = 1;
+}
+
 def NVMMASharedEncodingAttr :
     TritonGPU_Attr<"NVMMASharedEncoding", "nvmma_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
   let mnemonic = "nvmma_shared";
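The additive padding formula in the PaddedSharedEncodingAttr description is
easy to check numerically. A minimal sketch with hypothetical helpers, not
part of the patch; `padded_size` is just one plausible 1-D reading of
`getPaddedSize`:

```python
# Maps a linear tensor index to its shared memory offset under the additive
# interval/padding scheme: i + sum_k floor(i / interval_k) * pad_k.
def padded_offset(i, interval_pads):
    return i + sum((i // interval) * pad for interval, pad in interval_pads)

# Example 2 above, #ttg.padded_shared<[2:+1, 4:+2]>:
# e6 lands at 6 + (6 // 2) * 1 + (6 // 4) * 2 = 6 + 3 + 2 = 11.
assert padded_offset(6, [(2, 1), (4, 2)]) == 11
assert padded_offset(4, [(2, 1), (4, 2)]) == 8  # e4

# One plausible 1-D reading of getPaddedSize: the offset just past the last
# element, with no trailing padding counted (an assumption, not the patch's
# definition).
def padded_size(numel, interval_pads):
    return padded_offset(numel - 1, interval_pads) + 1
```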