Commit 86d9987

Merge OpenAI Triton commit 6e390f3 (#5216)
This PR changes the Triton base from dbc85fc to 6e390f3 (Sep 23). Pass rate: 96.23% -> 96.98%
2 parents e9d399a + 39029bf commit 86d9987

File tree

92 files changed: +4801 −2193 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ docs/sg_execution_times.rst
 /compile_commands.json
 .vscode
 .vs
+.cursor

 # Vim
 *.swp

README.md

Lines changed: 1 addition & 0 deletions

@@ -244,6 +244,7 @@ See [`python/triton/knobs.py`](python/triton/knobs.py) for the full list of conf
 - `TRITON_FRONT_END_DEBUGGING=1` disables exception wrapping when an error occurs in the compiler frontend, allowing the full stack trace to be seen.
 - `TRITON_DISABLE_LINE_INFO=1` removes all line information from the module.
 - `PTXAS_OPTIONS` passes additional command-line options to the PTX assembler `ptxas` (only on NVIDIA).
+- `LLVM_EXTRACT_DI_LOCAL_VARIABLES` emits full debug info, allowing values to be evaluated in GPU debuggers (e.g., cuda-gdb, rocm-gdb).

 > [!NOTE]
 > Some of these environment variables don't have a knob in `knobs.py` -- those are only relevant to the C++ layer(s), hence they don't exist in the python layer.
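As a usage sketch, these knobs are ordinary environment variables set per invocation. The kernel script name below is hypothetical; the first command only demonstrates that a per-invocation setting reaches the child process and does not require Triton to be installed:

```shell
# Demonstrate that a per-invocation knob reaches the child process
# (Triton itself is not needed for this check).
TRITON_DISABLE_LINE_INFO=1 python3 -c \
  'import os; print(os.environ.get("TRITON_DISABLE_LINE_INFO"))'

# Typical real invocation (my_kernel.py is a hypothetical script):
#   LLVM_EXTRACT_DI_LOCAL_VARIABLES=1 PTXAS_OPTIONS="-v" python3 my_kernel.py
```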

bin/RegisterTritonDialects.h

Lines changed: 13 additions & 0 deletions

@@ -49,6 +49,12 @@
 #include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h"
 #include "mlir/InitAllPasses.h"

+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
+#include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
+
 namespace mlir {
 namespace test {
 namespace intel {

@@ -108,13 +114,20 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::triton::registerTritonGENToSPIRVPasses();
   mlir::LLVM::registerInlinerInterface(registry);
   mlir::NVVM::registerInlinerInterface(registry);
+  mlir::registerLLVMDILocalVariable();

   // TritonAMDGPUToLLVM passes
   mlir::triton::registerAllocateAMDGPUSharedMemory();
   mlir::triton::registerConvertTritonAMDGPUToLLVM();
   mlir::triton::registerConvertBuiltinFuncToLLVM();
   mlir::triton::registerOptimizeAMDLDSUsage();

+  mlir::ub::registerConvertUBToLLVMInterface(registry);
+  mlir::registerConvertNVVMToLLVMInterface(registry);
+  mlir::registerConvertMathToLLVMInterface(registry);
+  mlir::cf::registerConvertControlFlowToLLVMInterface(registry);
+  mlir::arith::registerConvertArithToLLVMInterface(registry);
+
   // TritonAMDGPUTransforms passes
   mlir::registerTritonAMDGPUAccelerateMatmul();
   mlir::registerTritonAMDGPUOptimizeEpilogue();

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 1 addition & 0 deletions

@@ -48,6 +48,7 @@ constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";

 // Find the contextual number of warps on which this operation is executed.
 int lookupNumWarps(Operation *op);
+int lookupNumWarps(Region *region);
 // Try to find the contextual number of warps on which this operation is
 // executed. Returns nullopt if a warp size cannot be find. This is used for
 // verifiers.

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 19 additions & 14 deletions

@@ -1036,15 +1036,17 @@ An encoding for tensors that have been produced by MFMA matrix core instructions
 available on AMD Instinct GPUs of CDNA architectures.

 It is characterized by the following parameters:
-- `version` indicates the GPU architecture:
+- `version`: The GPU architecture:
   - 1: gfx908: CDNA1
   - 2: gfx90a: CDNA2
   - 3: gfx942: CDNA3
   - 4: gfx950: CDNA4
-- `warpsPerCTA` indicates the warp layout in the block.
-- `MDim` and `NDim` indicate the dimension of the output of the mfma instruction.
-- `isTransposed` indicates the result tensor is transposed so that it can be converted to dotOperand layout
+- `warpsPerCTA`: The warp layout in the block.
+- `instrShape`: The shape in the form of (M, N, K) of the matrix.
+- `isTransposed`: Indicates the result tensor is transposed so that it can be converted to dotOperand layout
   without going to shared memory. This is used in the case of chained dot (E.g. Flash-Attention kernel).
+- `tilesPerWarp`: The tile layout within a warp. Defaults to unit tile layout, i.e., single tile on all dimensions.
+- `elementBitWidth`: Bit width of the output element type. Supported values are 32 and 64. Defaults to 32.

 Example 1:
 Suppose we have a tensor with a shape of [32, 64], warpsPerCTA set to [1, 2] and MDim=NDim=32.

@@ -1154,25 +1156,27 @@ w2 w2 w3 w3
   ins
     "unsigned": $version,
     ArrayRefParameter<"unsigned">:$warpsPerCTA,
-    ArrayRefParameter<"unsigned">:$tilesPerWarp,
-    "unsigned":$MDim,
-    "unsigned":$NDim,
+    ArrayRefParameter<"unsigned">:$instrShape,
     "bool":$isTransposed,
     "CTALayoutAttr":$CTALayout,
-    DefaultValuedParameter<"std::optional<Type>", "FloatType::get($_ctxt, 32)">:$elementType
+    ArrayRefParameter<"unsigned">:$tilesPerWarp,
+    "unsigned":$elementBitWidth
   );

   let builders = [
     AttrBuilder<(ins "unsigned":$version,
                      "ArrayRef<unsigned>":$warpsPerCTA,
-                     "unsigned":$MDim,
-                     "unsigned":$NDim,
+                     "ArrayRef<unsigned>":$instrShape,
                      "bool":$isTransposed,
                      "CTALayoutAttr":$CTALayout,
-                     "std::optional<Type>":$elementType), [{
-      SmallVector<unsigned> tilesPerWarp(warpsPerCTA.size(), 1);
-
-      return $_get(context, version, warpsPerCTA, tilesPerWarp, MDim, NDim, isTransposed, CTALayout, elementType);
+                     CArg<"ArrayRef<unsigned>", "{}">:$tpw,
+                     CArg<"unsigned", "0">:$elementBitWidth), [{
+      SmallVector<unsigned> tilesPerWarp(tpw);
+      if (tilesPerWarp.empty())
+        tilesPerWarp = SmallVector<unsigned>(warpsPerCTA.size(), 1);
+      if (elementBitWidth == 0)
+        elementBitWidth = 32;
+      return $_get($_ctxt, version, warpsPerCTA, instrShape, isTransposed, CTALayout, tilesPerWarp, elementBitWidth);
     }]>
   ];

@@ -1194,6 +1198,7 @@ w2 w2 w3 w3

   let genVerifyDecl = 1;
   let hasCustomAssemblyFormat = 1;
+  let skipDefaultBuilders = 1;
 }

 def AMDWmmaEncodingAttr : DistributedEncoding<"AMDWmmaEncoding", "amd_wmma_encoding", [MmaEncodingTrait]> {
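The builder's new defaulting behavior (an empty `tpw` becomes a unit tile layout; `elementBitWidth == 0` means 32) can be restated in Python. Names mirror the TableGen parameters, but the function itself is purely illustrative:

```python
def build_mfma_params(warps_per_cta, tiles_per_warp=(), element_bit_width=0):
    """Illustrative restatement of the AttrBuilder defaulting logic."""
    tpw = list(tiles_per_warp)
    if not tpw:
        # Unit tile layout: one tile per warp on every dimension.
        tpw = [1] * len(warps_per_cta)
    if element_bit_width == 0:
        element_bit_width = 32  # default output element width
    return tpw, element_bit_width

# Defaults applied:
print(build_mfma_params([1, 2]))              # ([1, 1], 32)
# Explicit values pass through:
print(build_mfma_params([4, 1], (2, 2), 64))  # ([2, 2], 64)
```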

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 8 additions & 0 deletions

@@ -53,6 +53,14 @@ class MMAv5PipelineableOperandsHelper {
   bool isOperandPipelineable(Value v, Operation *&foundDef);
 };

+bool areScalesPipelineable(TCGen5MMAScaledOp scaledOp, scf::ForOp forOp);
+bool isOperandPipelineableBase(
+    Value v, scf::ForOp forOp, Operation *&foundDef,
+    std::function<bool(Operation *)> isPipelineable =
+        [](Operation *) { return false; },
+    std::function<bool(Operation *)> isLoadToBePipelined =
+        [](Operation *) { return false; });
+
 //===----------------------------------------------------------------------===//
 // MMA Pipeline Rewriters
 //===----------------------------------------------------------------------===//
Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@

# Triton Instrument Dialect and Concurrency Sanitizer (ConSan)

## Overview

ConSan instruments Triton IR to detect illegal concurrent accesses to shared and Tensor Core memory under warp specialization. It tracks per-buffer visibility of reads and writes across threads, models barrier-based synchronization, and models commit-count–based synchronization (cp.async, wgmma).

Auxiliary state is kept in distributed tensors and global scratch memory, with types created on demand per warp-specialization partition.
## Thread model

- Base threads: 16 warp-specialization (WS) threads (allowing for up to 16 partitions).
- Peer classes: +16 Tensor Core (TC) threads and +16 TMA threads, modeling the lack of ordering with base threads.
- Total logical threads: 48. Bitmasks are sized to the next power of two: 64.

Indexing uses a logical thread id in [0, 48), with column vectors sized to 64 for layout convenience.
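A small sketch of this sizing; only the 16/48/64 figures come from the text, and the per-class offsets below are an illustrative assumption:

```python
NUM_PARTITIONS = 16  # base warp-specialization threads

# Peer classes model ops that are unordered w.r.t. base threads.
# The concrete per-class offsets are illustrative assumptions.
BASE, TMA, TC = 0, 16, 32

def logical_thread(klass: int, partition: int) -> int:
    """Logical thread id in [0, 48)."""
    assert 0 <= partition < NUM_PARTITIONS
    return klass + partition

def mask_width(num_threads: int) -> int:
    """Bitmasks are padded to the next power of two."""
    width = 1
    while width < num_threads:
        width *= 2
    return width

print(mask_width(48))  # 64: the bitmask width used throughout
```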
## Auxiliary data structures

All types are generated on demand (per partition) based on:

- B: number of tracked buffers (power-of-two padded)
- K: number of mbarriers (power-of-two padded)
- T_bits: 64 (bitmask width)
- T_commits: 16 (base threads; commit counters do not apply to TC/TMA helpers)

“tensor” means a distributed Triton tensor; “scratch” means a pointer into global scratch memory. Shapes below are logical; actual encodings are partition-local blocked layouts.

- buffers (tensor, <B x i64>): Base pointers of all (sub)buffers per memory space
- barriers (tensor, <K x i64>): Pointers of all mbarriers
- writeVisibility (scratch, <B x i64>): Per-buffer bitmask. Bit i set ⇒ thread i can see the latest completed write to that buffer
- readVisibility (scratch, <B x 64 x i64>): Per-buffer, per-thread lanes. Each lane stores a 64-bit mask of other threads whose reads are visible to that lane’s thread
- writeTracking (scratch, <B x K x i8>): Map buffers → barriers tracking writes (boolean stored in i8)
- readTracking (scratch, <B x K x i64>): Map buffers → barriers tracking reads (bitmask of threads)
- outstandingCommits (scratch, <B x 16 x i8>): Per-buffer, per-base-thread commit counters for cp.async and wgmma
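The shapes above can be made concrete with a host-side mock (a sketch only; the real state lives in distributed tensors and global scratch memory):

```python
T_BITS = 64      # bitmask width (48 logical threads, padded to a power of two)
T_COMMITS = 16   # commit counters exist for base threads only

def make_consan_state(B: int, K: int) -> dict:
    """Host-side mock of the per-partition auxiliary tables."""
    return {
        "writeVisibility": [0] * B,                          # <B x i64> bitmasks
        "readVisibility": [[0] * T_BITS for _ in range(B)],  # <B x 64 x i64>
        "writeTracking": [[False] * K for _ in range(B)],    # <B x K x i8>
        "readTracking": [[0] * K for _ in range(B)],         # <B x K x i64>
        "outstandingCommits": [[0] * T_COMMITS for _ in range(B)],  # <B x 16 x i8>
    }

state = make_consan_state(B=4, K=2)
```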
## Visibility and legality rules

- Reads are legal iff the reading thread sees the most recent write to the buffer (writeVisibility). There can be only one write in flight.
- Writes are legal iff the writing thread sees both all prior writes and all reads completed for that buffer.

ConSan enforces these via two checks emitted before memory ops:

- experimental_verify_write_visibility: “no one else is writing, or I can see the write”
- experimental_verify_read_visibility: “my read-visibility lane is a superset of the OR of all lanes”
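On the host-side mock, the two checks reduce to bitmask predicates. This is an interpretation sketch of the rules as stated, not the emitted IR:

```python
def verify_write_visibility(write_visibility: int, thread: int) -> bool:
    """Legal if no completed write is recorded, or this thread can see it."""
    return write_visibility == 0 or bool(write_visibility & (1 << thread))

def verify_read_visibility(read_lanes: list[int], thread: int) -> bool:
    """Legal if this thread's lane is a superset of the OR of all lanes."""
    all_reads = 0
    for lane in read_lanes:
        all_reads |= lane
    return (read_lanes[thread] & all_reads) == all_reads

lanes = [0] * 64
lanes[2] = 1 << 5          # thread 2 saw a read by thread 5
print(verify_read_visibility(lanes, 2))  # True: lane 2 covers all reads
print(verify_read_visibility(lanes, 0))  # False: lane 0 misses thread 5's read
```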
## Barrier-based synchronization

ConSan separates “tracking” from “visibility transfer”:

- At memory ops that are tracked by a barrier (loads/stores, some TMEM ops):
  - experimental_set_read_visibility / experimental_set_write_visibility updates the appropriate visibility table for the current thread and buffer.
  - experimental_track_visible_reads / experimental_track_visible_writes snapshots the current per-buffer visibility into readTracking/writeTracking for the given barrier.
- At arrive/commit sites (e.g., tc commit, arrive on mbarrier): ConSan emits the track ops for both reads and writes.
- At waits: experimental_transfer_visible_reads / experimental_transfer_visible_writes propagates tracked visibility from the barrier back into the waiting thread’s visibility; the transfer is repeated for peer threads (base, TMA, TC) to keep the three classes consistent.
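The track-then-transfer flow for reads can be mocked on the host. This is a sketch of the described semantics on plain Python bitmasks, not the real ops:

```python
# Minimal mock state: readVisibility is <B x 64 x i64>, readTracking <B x K x i64>.
B, K = 2, 1
read_visibility = [[0] * 64 for _ in range(B)]
read_tracking = [[0] * K for _ in range(B)]

def track_visible_reads(buf: int, barrier: int) -> None:
    """Arrive: snapshot the OR of all visibility lanes into the barrier slot."""
    all_reads = 0
    for lane in read_visibility[buf]:
        all_reads |= lane
    read_tracking[buf][barrier] |= all_reads

def transfer_visible_reads(buf: int, barrier: int, thread: int) -> None:
    """Wait: fold the tracked reads back into the waiting thread's lane.
    (In ConSan this transfer is repeated for the thread's TMA/TC peers.)"""
    read_visibility[buf][thread] |= read_tracking[buf][barrier]

read_visibility[0][5] = 1 << 5                      # thread 5 sees its own read
track_visible_reads(buf=0, barrier=0)               # arrive on the barrier
transfer_visible_reads(buf=0, barrier=0, thread=2)  # thread 2 waits
print(bool(read_visibility[0][2] & (1 << 5)))       # True: thread 2 now sees it
```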
## Commit-count–based synchronization

Some hardware ops synchronize via a “number of outstanding commits” rather than mbarriers.

- Stage: experimental_stage_access_for_commit marks the current thread’s buffer lane with -1 (staged) in outstandingCommits[B x 16].
- Commit: experimental_commit_accesses turns -1 into 1 and increments positive entries in the committing thread’s column.
- Wait (cp.async): experimental_clear_outstanding_commits_set_write(thread, commits, writeVisibility, N) clears entries with count > N for the current thread, and sets the writeVisibility bit for rows where any thread’s entry was cleared.
- Wait (wgmma): experimental_clear_outstanding_commits_set_read(thread, commits, readVisibility, N) clears entries with count > N for the current thread, and sets the readVisibility bit for rows where any thread’s entry was cleared.

Legality checks for commit-count flows:

- For writes to shared memory affected by cp.async: experimental_check_outstanding_commits(buffer, commits, "async_copy_global_to_shared") asserts the row for the buffer is all zeros (no pending writes), across all base-thread columns.
- For reads of wgmma operands in shared memory: experimental_check_outstanding_commits(buffer, commits, "warpgroup_mma operand read") asserts the row is all zeros (no pending reads).

Note: The check op has no “thread” operand; it inspects the whole row for the buffer.
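The counter lifecycle above can be mocked on the host. The column-per-base-thread layout follows the description; everything else (function names, the scenario) is illustrative:

```python
T_COMMITS = 16
B = 2
commits = [[0] * T_COMMITS for _ in range(B)]  # outstandingCommits mock
STAGED = -1

def stage(buf: int, thread: int) -> None:
    """Mark the thread's lane for this buffer as staged."""
    commits[buf][thread] = STAGED

def commit(thread: int) -> None:
    """Turn staged (-1) into 1; age every already-committed entry by one."""
    for row in commits:
        if row[thread] == STAGED:
            row[thread] = 1
        elif row[thread] > 0:
            row[thread] += 1

def clear_outstanding(thread: int, n: int) -> list[int]:
    """Wait for <= n outstanding groups: entries with count > n are complete."""
    completed = []
    for buf, row in enumerate(commits):
        if row[thread] > n:
            row[thread] = 0
            completed.append(buf)  # the visibility bit would be set here
    return completed

def check_outstanding(buf: int) -> bool:
    """Check op: the whole row must be zero (no pending accesses)."""
    return all(c == 0 for c in commits[buf])

stage(0, thread=3); commit(thread=3)      # buffer 0: count 1
stage(1, thread=3); commit(thread=3)      # buffer 0 ages to 2, buffer 1: count 1
print(clear_outstanding(thread=3, n=1))   # [0]: only buffer 0 is old enough
print(check_outstanding(0), check_outstanding(1))  # True False
```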
