Commit 1c2f9d0

Merge commit 'c172d539a2f412eaec7f508c81e0cf1f21e95ede'
2 parents: 0499edd + c172d53

78 files changed: +2374 −1911 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -73,6 +73,7 @@ cmake-build-*
 cuobjdump
 nvdisasm
 ptxas
+ptxas-blackwell

 # Third-party include
 third_party/nvidia/backend/include

cmake/nvidia-toolchain-version.json

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 {
+  "ptxas-blackwell": "12.9.86",
   "ptxas": "12.8.93",
   "cuobjdump": "12.8.55",
   "nvdisasm": "12.8.55",

include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 28 additions & 86 deletions
@@ -57,6 +57,7 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
   // computation is eliminated.
   SmallVector<Value> maybeDeduplicate(SourceOp op,
                                       SmallVector<Value> resultVals) const {
+    auto ctx = op.getContext();
     if (!isMemoryEffectFree(op))
       // the op has side effects: can't dedup
       return resultVals;
@@ -65,104 +66,45 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
       // there must be exactly 1 result
       return resultVals;
     Value result = results[0];
-    Type type = result.getType();
-    if (!type)
-      return resultVals;
-    RankedTensorType rtType = dyn_cast<RankedTensorType>(type);
+    RankedTensorType rtType = dyn_cast<RankedTensorType>(result.getType());
     if (!rtType)
       // the result must be a tensor
       return resultVals;
-    Attribute encoding = rtType.getEncoding();
-    if (!encoding)
-      // encoding not available
-      return resultVals;
-    Attribute baseEncoding = encoding;
-    if (isa<AMDMfmaEncodingAttr>(baseEncoding) ||
-        isa<AMDWmmaEncodingAttr>(baseEncoding))
-      // TODO: this logic seems incorrect for mfma and wmma layout. Skip for
-      // now. We saw mismatches for some flash-attention and dot tests on AMD
-      // backend. Note that this logic works for sliced layout whose parent is
-      // mfma layout. Therefore, this is not combined with the following check.
-      return resultVals;
-    while (auto sliced = dyn_cast<SliceEncodingAttr>(baseEncoding))
-      baseEncoding = sliced.getParent();
-    if (isa<LinearEncodingAttr, DotOperandEncodingAttr>(baseEncoding)) {
-      // TODO: this logic seems incorrect for mma layout. Skip for now.
-      // The following test crashes and some other miscompile:
-      // test_core::test_fp8_dot_acc
-      return resultVals;
-    }

-    SmallVector<unsigned> elemsPerThread = getElemsPerThread(rtType);
-    int rank = elemsPerThread.size();
-    if (product<unsigned>(elemsPerThread) != resultVals.size())
-      return resultVals;
+    // Bail out if we don't have the constancy analysis
     AxisInfo *axisInfo = axisAnalysisPass.getAxisInfo(result);
     if (!axisInfo)
-      // axis info (e.g., constancy) not available
-      return resultVals;
-    SmallVector<unsigned> contigPerThread = getContigPerThread(rtType);
-    if (rank != contigPerThread.size())
       return resultVals;
-
     SmallVector<int64_t> constancy = axisInfo->getConstancy();
-    if (rank != constancy.size())
-      return resultVals;
-    bool hasConstancy = false;
-    for (int i = 0; i < rank; ++i) {
-      if (constancy[i] > contigPerThread[i]) {
-        if (constancy[i] % contigPerThread[i] != 0)
-          // constancy is not evenly covered by contigPerThread
-          return resultVals;
-        // can't move the values across different
-        // "contigPerThread"-sized blocks
-        constancy[i] = contigPerThread[i];
-      }
-      if (elemsPerThread[i] < 1 || constancy[i] < 1)
-        return resultVals;
-      if (!(elemsPerThread[i] % constancy[i] == 0 ||
-            constancy[i] % elemsPerThread[i] == 0))
-        // either the constancy along each dimension must fit
-        // into the elemsPerThread or the other way around
-        return resultVals;
-      if (constancy[i] > 1)
-        hasConstancy = true;
-    }
-    if (!hasConstancy)
-      // nothing to deduplicate
-      return resultVals;

-    if (rank > 1) {
-      // reorder the shape and constancy vectors by the axis order:
-      // from the fastest-changing to the smallest-changing axis
-      SmallVector<unsigned> order = getOrder(rtType);
-      if (rank != order.size())
-        return resultVals;
-      elemsPerThread = applyPermutation(elemsPerThread, order);
-      constancy = applyPermutation(constancy, order);
-    }
+    if (llvm::all_of(constancy, [](int64_t c) { return c == 1; }))
+      return resultVals;

-    SmallVector<unsigned> strides(rank, 1);
-    for (int i = 1; i < rank; ++i) {
-      strides[i] = strides[i - 1] * elemsPerThread[i - 1];
-    }
-    SmallVector<Value> dedupResultVals;
-    dedupResultVals.reserve(resultVals.size());
-    for (int i = 0; i < resultVals.size(); ++i) {
-      // each coordinate of the orig_idx is "coarsened" using the
-      // constancy along this dimension: the resulting dedup_idx
-      // points to the reused value in the original resultsVal
-      int orig_idx = i;
-      int dedup_idx = 0;
-      for (int j = 0; j < rank; ++j) {
-        int coord_j = orig_idx % elemsPerThread[j];
-        dedup_idx += (coord_j / constancy[j] * constancy[j]) * strides[j];
-        orig_idx /= elemsPerThread[j];
+    // We zero out the bases that are constant
+    auto kReg = StringAttr::get(ctx, "register");
+    auto ll = toLinearLayout(rtType);
+    auto dims = to_vector(ll.getOutDimNames());
+    auto llReg = ll.sublayout({kReg}, dims);
+    auto inv = ll.pseudoinvert();
+    auto invReg = inv.sublayout(dims, {kReg});
+    auto bases_inv = invReg.getBases();
+    for (auto [c, d] : llvm::zip(constancy, dims)) {
+      assert(llvm::isPowerOf2_32(c));
+      for (int i = 0; i < llvm::Log2_32(c); i++) {
+        bases_inv[d][i] = {0};
       }
-      dedupResultVals.push_back(resultVals[dedup_idx]);
     }
-
-    return dedupResultVals;
+    auto invBroadcast =
+        LinearLayout(bases_inv, invReg.getOutDims(), /*isSurjective=*/false);
+    auto cvt = llReg.compose(invBroadcast);
+
+    // Deduplicate the result values
+    SmallVector<Value> outVals(resultVals.size());
+    for (int i = 0; i < outVals.size(); i++) {
+      auto srcIdx = cvt.apply({{kReg, i}}).begin()->second;
+      outVals[i] = resultVals[srcIdx];
+    }
+    return outVals;
   }
   LogicalResult
   matchAndRewrite(SourceOp op, OpAdaptor adaptor,
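
The rewrite above derives the reuse mapping algebraically: the register-to-tensor layout is composed with a pseudo-inverse whose low bases along constant dimensions are zeroed, so each register index lands on the first register holding the same value. Since constancy is a power of two, this amounts to clearing the low log2(c) bits of each coordinate. For intuition, a standalone sketch of the equivalent index map, written the way the removed loop computed it; dedupMap and its parameters are illustrative names, not part of this commit:

#include <cstddef>
#include <vector>

// Hypothetical illustration: map each flat per-thread element index to the
// index of its canonical copy by "coarsening" every coordinate down to the
// start of its constancy block, exactly as the deleted loop did.
std::vector<size_t> dedupMap(const std::vector<size_t> &elemsPerThread,
                             const std::vector<size_t> &constancy) {
  size_t total = 1;
  for (size_t e : elemsPerThread)
    total *= e;
  std::vector<size_t> map(total);
  for (size_t i = 0; i < total; ++i) {
    size_t origIdx = i, dedupIdx = 0, stride = 1;
    for (size_t j = 0; j < elemsPerThread.size(); ++j) {
      size_t coord = origIdx % elemsPerThread[j];
      dedupIdx += (coord / constancy[j]) * constancy[j] * stride;
      origIdx /= elemsPerThread[j];
      stride *= elemsPerThread[j];
    }
    map[i] = dedupIdx; // resultVals[map[i]] can stand in for resultVals[i]
  }
  return map;
}
// Example: elemsPerThread = {4}, constancy = {2} yields {0, 0, 2, 2}.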

include/triton/Dialect/Triton/IR/TritonAttrDefs.td

Lines changed: 3 additions & 1 deletion
@@ -129,7 +129,9 @@ def TT_InputPrecisionAttr : I32EnumAttr<
   [
     I32EnumAttrCase<"TF32", 0, "tf32">,
     I32EnumAttrCase<"TF32x3", 1, "tf32x3">,
-    I32EnumAttrCase<"IEEE", 2, "ieee">
+    I32EnumAttrCase<"IEEE", 2, "ieee">,
+    I32EnumAttrCase<"BF16x3", 3, "bf16x3">,
+    I32EnumAttrCase<"BF16x6", 4, "bf16x6">
   ]>{
   let cppNamespace = "::mlir::triton";
 }

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 3 additions & 1 deletion
@@ -664,9 +664,11 @@ def TT_DotOp : TT_Op<"dot", [Pure,

   let description = [{
     $d = matrix_multiply($a, $b) + $c. $inputPrecision describes how to exercise the TC
-    when the inputs are f32. It can be one of: tf32, tf32x3, ieee.
+    when the inputs are f32. It can be one of: tf32, tf32x3, ieee, bf16x3, bf16x6.
     tf32: use TC with tf32 ops.
     tf32x3: implement the 3xTF32 trick. For more info see the pass in F32DotTC.cpp
+    bf16x3: implement the 3xBF16 trick. For more info see the pass in F32DotTC.cpp
+    bf16x6: implement the 6xBF16 trick. For more info see the pass in F32DotTC.cpp
     ieee: don't use TC, implement dot in software.
     If the GPU does not have Tensor cores or the inputs are not f32, this flag is ignored.
   }];
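
The bf16x3/bf16x6 tricks come from the paper cited in the F32DotTC pass description (https://arxiv.org/abs/1904.06376): an f32 value is split into bf16 parts whose sum reconstructs it, and a few bf16 tensor-core products recover near-f32 accuracy. A scalar sketch of one standard 3-way-split scheme follows; the helper names are illustrative, truncation stands in for hardware rounding, and the exact terms F32DotTC.cpp emits may differ:

#include <cstdint>
#include <cstring>

// Truncate an f32 toward zero to bf16 precision by dropping the low 16 bits
// (bf16 keeps the f32 exponent and the top 7 mantissa bits; real hardware
// rounds to nearest-even, truncation keeps the sketch short).
static float toBF16(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

// Split x into three bf16-representable parts with x ≈ hi + mid + lo.
struct Bf16Parts { float hi, mid, lo; };
static Bf16Parts split(float x) {
  float hi = toBF16(x);
  float mid = toBF16(x - hi);
  float lo = toBF16(x - hi - mid);
  return {hi, mid, lo};
}

// bf16x6: six bf16 products per f32 multiply, accumulated small-to-large.
// bf16x3 keeps only the three leading terms (hi*hi, hi*mid, mid*hi).
static float mulBf16x6(float a, float b) {
  Bf16Parts s = split(a), t = split(b);
  return ((s.lo * t.hi + s.hi * t.lo) + s.mid * t.mid) +
         (s.mid * t.hi + s.hi * t.mid) + s.hi * t.hi;
}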

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 16 additions & 15 deletions
@@ -117,6 +117,19 @@ chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
                      int32_t elemBitWidth, unsigned instBitWidth,
                      unsigned numLanesInShuffleGroup);

+LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
+                                           int numWarps);
+
+std::optional<LinearLayout>
+getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
+                             int numWarps);
+
+// Return a layout valid for TMemLoad op for a tmem layout of block MxN that
+// distributes the data along M for the warp groups. This doesn't affect the
+// TMem layout; it just returns a distributed layout compatible for tmem_load.
+LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
+                                         int numWarps);
+
 // Create LinearLayout for scale in scaled mfma.
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<int64_t> dotOperandShape,
@@ -129,12 +142,10 @@ LinearLayout chooseScaledWmmaScaleLayout(
     const std::vector<std::vector<int32_t>> &dotOperandWarpBasis,
     ArrayRef<int64_t> dotOperandShape);

-LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx, int dotOperandIdx,
-                                          ArrayRef<int64_t> dotOperandShape,
-                                          ArrayRef<unsigned> tilesPerWarp,
+LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx,
+                                          ArrayRef<int64_t> shape, int opIdx,
                                           ArrayRef<unsigned> warpsPerCTA,
-                                          unsigned instrM, unsigned instrN,
-                                          CTALayoutAttr ctaLayoutAttr);
+                                          CTALayoutAttr ctaLayout);

 // Create LinearLayout for nvidia mma tile.
 LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef<unsigned> tileShape,
@@ -151,15 +162,5 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle);

-// Make a LinearLayout that maps a block-id to an N-dimensional index.
-//
-// The tensor is split up into CTAsPerCGA pieces, which are distributed among
-// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
-//
-// See the nomenclature note at the top of the LinearLayoutConversions.cpp file
-// for an explanation of why this is called makeCgaLayout when it accepts a
-// CTALayoutAttr.
-LinearLayout makeCgaLayout(CTALayoutAttr layout);
-
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 12 additions & 5 deletions
@@ -177,15 +177,22 @@ def TritonGPUPartitionScheduling : Pass<"tritongpu-partition-scheduling", "mlir:
 }

 def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
-  let summary = "3xTF32 trick";
+  let summary = "Emulate dot-product tensor core precision using TF32s or BF16s";

   let description = [{
-    Decompose fp32 `DotOp` instructions into 4 pointwise ops and 3 fp16 `DotOp`s
-    to allow using TensorCores. See https://github.com/NVIDIA/cutlass/discussions/385
+    Generic pass to emulate/decompose f32 `DotOp` instructions.
+    * Decompose fp32 `DotOp` instructions into 4 pointwise ops and 3 fp16 `DotOp`s
+      to allow using TensorCores. See https://github.com/NVIDIA/cutlass/discussions/385.
+    * Decompose fp32 `DotOp` instructions into BF16 operations.
+      See https://arxiv.org/abs/1904.06376
   }];

-  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
-                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"];
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
+  let options = [
+    Option<"emuTF32", "emu-tf32",
+           "bool", /*default*/"false",
+           "whether to handle InputPrecision TF32xN for Nvidia GPUs">
+  ];
 }

 def TritonGPUPrefetch : Pass<"tritongpu-prefetch", "mlir::ModuleOp"> {
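
For the TF32xN path the decomposition is analogous to the BF16 one: the 3xTF32 trick referenced by the cutlass discussion splits each f32 operand into a tf32-representable "big" part plus a residual "small" part (the 4 pointwise ops) and accumulates three tf32 products (the 3 DotOps). A hedged scalar sketch, assuming truncation rather than hardware rounding; the names are illustrative:

#include <cstdint>
#include <cstring>

// Zero the low 13 mantissa bits of an f32, leaving the 10 explicit mantissa
// bits a tf32 tensor-core input keeps (truncation instead of rounding for
// brevity).
static float toTF32(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= ~((1u << 13) - 1);
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

// 3xTF32: one f32 product becomes three tf32 products plus the pointwise
// splits, accumulated from the smallest terms up. The dropped lo*lo term
// sits below f32 round-off for normalized inputs.
static float mul3xTF32(float a, float b) {
  float aHi = toTF32(a), aLo = toTF32(a - aHi);
  float bHi = toTF32(b), bLo = toTF32(b - bHi);
  return (aLo * bHi + aHi * bLo) + aHi * bHi;
}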

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 7 additions & 52 deletions
@@ -29,7 +29,6 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
-#include "llvm/Support/ErrorHandling.h"

 // TritonNvidiaGPU depends on Triton
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -62,68 +61,24 @@ struct TMemAllocation {
   int numCols;
 };

-// Used to describe the layout of the TMEM load/store instructions
-enum class TMemAccessAtom { I32x32b, I16x64b, I16x128b, I16x256b, I16x32bx2 };
-
-inline int getElementsPerThread(TMemAccessAtom atom) {
-  switch (atom) {
-  case TMemAccessAtom::I32x32b:
-  case TMemAccessAtom::I16x64b:
-  case TMemAccessAtom::I16x32bx2:
-    return 1;
-  case TMemAccessAtom::I16x128b:
-    return 2;
-  case TMemAccessAtom::I16x256b:
-    return 4;
-  }
-  llvm_unreachable("Unknown TMemAccessAtom");
-}
-
-inline const char *getOpShape(TMemAccessAtom atom) {
-  switch (atom) {
-  case TMemAccessAtom::I32x32b:
-    return "32x32b";
-  case TMemAccessAtom::I16x64b:
-    return "16x64b";
-  case TMemAccessAtom::I16x128b:
-    return "16x128b";
-  case TMemAccessAtom::I16x256b:
-    return "16x256b";
-  case TMemAccessAtom::I16x32bx2:
-    return "16x32bx2";
-  }
-  llvm_unreachable("Unknown TMemAccessAtom");
-}
-
-LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
-                           bool unpacked);
-
 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);

-SmallVector<gpu::DistributedEncodingTrait>
-getTmemCompatibleLayouts(gpu::MemDescType memType, unsigned numWarps,
-                         ArrayRef<int64_t> ctaSplit = {1, 1});
-
-std::optional<gpu::DistributedEncodingTrait>
+gpu::DistributedEncodingTrait getTmemCompatibleLayout(unsigned M, unsigned N,
+                                                      RankedTensorType oldType,
+                                                      unsigned numWarps);
+gpu::DistributedEncodingTrait
 getTmemLoadLayoutSplitLongM(RankedTensorType tensorType,
                             gpu::MemDescType memType, int numWarps);
-
 SmallVector<gpu::DistributedEncodingTrait>
 getTmemCompatibleLayouts(Operation *op, RankedTensorType tensorType,
                          gpu::MemDescType memType);

 bool isDistributedLayoutTMemCompatible(Operation *op,
                                        RankedTensorType tensorType,
                                        gpu::MemDescType memType);
-
-gpu::DistributedEncodingTrait
-getDefaultLayoutForTmemLdSt(gpu::MemDescType memType, unsigned numWarps,
-                            gpu::CTALayoutAttr ctaLayout);
-
-std::optional<LinearLayout>
-getDistributedLayoutForTmemLdSt(gpu::MemDescType memType, TMemAccessAtom atom,
-                                unsigned numWarps,
-                                gpu::CTALayoutAttr ctaLayout);
+bool isDistributedLayoutSplitMTmemLoadStore(RankedTensorType tensorType,
+                                            gpu::MemDescType memType,
+                                            int numWarps);

 } // namespace mlir::triton::nvidia_gpu
include/triton/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.h

Lines changed: 0 additions & 37 deletions
This file was deleted.

include/triton/Tools/LinearLayout.h

Lines changed: 0 additions & 12 deletions
@@ -558,18 +558,6 @@ class LinearLayout {
     return reshapeOuts({{*getOutDimNames().begin(), getTotalOutDimSize()}});
   }

-  [[nodiscard]] LinearLayout renameInDim(StringAttr oldDim,
-                                         StringAttr newDim) const {
-    auto bases = getBases();
-    auto it = bases.find(oldDim);
-    assert(it != bases.end());
-    auto value = std::move(it->second);
-    bases.erase(it);
-    bases.insert({newDim, std::move(value)});
-    return LinearLayout(bases, getOutDims(),
-                        /*requireSurjective=*/isSurjective());
-  }
-
   // Concatenates two layouts by their in (resp. out) dimensions. The layouts
   // must have the same output (resp. input) dimensions and sizes and different
   // input (resp. output) dimensions. The input dimensions of this layout are
