
Commit c0987c7

Reland upstream commit 9e90089 (#2617)

Closes #2527. Please do not squash and merge this PR.

2 parents: 16b2057 + 19d3ed5

14 files changed: +185 -116 lines

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 0 additions & 7 deletions
@@ -246,13 +246,6 @@ chooseStMatrixLayout(MLIRContext *ctx, RankedTensorType tensorTy,
                      ArrayRef<unsigned> repShape,
                      ArrayRef<unsigned> paddedRepShape,
                      ArrayRef<unsigned> order, int swizzleByteSize);
-
-// FIXME
-// Exposing to use it in LinearLayoutConversionsTest.cpp
-// Remove it once we fully activate the DotOperand conversion via LLs
-class DotOperandEncodingAttr;
-LinearLayout ampereDotToLinearLayout(ArrayRef<int64_t> shape,
-                                     DotOperandEncodingAttr dot);
 } // namespace mlir::triton::gpu

 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H

lib/Analysis/Allocation.cpp

Lines changed: 6 additions & 1 deletion
@@ -115,7 +115,12 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,

   assert(!isMfmaToDotShortcut(srcTy, dstTy));

-  auto [inOrd, outOrd] = getCvtOrder(srcLayout, dstLayout);
+  // FIXME This is NOT entirely correct
+  // This should be getElemOrder, but we don't have such a method
+  // TODO Implement getElemOrder and make sure it's consistent with
+  // getContigPerThread
+  auto inOrd = gpu::getThreadOrder(srcLayout);
+  auto outOrd = gpu::getThreadOrder(dstLayout);
   scratchConfig.order = outOrd;

   unsigned srcContigPerThread =

lib/Conversion/TritonGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 10 additions & 0 deletions
@@ -90,6 +90,16 @@ void decomposeBlockedToDotLayoutConversion(ModuleOp module) {
     auto dstDotOp =
         dyn_cast<triton::gpu::DotOperandEncodingAttr>(dstType.getEncoding());
     if (srcBlocked && dstDotOp) {
+      // FIXME [Dot LL]
+      // We support this one via LLs, as the LocalLoad path is buggy
+      if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dstDotOp.getParent())) {
+        bool largeKWidth =
+            dstDotOp.getKWidth() * dstType.getElementTypeBitWidth() > 64;
+        if (mma.isAmpere() && largeKWidth) {
+          return;
+        }
+      }
+
       Attribute sharedMemorySpace =
           triton::gpu::SharedMemorySpaceAttr::get(srcType.getContext());
       auto tmpType = MemDescType::get(
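The new guard keys on how many contiguous bits each thread holds along K: kWidth elements times the element bit width. When that exceeds 64 bits, the blocked-to-dot-operand conversion is left to the linear-layout (LL) lowering instead of being decomposed here. A standalone sketch of the threshold arithmetic, with assumed example operands:

// Standalone illustration (not part of the commit) of the largeKWidth test.
#include <cstdio>

int main() {
  struct Case { const char *desc; unsigned kWidth, bitWidth; };
  Case cases[] = {
      {"bf16 operand, kWidth=8", 8, 16}, // 128 bits > 64 -> left to the LL path
      {"bf16 operand, kWidth=4", 4, 16}, // 64 bits -> still decomposed here
      {"fp8 operand,  kWidth=8", 8, 8},  // 64 bits -> still decomposed here
  };
  for (const auto &c : cases) {
    bool largeKWidth = c.kWidth * c.bitWidth > 64;
    std::printf("%s: largeKWidth=%s\n", c.desc, largeKWidth ? "true" : "false");
  }
  return 0;
}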

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 40 additions & 37 deletions
@@ -11,6 +11,7 @@
 #include "mlir/Support/LLVM.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
+#include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
@@ -237,13 +238,36 @@ static SmallVector<unsigned> eraseOrder(ArrayRef<unsigned> order,
   return resOrder;
 }

+SmallVector<unsigned> getOrderForDotOperand(unsigned opIdx, unsigned rank,
+                                            bool kMajor) {
+  // kMajor: if true, the matrix is fastest-running on k,
+  //         otherwise it is on m (resp. n)
+  // opIdx=0: [batch, m, k] if rank == 3 else [m, k]
+  // opIdx=1: [batch, k, n] if rank == 3 else [k, n]
+  // batch (if rank == 3) is always the slowest running dimension
+  assert(rank == 2 || rank == 3);
+  assert(opIdx == 0 || opIdx == 1);
+  SmallVector<unsigned> order(rank);
+  std::iota(order.rbegin(), order.rend(), 0);
+  // If opIdx is 1 and kMajor is true, the order is [0, 1]
+  // (resp. [1, 2, 0] if rank == 3)
+  // Same if opIdx is 0 and kMajor is false
+  if (bool(opIdx) == kMajor) {
+    std::swap(order[0], order[1]);
+  }
+  return order;
+}
+
 SmallVector<unsigned> getWarpOrder(Attribute layout) {
   if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
     if (isa<AMDMfmaEncodingAttr>(dotLayout.getParent())) {
       return getWarpOrder(dotLayout.getParent());
     }
   }
   auto order = getOrder(layout);
+  // FIXME: This mmaLayout if should just return
+  // getOrderForDotOperand(0, order.size(), kMajor=false)
+  // as mma has the same order as DotOperand(opIdx=0)
   if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(layout)) {
     if (mmaLayout.isHopper()) {
       // Hopper MMA instructions force a warp order of [0, 1]. See docs:
@@ -253,40 +277,8 @@ SmallVector<unsigned> getWarpOrder(Attribute layout) {
       order.insert(order.begin(), 0);
     }
   } else if (auto dotOpLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
-    // opIdx=0: [/*dim0*/batch, /*dim1=*/m, /*dim2=*/k] -> order=[1, 2, 0]
-    // opIdx=1: [/*dim0*/batch, /*dim1=*/k, /*dim2=*/n] -> order=[2, 1, 0]
-    std::iota(order.rbegin(), order.rend(), 0);
-    if (dotOpLayout.getOpIdx() == 0) {
-      std::swap(order[0], order[1]);
-    }
-  }
-  return order;
-}
-
-SmallVector<unsigned> getOrderForDotOperand(unsigned opIdx, unsigned rank) {
-  assert((rank == 2 || rank == 3) &&
-         "Invalid rank for dot operand order computation");
-  SmallVector<unsigned> order(rank);
-  // The 'order' field typically represents a descending sorted array of
-  // dimensions based on contiguity. For instance, in axisInfo utilities that
-  // retrieve tensor contiguity, it's assumed that the dimension with the
-  // highest contiguity corresponds to order[0].
-  //
-  // The relation between contiguity and order is only relevant if the layout
-  // interfaces with HBM, as is the case when we load tensor from HBM to
-  // registers in the dot layout to bypass LDS. When bypassing LDS, we make
-  // the following assumptions about tensor layouts:
-  // - Tensor A (opIdx == 0) is considered to be row-major.
-  // - Tensor B (opIdx == 1) is considered to be column-major.
-  //
-  // Based on these assumptions, we define the following orders:
-  // - For opIdx == 0, batch=dim0, m=dim1, and k=dim2, we assume an order of [2,
-  //   1, 0] for 3D tensors.
-  // - For opIdx == 1, batch=dim0, k=dim1, and n=dim2, we assume an order of [1,
-  //   2, 0] for 3D tensors.
-  std::iota(order.rbegin(), order.rend(), 0);
-  if (opIdx == 1) {
-    std::swap(order[0], order[1]);
+    order = getOrderForDotOperand(dotOpLayout.getOpIdx(), order.size(),
+                                  /*kMajor*/ false);
   }
   return order;
 }
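The new getOrderForDotOperand takes a kMajor flag instead of hard-coding one convention per operand. A minimal standalone sketch of the same iota/swap logic, printing the orders it produces for rank-3 operands (illustration only, not Triton code):

// Standalone illustration of getOrderForDotOperand(opIdx, rank, kMajor).
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <numeric>
#include <vector>

static std::vector<unsigned> orderForDotOperand(unsigned opIdx, unsigned rank,
                                                bool kMajor) {
  assert((rank == 2 || rank == 3) && (opIdx == 0 || opIdx == 1));
  std::vector<unsigned> order(rank);
  std::iota(order.rbegin(), order.rend(), 0); // [1, 0] or [2, 1, 0]
  if (bool(opIdx) == kMajor)                  // swap the two fastest dims
    std::swap(order[0], order[1]);
  return order;
}

int main() {
  // opIdx=0 is [batch, m, k]; opIdx=1 is [batch, k, n]; batch stays slowest.
  // kMajor=true means k is the fastest-running dimension.
  for (unsigned opIdx : {0u, 1u})
    for (bool kMajor : {true, false}) {
      auto o = orderForDotOperand(opIdx, 3, kMajor);
      std::printf("opIdx=%u kMajor=%d -> [%u, %u, %u]\n", opIdx, (int)kMajor,
                  o[0], o[1], o[2]);
    }
  return 0;
}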
@@ -303,7 +295,7 @@ SmallVector<unsigned> getOrder(Attribute layout) {
     return order;
   }
   if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
-    auto rank = getWarpsPerCTA(dotLayout.getParent()).size();
+    auto rank = dotLayout.getWarpsPerCTA().size();
     // FIXME: delete if branch for `DpasEncodingAttr` and provide more
     // general solution to make `getOrderForDotOperand` function compatible
     // with Intel layouts.
@@ -314,7 +306,7 @@ SmallVector<unsigned> getOrder(Attribute layout) {
       std::iota(order.rbegin(), order.rend(), 0);
       return order;
     }
-    return getOrderForDotOperand(dotLayout.getOpIdx(), rank);
+    return getOrderForDotOperand(dotLayout.getOpIdx(), rank, /*kMajor*/ true);
   }
   if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
     SmallVector<unsigned> parentOrder = getOrder(sliceLayout.getParent());
@@ -1069,7 +1061,17 @@ SmallVector<unsigned> DotOperandEncodingAttr::getWarpOrder() const {
   return ::getWarpOrder(*this);
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getThreadOrder() const {
-  return ::getOrder(*this);
+  // FIXME: delete if branch for `DpasEncodingAttr` and provide more
+  // general solution to make `getOrderForDotOperand` function compatible
+  // with Intel layouts.
+  // More details:
+  // https://github.com/intel/intel-xpu-backend-for-triton/pull/2517
+  if (mlir::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
+    return ::getOrder(*this);
+  } else {
+    return getOrderForDotOperand(getOpIdx(), getWarpsPerCTA().size(),
+                                 /*kMajor*/ true);
+  }
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getShapePerCTATile(
     ArrayRef<int64_t> tensorShape) const {
@@ -2055,6 +2057,7 @@ SmallVector<int64_t> NvidiaMmaEncodingAttr::getMMAv2RepForOperand(
     ArrayRef<int64_t> shape, int bitwidth, int kWidth, int opIdx) const {
   auto rank = shape.size();
   auto warpsPerCTA = getWarpsPerCTA();
+
   SmallVector<int> shapePerWarp = {1, 16, 8, 4 * 64 / bitwidth};
   int numRepBatch =
       rank == 3

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 9 additions & 9 deletions
@@ -906,17 +906,17 @@ std::optional<LinearLayout>
 DotOperandEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   if (auto mfmaLayout = llvm::dyn_cast<AMDMfmaEncodingAttr>(getParent())) {
     return dotOperandMfmaToLinearLayout(*this, shape);
-  }
-  if (auto dpasLayout = llvm::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
+  } else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(getParent())) {
+    // FIXME [Dot LL]
+    // Do this unconditionally
+    auto largeKWidth = getKWidth() == 8;
+    if (mma.isAmpere() && largeKWidth) {
+      return ampereDotToLinearLayout(shape, *this);
+    }
+  } else if (auto dpasLayout =
+                 llvm::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
     return dotOperandDpasToLinearLayout(*this, shape);
   }
-
-  // TODO Activate in a follow-up PR
-  // else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(getParent())) {
-  //   if (mma.isAmpere()) {
-  //     return ampereDotToLinearLayout(shape, *this);
-  //   }
-  //}
   return std::nullopt;
 }
922922

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 18 additions & 13 deletions
@@ -17,8 +17,9 @@ LogicalResult UpcastMXFPOp::verify() {
   auto xTy = getSrc().getType();
   auto scaleTy = getScale().getType();

-  if (xTy.getElementType() != FloatType::getBF16(getContext())) {
-    return emitOpError("element type of the first operand must be bf16");
+  if (xTy.getElementType() != FloatType::getBF16(getContext()) &&
+      xTy.getElementType() != IntegerType::get(getContext(), 8)) {
+    return emitOpError("element type of the first operand must be bf16 or i8");
   }

   if (scaleTy.getElementType() != IntegerType::get(getContext(), 8)) {
@@ -72,7 +73,7 @@ LogicalResult UpcastMXFPOp::verify() {
 }

 LogicalResult UpcastMXFPOp::inferReturnTypes(
-    MLIRContext *context, std::optional<Location> location, ValueRange operands,
+    MLIRContext *ctx, std::optional<Location> loc, ValueRange operands,
     DictionaryAttr attributes, OpaqueProperties opaqueProperties,
     RegionRange regions, SmallVectorImpl<Type> &inferredReturnTypes) {
   auto xTy = cast<RankedTensorType>(operands[0].getType());
@@ -82,21 +83,25 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(

   auto encoding = xTy.getEncoding();
   if (!encoding) {
-    return emitOptionalError(location, "expected an encoding");
+    return emitOptionalError(loc, "expected an encoding");
   }
   if (!mlir::isa<DotOperandEncodingAttr>(encoding)) {
-    return emitOptionalError(location, "expected an mma layout encoding");
-  }
-  if (xShape.size() < 2) {
-    return emitOptionalError(location, "tensor rank must be at least 2");
+    return emitOptionalError(loc, "expected a dotOperand encoding");
   }

-  // For now we just return the input encoding. For fp4 we'll need to cast from
-  // tf32 to fp16 encoding and multiply the shape by two
-  assert((typeEncoded == F8F6F4Type::E4M3 || typeEncoded == F8F6F4Type::E5M2) &&
-         "NYI: only fp8e4m3 and fp8e5m2 are supported");
+  if (typeEncoded == F8F6F4Type::E2M1) {
+    auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
+    auto newVEncoding = DotOperandEncodingAttr::get(
+        ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
+        oldEncoding.getKWidth() * 2);
+    auto newShape = SmallVector<int64_t>(xShape);
+    newShape.back() *= 2;
+    inferredReturnTypes.push_back(
+        RankedTensorType::get(newShape, FloatType::getBF16(ctx), newVEncoding));
+  } else {
+    inferredReturnTypes.push_back(xTy);
+  }

-  inferredReturnTypes.push_back(xTy);
   return success();
 }

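For packed fp4 (E2M1) inputs, inferReturnTypes now doubles both the dot-operand kWidth and the innermost dimension, since each i8 element packs two fp4 values, and switches the result element type to bf16. A standalone sketch of that bookkeeping, with an assumed example shape:

// Standalone illustration (not the MLIR code) of the E2M1 result-type rule.
#include <cstdio>
#include <vector>

int main() {
  // Assumed example: an i8 tensor holding two fp4 values per byte.
  std::vector<long> xShape = {128, 32}; // [M, K/2] packed
  unsigned kWidth = 4;                  // dot-operand kWidth of the input

  std::vector<long> newShape = xShape;
  newShape.back() *= 2;                 // unpacking doubles the K dimension
  unsigned newKWidth = kWidth * 2;      // each thread owns twice as many elements

  std::printf("result: bf16 tensor [%ld, %ld], kWidth=%u\n",
              newShape[0], newShape[1], newKWidth);
  return 0;
}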

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 38 additions & 24 deletions
@@ -406,7 +406,7 @@ class ScaledBlockedToMMAv2
     auto ctx = dotOp.getContext();

     // Check that rhs scale is null
-    assert(dotOp.getRhsScale() == nullptr && "rhs scale must be null");
+    assert(dotOp.getRhsScale() == nullptr && "rhs scale NYI");

     // operands
     auto a = dotOp.getLhs();
@@ -426,10 +426,11 @@ class ScaledBlockedToMMAv2
       }
     };

-    assert(aType == F8F6F4Type::E4M3 ||
-           aType == F8F6F4Type::E5M2 && "lhs just supports fp8");
+    assert((aType == F8F6F4Type::E4M3 || aType == F8F6F4Type::E5M2 ||
+            aType == F8F6F4Type::E2M1) &&
+           "NYI: lhs supports fp4 or fp8");
     assert(bType == F8F6F4Type::E4M3 ||
-           bType == F8F6F4Type::E5M2 && "rhs just supports fp8");
+           bType == F8F6F4Type::E5M2 && "NYI: rhs supports fp8");

     // TODO run accelerate matmul on A and B first to choose their layouts
     // Set return type
@@ -440,6 +441,7 @@ class ScaledBlockedToMMAv2
     auto instrShape = mmaVersionToInstrShape(versionMajor, retShapePerCTA,
                                              rewriter.getBF16Type(), numWarps);
     auto CTALayout = getCTALayout(oldRetType.getEncoding());
+    // TODO Use warpsPerTileV2
     SmallVector<unsigned> warpsPerCTA = {numWarps, 1};
     auto mmaEnc = NvidiaMmaEncodingAttr::get(ctx, /*versionMajor=*/versionMajor,
                                              /*versionMinor=*/0, warpsPerCTA,
@@ -452,27 +454,39 @@ class ScaledBlockedToMMAv2
     auto newAcc =
         rewriter.create<ConvertLayoutOp>(oldAcc.getLoc(), newRetType, oldAcc);

-    auto toMMABf16 = [&newRetType, &rewriter, &ctx,
-                      &enumToType](TypedValue<RankedTensorType> v, int idx,
-                                   F8F6F4Type type) {
-      // MMAv2 Layout
+    auto toMMABf16 = [&newRetType, &rewriter, &ctx, &enumToType](
+                         TypedValue<RankedTensorType> v, int idx,
+                         F8F6F4Type type) -> TypedValue<RankedTensorType> {
       auto vType = v.getType();
-      auto newVEncoding = DotOperandEncodingAttr::get(
-          ctx, idx, newRetType.getEncoding(), enumToType((type)));
-      auto newVType = RankedTensorType::get(
-          v.getType().getShape(), v.getType().getElementType(), newVEncoding);
-      v = rewriter.create<ConvertLayoutOp>(v.getLoc(), newVType, v);
-
-      // Bitcast
-      auto vTypeFp8 = RankedTensorType::get(
-          vType.getShape(), rewriter.getFloat8E4M3FNType(), newVEncoding);
-      v = cast<TypedValue<RankedTensorType>>(
-          rewriter.create<BitcastOp>(v.getLoc(), vTypeFp8, v).getResult());
-
-      // Convert to bf16
-      auto vTypeBf16 = RankedTensorType::get(
-          vType.getShape(), rewriter.getBF16Type(), newVEncoding);
-      return rewriter.create<FpToFpOp>(v.getLoc(), vTypeBf16, v);
+      if (type == F8F6F4Type::E2M1) {
+        // A bit too dynamically typed...
+        // perhaps return ints in both cases?
+
+        auto retEnc = dyn_cast<NvidiaMmaEncodingAttr>(newRetType.getEncoding());
+        auto newVEncoding = DotOperandEncodingAttr::get(
+            ctx, idx, newRetType.getEncoding(), /*kWidth=*/4);
+        auto newVType = RankedTensorType::get(
+            vType.getShape(), vType.getElementType(), newVEncoding);
+        return rewriter.create<ConvertLayoutOp>(v.getLoc(), newVType, v);
+      } else {
+        assert(type == F8F6F4Type::E5M2 || type == F8F6F4Type::E4M3);
+        auto newVEncoding = DotOperandEncodingAttr::get(
+            ctx, idx, newRetType.getEncoding(), /*kWidth=*/8);
+        auto newVType = RankedTensorType::get(
+            vType.getShape(), vType.getElementType(), newVEncoding);
+        v = rewriter.create<ConvertLayoutOp>(v.getLoc(), newVType, v);
+
+        // Bitcast
+        auto vTypeFp8 = RankedTensorType::get(vType.getShape(),
+                                              enumToType(type), newVEncoding);
+        v = cast<TypedValue<RankedTensorType>>(
+            rewriter.create<BitcastOp>(v.getLoc(), vTypeFp8, v).getResult());
+
+        // Convert to bf16
+        auto vTypeBf16 = RankedTensorType::get(
+            vType.getShape(), rewriter.getBF16Type(), newVEncoding);
+        return rewriter.create<FpToFpOp>(v.getLoc(), vTypeBf16, v);
+      }
     };
     a = toMMABf16(a, 0, aType);
     b = toMMABf16(b, 1, bType);
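The reworked toMMABf16 lambda now branches on the operand type: fp4 (E2M1) operands only receive a layout conversion to a kWidth=4 dot-operand encoding and keep their packed element type (UpcastMXFPOp unpacks them later), while fp8 operands use kWidth=8 and additionally go through a bitcast to the fp8 type and an FpToFp upcast to bf16. A rough standalone outline of the two paths (plain C++, not the MLIR rewriter API):

// Plain-C++ outline of the two operand-preparation paths implemented above.
#include <cstdio>

enum class F8F6F4Type { E2M1, E4M3, E5M2 };

void prepareOperand(F8F6F4Type type) {
  if (type == F8F6F4Type::E2M1) {
    // fp4: element type stays the packed container; only the encoding changes.
    std::printf("convert_layout -> dot operand, kWidth=4\n");
  } else {
    // fp8: re-encode, reinterpret the raw bytes as the fp8 type, upcast to bf16.
    std::printf("convert_layout -> dot operand, kWidth=8\n");
    std::printf("bitcast -> fp8 (E4M3 or E5M2)\n");
    std::printf("fp_to_fp -> bf16\n");
  }
}

int main() {
  prepareOperand(F8F6F4Type::E2M1);
  prepareOperand(F8F6F4Type::E4M3);
  return 0;
}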

lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp

Lines changed: 7 additions & 0 deletions
@@ -44,6 +44,13 @@ class TritonGPUReduceDataDuplicationPass
       return;
     if (!cvtNeedsSharedMemory(srcType, dstType))
       return;
+    // FIXME [Dot LL]
+    // We support this one via LLs, as the LocalLoad path is buggy
+    bool largeKWidth =
+        dstDotOp.getKWidth() * dstType.getElementTypeBitWidth() > 64;
+    if (largeKWidth) {
+      return;
+    }
     auto srcOrder = triton::gpu::getOrder(srcEncoding);
     auto rank = srcOrder.size();
     SmallVector<unsigned> sharedOrder;

python/test/unit/language/test_core.py

Lines changed: 1 addition & 2 deletions
@@ -3475,10 +3475,9 @@ def mxfp_to_bf16_kernel(
     tl.store(mxfp_ptr + offsets, tl.ravel(mxfp), mask=offsets < N * 32)

 def dot_scale_ref(x, scale, y, type_x, type_y):
-    e_bits, m_bits = {"e4m3": (4, 3), "e5m2": (5, 2)}[type_x]
+    e_bits, m_bits = {"e2m1": (2, 1), "e4m3": (4, 3), "e5m2": (5, 2)}[type_x]
     type_fp8_y = {"e4m3": torch.float8_e4m3fn, "e5m2": torch.float8_e5m2}[type_y]

-    # Need to implement fp4 -> fp8 cast to support 1 byte types
     comp_dtype = torch.bfloat16

     x = x.contiguous()
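The new "e2m1" entry maps the fp4 format to its (exponent bits, mantissa bits) pair: 1 sign bit, 2 exponent bits with bias 1, and 1 mantissa bit, giving the value set {0, 0.5, 1, 1.5, 2, 3, 4, 6} plus negatives. A small standalone decoder sketch for reference (illustrative; not part of the test suite):

// Decode a 4-bit e2m1 (fp4) value to float, for reference.
#include <cstdio>

float decodeE2M1(unsigned nibble) {
  unsigned sign = (nibble >> 3) & 1;
  unsigned exp = (nibble >> 1) & 3;
  unsigned man = nibble & 1;
  float mag = (exp == 0) ? 0.5f * man // subnormal: 0 or 0.5
                         : (1.0f + 0.5f * man) * float(1u << (exp - 1));
  return sign ? -mag : mag;
}

int main() {
  for (unsigned v = 0; v < 8; ++v)
    std::printf("0x%x -> %g\n", v, decodeE2M1(v)); // 0, 0.5, 1, 1.5, 2, 3, 4, 6
  return 0;
}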
