Commit e4fa38e
Merge commit 'cc25374fa480c0b3e51cf218ed6fe7eb4c50a5bb'
2 parents: 40392dc + cc25374

29 files changed (+1060, -862 lines)

.gitignore
Lines changed: 3 additions & 0 deletions

@@ -11,6 +11,9 @@ python/*.whl
 python/triton/_C/*.pyd
 python/triton/_C/*.so
 python/triton/_C/*.dylib
+python/triton/_C/*.pdb
+python/triton/_C/*.exe
+python/triton/_C/*.ilk
 
 benchmarks/dist
 benchmarks/*.egg-info/

include/triton/Dialect/Triton/IR/TritonOps.td
Lines changed: 6 additions & 4 deletions

@@ -86,7 +86,8 @@ def TT_BitcastOp : TT_Op<"bitcast", [Elementwise,
   // TODO: Add verifier
 }
 
-def TT_FpToFpOp : TT_Op<"fp_to_fp", [SameOperandsAndResultShape,
+def TT_FpToFpOp : TT_Op<"fp_to_fp", [Elementwise,
+                                     SameOperandsAndResultShape,
                                      SameOperandsAndResultEncoding,
                                      Pure,
                                      /*DeclareOpInterfaceMethods<CastOpInterface>*/]> {

@@ -675,6 +676,7 @@ def TT_DotOp : TT_Op<"dot", [Pure,
 // DotScaled Op
 //
 def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
+                                          AttrSizedOperandSegments,
                                           DotLike,
                                           TypesMatchWith<"result's type matches accumulator's type",
                                                          "d", "c", "$_self">]> {

@@ -692,7 +694,7 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     RankedTensorOf<[TT_Float,I8]>:$lhs,
     RankedTensorOf<[TT_Float,I8]>:$rhs,
     TT_FloatTensor:$c,
-    RankedTensorOf<[I8]>:$lhs_scale,
+    Optional<RankedTensorOf<[I8]>>:$lhs_scale,
     Optional<RankedTensorOf<[I8]>>:$rhs_scale,
     TT_ScaleDotElemTypeAttr:$lhs_type,
     TT_ScaleDotElemTypeAttr:$rhs_type

@@ -702,8 +704,8 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
 
   // Not sure why I need to fully specify the optional group, but otherwise it complains when loading the mlir file
   let assemblyFormat = [{
-    $lhs `,` $lhs_scale `,` $rhs (`,`) : (`,` $rhs_scale^ `,`)? $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
-    `:` type($lhs) `,` type($lhs_scale) `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
+    $lhs (`scale` $lhs_scale^)? `,` $rhs (`scale` $rhs_scale^)? `,` $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
+    `:` type($lhs) (`,` type($lhs_scale)^)? `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
   }];
 }
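For illustration, a minimal sketch of the new optional-scale assembly with a scale on the lhs only; the shapes, the omitted encodings, and the e2m1/bf16 keywords are assumptions for the example rather than taken from this diff (e2m1 packs two fp4 values per i8 byte, so K is 64 here):

%d = tt.dot_scaled %a scale %a_scale, %b, %c lhs = e2m1 rhs = bf16
     : tensor<128x32xi8>, tensor<128x2xi8> * tensor<64x128xbf16> -> tensor<128x128xf32>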

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
Lines changed: 3 additions & 4 deletions

@@ -371,10 +371,9 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     auto srcTy = op.getSrc().getType();
     auto dstTy = op.getType();
 
-    // TODO (Keren): Currently, we handle general mma/blocked/slice ->
-    // mma/blocked/slice conversions.
-    // The following tasks must be completed before we can remove the layoutIsOK
-    // check:
+    // TODO (Keren): Currently, we handle general mma/blocked/slice/dot(ampere)
+    // -> mma/blocked/slice/dot(ampere) conversions. The following tasks must be
+    // completed before we can remove the layoutIsOK check:
     // 1. Support for AMD's MFMA and WMMA
     std::function<bool(Attribute)> layoutIsOK = [&](Attribute layout) {
       if (auto nvidiaMma = dyn_cast<NvidiaMmaEncodingAttr>(layout)) {

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
Lines changed: 4 additions & 1 deletion

@@ -140,8 +140,11 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
   // Do for all DotOperandEncodingAttr once we have LLs for all of them
   static bool isSupportedDotOpLayout(Attribute layout) {
     if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
+      // Use when the SharedToDotOperandMMAv2OrV3 is known to be buggy:
+      // - kWidth == 8
       if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
-        return mma.isAmpere() && dot.getKWidth() == 8;
+        bool legacyLoweringIsBuggy = dot.getKWidth() >= 8;
+        return legacyLoweringIsBuggy && mma.isAmpere();
       }
       if (isa<AMDMfmaEncodingAttr>(dot.getParent()))
         return true;

lib/Dialect/TritonGPU/IR/Ops.cpp
Lines changed: 27 additions & 17 deletions

@@ -52,14 +52,23 @@ LogicalResult UpcastMXFPOp::verify() {
         "all dimensions except the last must match between operands");
   }
 
-  auto dotEncoding =
-      dyn_cast_or_null<DotOperandEncodingAttr>(xTy.getEncoding());
+  auto layoutX = xTy.getEncoding();
+  auto layoutScale = scaleTy.getEncoding();
+  if (bool(layoutX) != bool(layoutScale)) {
+    return emitOpError(
+        "Expected either both or neither operands to have an encoding");
+  }
+  // Nothing to check if no encoding. This is used to infer the return type in
+  // AccelerateMatmul.cpp
+  if (!layoutX) {
+    return success();
+  }
+
+  auto dotEncoding = dyn_cast<DotOperandEncodingAttr>(layoutX);
   if (!dotEncoding) {
     return emitOpError("Expected a DotOperandEncodingAttr for values");
   }
-
-  auto blockedScale =
-      dyn_cast_or_null<BlockedEncodingAttr>(scaleTy.getEncoding());
+  auto blockedScale = dyn_cast<BlockedEncodingAttr>(layoutScale);
   if (!blockedScale) {
     return emitOpError("Expected a BlockOperandEncoding for scales");
   }

@@ -86,22 +95,23 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
   auto xShape = xTy.getShape();
 
   auto encoding = xTy.getEncoding();
-  if (!encoding) {
-    return emitOptionalError(loc, "expected an encoding");
-  }
-  if (!mlir::isa<DotOperandEncodingAttr>(encoding)) {
-    return emitOptionalError(loc, "expected a dotOperand encoding");
-  }
 
   if (typeEncoded == ScaleDotElemType::E2M1) {
-    auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
-    auto newVEncoding = DotOperandEncodingAttr::get(
-        ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
-        oldEncoding.getKWidth() * 2);
+    RankedTensorType retTy;
+
     auto newShape = SmallVector<int64_t>(xShape);
     newShape.back() *= 2;
-    inferredReturnTypes.push_back(
-        RankedTensorType::get(newShape, FloatType::getBF16(ctx), newVEncoding));
+    if (!encoding) {
+      retTy = RankedTensorType::get(xShape, FloatType::getBF16(ctx));
+    } else {
+      auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
+      auto newVEncoding = DotOperandEncodingAttr::get(
+          ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
+          oldEncoding.getKWidth() * 2);
+      retTy = RankedTensorType::get(newShape, FloatType::getBF16(ctx),
+                                    newVEncoding);
+    }
+    inferredReturnTypes.push_back(retTy);
   } else {
     inferredReturnTypes.push_back(xTy);
   }
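As a worked sketch of the relaxed inferReturnTypes for an E2M1 input (hypothetical shapes, placeholder encoding names): with a dot-operand encoding the result doubles the last dimension and the encoding's kWidth, while the encoding-free path used by AccelerateMatmul.cpp keeps the input shape and only switches the element type to bf16.

// Hypothetical E2M1 inputs to UpcastMXFPOp::inferReturnTypes:
//   encoded   : tensor<128x32xi8, #dot>  ->  tensor<128x64xbf16, #dot_2x>  (kWidth doubled)
//   unencoded : tensor<128x32xi8>        ->  tensor<128x32xbf16>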
