Skip to content

Commit 5bab216

Browse files
Merge commit '3bac3be56609c8f7286a244d4622ea72a2fc4402'
2 parents c74534d + 3bac3be commit 5bab216

File tree

55 files changed

+969
-665
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

55 files changed

+969
-665
lines changed

bin/RegisterTritonDialects.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
6868
mlir::triton::registerTritonGPUGlobalScratchAllocationPass();
6969
mlir::triton::registerConvertTritonGPUToLLVMPass();
7070
mlir::triton::registerConvertNVGPUToLLVMPass();
71-
mlir::triton::registerDecomposeUnsupportedNVIDIAConversions();
7271
mlir::registerLLVMDIScope();
7372
mlir::triton::gpu::intel::registerTritonAnnotateModulePass();
7473
mlir::triton::gpu::intel::registerTritonIntelGPUPasses();

include/triton/Dialect/Triton/IR/TritonAttrDefs.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,8 @@ def TT_ScaleDotElemTypeAttr : I32EnumAttr<
128128
I32EnumAttrCase<"E2M3", 2, "e2m3">,
129129
I32EnumAttrCase<"E3M2", 3, "e3m2">,
130130
I32EnumAttrCase<"E2M1", 4, "e2m1">,
131-
I32EnumAttrCase<"BF16", 5, "bf16">
132-
131+
I32EnumAttrCase<"BF16", 5, "bf16">,
132+
I32EnumAttrCase<"FP16", 6, "fp16">
133133
]>{
134134
let cppNamespace = "::mlir::triton";
135135
}

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -167,27 +167,6 @@ template <typename VecT> bool isConsecutive(const VecT &vec) {
167167
return isConsecutive(ArrayRef(vec));
168168
}
169169

170-
// LLVM's STLExtras.h provides a bunch of functions that work over ranges, but
171-
// it's missing min/max_element until
172-
// https://github.com/llvm/llvm-project/commit/fab2bb8b makes it into Triton.
173-
// TODO(jlebar): Remove this once we have the LLVM helpers.
174-
template <typename R> auto min_element(R &&Range) {
175-
return std::min_element(llvm::adl_begin(Range), llvm::adl_end(Range));
176-
}
177-
template <typename R, typename Compare>
178-
auto min_element(R &&Range, Compare &&C) {
179-
return std::min_element(llvm::adl_begin(Range), llvm::adl_end(Range),
180-
std::forward<Compare>(C));
181-
}
182-
template <typename R> auto max_element(R &&Range) {
183-
return std::max_element(llvm::adl_begin(Range), llvm::adl_end(Range));
184-
}
185-
template <typename R, typename T, typename Compare>
186-
auto max_element(R &&Range, Compare &&C) {
187-
return std::max_element(llvm::adl_begin(Range), llvm::adl_end(Range),
188-
std::forward<Compare>(C));
189-
}
190-
191170
} // namespace triton
192171
} // namespace mlir
193172

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,8 +283,8 @@ def TTG_LocalStoreOp : TTG_Op<"local_store", [DeclareOpInterfaceMethods<MemoryEf
283283
}];
284284
}
285285

286-
def TTG_UpcastMXFPOp : TTG_Op<"upcast_mxfp", [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
287-
let summary = "Convert an mxfp tensor to bf16";
286+
def TTG_UpcastMXFPOp : TTG_Op<"upcast_mxfp", [Pure]> {
287+
let summary = "Convert an mxfp tensor to bf16/fp16";
288288

289289
let hasVerifier = 1;
290290

@@ -301,6 +301,11 @@ def TTG_UpcastMXFPOp : TTG_Op<"upcast_mxfp", [Pure, DeclareOpInterfaceMethods<In
301301
let assemblyFormat = [{
302302
$src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
303303
}];
304+
305+
let extraClassDeclaration = [{
306+
static RankedTensorType deduceOutputType(
307+
TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
308+
}];
304309
}
305310

306311
// Allocate global memory

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
3333
"TRITON_HIP_USE_BLOCK_PINGPONG",
3434
"TRITON_LLVM_DEBUG_ONLY",
3535
"TRITON_ENABLE_ASAN",
36-
"TRITON_OVERRIDE_NV_CAPABILITY",
36+
"TRITON_OVERRIDE_ARCH",
3737
"USE_IR_LOC",
3838
"NVPTX_ENABLE_DUMP",
3939
"TRITON_INTEL_ADVANCED_PATH",

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -356,10 +356,6 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
356356
auto srcTy = op.getSrc().getType();
357357
auto dstTy = op.getType();
358358

359-
// TODO (Keren): Currently, we handle general mma/blocked/slice/dot(ampere)
360-
// -> mma/blocked/slice/dot(ampere) conversions. The following tasks must be
361-
// completed before we can remove the layoutIsOK check:
362-
// 1. Support for AMD's WMMA dot operand
363359
std::function<bool(Attribute)> layoutIsOK = [&](Attribute layout) {
364360
if (isa<MmaEncodingTrait>(layout)) {
365361
return !useLegacyMMAConversion;
@@ -368,15 +364,11 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
368364
if (isa<MmaEncodingTrait>(dotOperand.getParent())) {
369365
return !useLegacyMMAConversion;
370366
}
371-
return false;
372-
}
373-
if (isa<BlockedEncodingAttr, LinearEncodingAttr>(layout)) {
374-
return true;
375367
}
376368
if (auto slice = dyn_cast<SliceEncodingAttr>(layout)) {
377369
return layoutIsOK(slice.getParent());
378370
}
379-
return false;
371+
return true;
380372
};
381373
if (!layoutIsOK(srcTy.getEncoding()) || !layoutIsOK(dstTy.getEncoding())) {
382374
return failure();

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 52 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -303,13 +303,15 @@ LogicalResult UpcastMXFPOp::verify() {
303303

304304
auto xTy = getSrc().getType();
305305
auto scaleTy = getScale().getType();
306-
307-
if (xTy.getElementType() != FloatType::getBF16(getContext()) &&
308-
xTy.getElementType() != IntegerType::get(getContext(), 8)) {
309-
return emitOpError("element type of the first operand must be bf16 or i8");
306+
Builder b(getContext());
307+
if (xTy.getElementType() != b.getBF16Type() &&
308+
xTy.getElementType() != b.getF16Type() &&
309+
xTy.getElementType() != b.getI8Type()) {
310+
return emitOpError(
311+
"element type of the first operand must be bf16/fp16 or i8");
310312
}
311313

312-
if (scaleTy.getElementType() != IntegerType::get(getContext(), 8)) {
314+
if (scaleTy.getElementType() != b.getI8Type()) {
313315
return emitOpError("element type of the second operand must be uint8");
314316
}
315317

@@ -383,66 +385,55 @@ LogicalResult UpcastMXFPOp::verify() {
383385
return success();
384386
}
385387

386-
LogicalResult UpcastMXFPOp::inferReturnTypes(
387-
MLIRContext *ctx, std::optional<Location> loc, ValueRange operands,
388-
DictionaryAttr attributes, OpaqueProperties opaqueProperties,
389-
RegionRange regions, SmallVectorImpl<Type> &inferredReturnTypes) {
390-
auto xTy = cast<RankedTensorType>(operands[0].getType());
391-
auto properties = opaqueProperties.as<const Properties *>();
392-
auto typeEncoded = properties->fp_type.getValue();
393-
auto xShape = xTy.getShape();
388+
RankedTensorType
389+
UpcastMXFPOp::deduceOutputType(TypedValue<RankedTensorType> inputTensor,
390+
ScaleDotElemType inputElemType,
391+
Type outputElemType) {
392+
MLIRContext *ctx = inputTensor.getContext();
393+
auto xTy = inputTensor.getType();
394+
if (inputElemType != ScaleDotElemType::E2M1)
395+
return xTy;
394396

397+
auto xShape = xTy.getShape();
398+
auto newShape = llvm::to_vector(xShape);
395399
auto encoding = xTy.getEncoding();
396-
397-
if (typeEncoded == ScaleDotElemType::E2M1) {
398-
RankedTensorType retTy;
399-
400-
auto newShape = SmallVector<int64_t>(xShape);
401-
if (!encoding) {
402-
newShape.back() *= 2;
403-
retTy = RankedTensorType::get(xShape, FloatType::getBF16(ctx));
404-
} else {
405-
Type elemType = FloatType::getBF16(ctx);
406-
Attribute newVEncoding = nullptr;
407-
auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
408-
const int opIdx = oldEncoding.getOpIdx();
409-
const bool hasBatch = xShape.size() == 3;
410-
const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
411-
newShape[kIdx] *= 2;
412-
413-
// Note: For Intel the dot operands layout's kWidth parameter must match
414-
// the parent's DPAS layout opsPerChannel so we need to materialize a
415-
// new DPAS layout.
416-
if (auto dpasEncoding =
417-
dyn_cast<intel::DpasEncodingAttr>(oldEncoding.getParent())) {
418-
unsigned opsPerChannel =
419-
intel::DpasEncodingAttr::getOpsPerChannel(elemType);
420-
// e2m1 is packed 2 elements per int8, we must handle continuous 2
421-
// elements when upcasting to bf16
422-
if (xTy.getElementType() == IntegerType::get(ctx, 8))
423-
opsPerChannel *= 2;
424-
auto newDpasEncoding = intel::DpasEncodingAttr::get(
425-
ctx, dpasEncoding.getRepeatCount(), dpasEncoding.getSystolicDepth(),
426-
dpasEncoding.getExecutionSize(), opsPerChannel,
427-
dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
428-
product<unsigned>(dpasEncoding.getThreadsPerWarp()));
429-
newVEncoding = DotOperandEncodingAttr::get(
430-
ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
431-
} else {
432-
// Figure out the K dimension for the input A/B, given that the return
433-
// type is upcasted A/B type so we need to update the proper dim size.
434-
newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
435-
oldEncoding.getParent(),
436-
oldEncoding.getKWidth() * 2);
437-
}
438-
retTy = RankedTensorType::get(newShape, elemType, newVEncoding);
439-
}
440-
inferredReturnTypes.push_back(retTy);
400+
if (!encoding) {
401+
newShape.back() *= 2;
402+
return RankedTensorType::get(xShape, outputElemType);
403+
}
404+
405+
Attribute newVEncoding = nullptr;
406+
auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
407+
const int opIdx = oldEncoding.getOpIdx();
408+
// Note: For Intel the dot operands layout's kWidth parameter must match
409+
// the parent's DPAS layout opsPerChannel so we need to materialize a
410+
// new DPAS layout.
411+
if (auto dpasEncoding =
412+
dyn_cast<intel::DpasEncodingAttr>(oldEncoding.getParent())) {
413+
unsigned opsPerChannel =
414+
intel::DpasEncodingAttr::getOpsPerChannel(outputElemType);
415+
// e2m1 is packed 2 elements per int8, we must handle continuous 2
416+
// elements when upcasting to bf16
417+
if (xTy.getElementType() == IntegerType::get(ctx, 8))
418+
opsPerChannel *= 2;
419+
auto newDpasEncoding = intel::DpasEncodingAttr::get(
420+
ctx, dpasEncoding.getRepeatCount(), dpasEncoding.getSystolicDepth(),
421+
dpasEncoding.getExecutionSize(), opsPerChannel,
422+
dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
423+
product<unsigned>(dpasEncoding.getThreadsPerWarp()));
424+
newVEncoding = DotOperandEncodingAttr::get(
425+
ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
441426
} else {
442-
inferredReturnTypes.push_back(xTy);
427+
// Figure out the K dimension for the input A/B, given that the return
428+
// type is upcasted A/B type so we need to update the proper dim size.
429+
newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
430+
oldEncoding.getParent(),
431+
oldEncoding.getKWidth() * 2);
443432
}
444-
445-
return success();
433+
const bool hasBatch = xShape.size() == 3;
434+
const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
435+
newShape[kIdx] *= 2;
436+
return RankedTensorType::get(newShape, outputElemType, newVEncoding);
446437
}
447438

448439
OpFoldResult MemDescTransOp::fold(FoldAdaptor adaptor) {

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -573,8 +573,10 @@ class DecomposeScaledBlocked
573573
maybeWithEncoding(scale.getType(), scaleEncoding);
574574
scale = rewriter.create<ConvertLayoutOp>(scale.getLoc(),
575575
newScaleDotElemType, scale);
576-
ret = rewriter.create<triton::gpu::UpcastMXFPOp>(v.getLoc(), ret, scale,
577-
type);
576+
auto retTy = triton::gpu::UpcastMXFPOp::deduceOutputType(
577+
ret, type, Builder(v.getContext()).getBF16Type());
578+
ret = rewriter.create<triton::gpu::UpcastMXFPOp>(v.getLoc(), retTy, ret,
579+
scale, type);
578580
}
579581
return ret;
580582
}

lib/Dialect/TritonGPU/Transforms/LoopScheduling.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ CoarseSchedule scheduleKeyOps(scf::ForOp forOp,
115115
}
116116

117117
auto stages = llvm::make_second_range(opToStage);
118-
int maxStage = *std::max_element(stages.begin(), stages.end());
118+
int maxStage = *llvm::max_element(stages);
119119
CoarseSchedule schedule(maxStage + 1);
120120
SmallVector<CoarseSchedule::Cluster> clusters(maxStage + 1);
121121
for (int i = 0; i <= maxStage; i++) {

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,7 @@ DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
257257

258258
// Calculate the stage distance between applicable loads.
259259
auto vals = llvm::make_second_range(loadOpToIndLevel);
260-
int maxIndirectionLevel =
261-
vals.empty() ? 0 : *std::max_element(vals.begin(), vals.end());
260+
int maxIndirectionLevel = vals.empty() ? 0 : *llvm::max_element(vals);
262261
unsigned loadLatency = (numStages - 1) / (maxIndirectionLevel + 1);
263262

264263
for (auto [loadOp, dist] : loadOpToIndLevel) {

0 commit comments

Comments
 (0)