Sync from upstream (#2820)

whitneywhtsang · web-flow · commit 81b0627f1050 · 2024-11-25T22:37:36.000-05:00
Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -154,7 +154,7 @@ endfunction()
 if(NOT MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-covered-switch-default -fvisibility=hidden")
 else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX-")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX- /wd4244 /wd4624 /wd4715 /wd4530")
 endif()
 
 include_directories(".")
diff --git a/docs/index.rst b/docs/index.rst
@@ -22,26 +22,6 @@ Getting Started
    getting-started/tutorials/index
 
 
-Programming Guide
------------------
-
-Check out the following documents to learn more about Triton and its comparison with other DSLs for Deep Neural Networks (DNNs):
-
-- Chapter 1: :doc:`Introduction <programming-guide/chapter-1/introduction>`
-- Chapter 2: :doc:`Related Work <programming-guide/chapter-2/related-work>`
-- Chapter 3: :doc:`Debugging <programming-guide/chapter-3/debugging>`
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Programming Guide
-   :hidden:
-
-   programming-guide/chapter-1/introduction
-   programming-guide/chapter-2/related-work
-   programming-guide/chapter-3/debugging
-
-.. _Triton: https://github.com/triton-lang/triton
-
 Python API
 ----------
 
@@ -73,3 +53,23 @@ Triton MLIR Dialects and Ops
    :hidden:
 
    dialects/dialects
+
+Going Further
+-------------
+
+Check out the following documents to learn more about Triton and how it compares against other DSLs for DNNs:
+
+- Chapter 1: :doc:`Introduction <programming-guide/chapter-1/introduction>`
+- Chapter 2: :doc:`Related Work <programming-guide/chapter-2/related-work>`
+- Chapter 3: :doc:`Debugging <programming-guide/chapter-3/debugging>`
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Programming Guide
+   :hidden:
+
+   programming-guide/chapter-1/introduction
+   programming-guide/chapter-2/related-work
+   programming-guide/chapter-3/debugging
+
+.. _Triton: https://github.com/triton-lang/triton
diff --git a/lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp
@@ -50,6 +50,7 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
   // know about the op to split the block.
   void llAssert(Operation *op, Value condition, StringRef message,
                 ConversionPatternRewriter &rewriter) const {
+
     auto ctx = rewriter.getContext();
     auto loc = op->getLoc();
 
diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -344,7 +344,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     auto dstTy = op.getType();
     auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
     SmallVector<Value> outVals(numRegs);
-    for (int i = 0; i < outVals.size(); i++) {
+    for (int i = 0; i < numRegs; i++) {
       // Remove free masks from the register index
       // For example, if idx = 0b00111, and masks = 0b00100, then we get
       // 0b00011. It means that register 7 (0b111) has the same value as
diff --git a/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp b/lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp
@@ -157,7 +157,7 @@ class SwizzleShmemConvert : public OpRewritePattern<ConvertLayoutOp> {
     if (auto srcCvt = trans.getSrc().getDefiningOp<ConvertLayoutOp>()) {
       srcTy = srcCvt.getSrc().getType();
     }
-    auto sharedLoadTy = cast<RankedTensorType>(cvtOp.getType());
+    RankedTensorType sharedLoadTy = cvtOp.getType();
     auto cvtEncoding =
         dyn_cast<DotOperandEncodingAttr>(sharedLoadTy.getEncoding());
     if (!cvtEncoding)
diff --git a/python/setup.py b/python/setup.py
@@ -764,7 +764,7 @@ def get_git_commit_hash(length=8):
 
 def get_install_requires():
     install_requires = [
-        "packaging",  # used by third_party/intel/backend/compiler.py
+        "packaging",  # used by third_party/intel/backend/driver.py
     ]  # yapf: disable
     return install_requires
 
diff --git a/python/test/regression/test_cast_matmul.py b/python/test/regression/test_cast_matmul.py
@@ -5,6 +5,7 @@
 
 TODO: float8 types
 """
+
 import warnings
 import pytest
 import torch
diff --git a/python/triton/backends/compiler.py b/python/triton/backends/compiler.py
@@ -6,7 +6,7 @@
 
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
-from typing import Dict, Union
+from typing import Dict, List, Tuple, Union
 from types import ModuleType
 
 # Table that associates strings to AttrsDescriptor (sub)classes.
@@ -171,7 +171,7 @@ def from_dict(data):
         return attrs_descriptor
 
     @classmethod
-    def from_hints(cls, hints: list[tuple[int, int]]):
+    def from_hints(cls, hints: List[Tuple[int, int]]):
         """
         Create the class from a set of hints that are passed in.
 
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp
@@ -232,6 +232,12 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
     mfmaInstrK = elemsPerInstr[kDimIdx];
   }
 
+  if (mfmaInstrNonK > shape[nonKDimIdx] || mfmaInstrK > shape[kDimIdx]) {
+    // This pattern does not support tensor shapes smaller than one
+    // instruction size; the LinearLayout converter handles those cases.
+    return Value();
+  }
+
   auto numReps = mfmaLayout.getRepForOperand(shape, kWidth, opIdx);
   auto numRepNonK = numReps[nonKDimIdx];
   auto numRepK = numReps[kDimIdx];

Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,7 @@ class SwizzleShmemConvert : public OpRewritePattern<ConvertLayoutOp> {`
`157`	`157`	`if (auto srcCvt = trans.getSrc().getDefiningOp<ConvertLayoutOp>()) {`
`158`	`158`	`srcTy = srcCvt.getSrc().getType();`
`159`	`159`	`}`
`160`		`- auto sharedLoadTy = cast<RankedTensorType>(cvtOp.getType());`
	`160`	`+ RankedTensorType sharedLoadTy = cvtOp.getType();`
`161`	`161`	`auto cvtEncoding =`
`162`	`162`	`dyn_cast<DotOperandEncodingAttr>(sharedLoadTy.getEncoding());`
`163`	`163`	`if (!cvtEncoding)`