Commit 45aeede

Merge commit '915c1499789ea2257ab494da833cb78789c9f5af'
2 parents e4fa38e + 915c149 commit 45aeede

File tree

22 files changed: +392 −107 lines changed


.github/ISSUE_TEMPLATE/bug.yml

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+name: Report a bug
+description: Report triton failing to compile a kernel, or giving incorrect results
+labels: ["bug"]
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        #### Disclaimer
+        The core triton team is small and has very limited capacity. We may not have time to look into your report.
+        For the best results, please:
+        - Avoid submitting duplicates. Search through [the existing and past issues](https://github.com/triton-lang/triton/issues?q=is%3Aissue+sort%3Acreated-desc+) first to see if it's been reported previously.
+        - Check if the issue persists with a build from the latest source.
+        - Provide all relevant information in the initial report, to prevent unnecessary back and forth discussion.
+        - If you can, try to diagnose and/or fix the issue yourself. We welcome high quality contributions.
+  - type: textarea
+    attributes:
+      label: Describe the bug
+      description: |
+        Please provide a clear and concise description of what the bug is.
+
+        If relevant, add a [minimal complete example](https://stackoverflow.com/help/minimal-reproducible-example) that reproduces the bug. It is very important for the snippet to be as simple as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did, so include both the kernel and launching code as well as any relevant imports.
+
+        If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
+
+        Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+      placeholder: |
+        A clear and concise description of what the bug is.
+
+        ```python
+        # Sample code to reproduce the problem
+        ```
+
+        ```
+        The error message you got, with the full traceback.
+        ```
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Environment details
+      description: |
+        Please include any relevant context about how you're running the reproducer e.g. which version of triton, and what GPU you are using.
+      placeholder: |
+        Triton: ...
+        GPU: ...
+    validations:
+      required: true

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+blank_issues_enabled: true
+contact_links:
+  - name: Community help
+    url: https://discord.gg/gpumode
+    about: GPU-mode discord community has a triton channel which is a great resource for help writing/learning triton
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+name: Report a performance issue
+description: Report cases where triton is generating sub-optimal (but functionally correct) PTX/LLVM IR
+labels: ["performance"]
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        #### Disclaimer
+        The core triton team is small and has very limited capacity. We may not have time to look into your report.
+        For the best results, please:
+        - Avoid submitting duplicates. Search through [the existing and past issues](https://github.com/triton-lang/triton/issues?q=is%3Aissue+sort%3Acreated-desc+) first to see if it's been reported previously.
+        - Check if the issue persists with a build from the latest source.
+        - Provide all relevant information in the initial report, to prevent unnecessary back and forth discussion.
+        - If you can, try to diagnose and/or fix the issue yourself. We welcome high quality contributions.
+  - type: textarea
+    attributes:
+      label: Describe the issue
+      description: |
+        Please provide a clear and concise description of the issue.
+
+        Include a [minimal complete example](https://stackoverflow.com/help/minimal-reproducible-example) that reproduces the issue. It is very important for the snippet to be as simple as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did.
+
+        A reproducer could be a python program that runs a triton kernel and prints out the relevant suboptimal IR, or an IR file with an accompanying triton-opt command.
+
+        If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
+      placeholder: |
+        A clear and concise description of the issue.
+
+        ```python
+        # Sample code to reproduce the problem
+        ```
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Environment details
+      description: |
+        Please include any relevant context about how you're running the reproducer e.g. which version of triton, and what GPU you are using.
+      placeholder: |
+        Triton: ...
+        GPU: ...
+    validations:
+      required: true

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 3 additions & 0 deletions
@@ -1,11 +1,14 @@
+<!---
 The core Triton is a small number of people, and we receive many PRs (thank
 you!). To help us review your code more quickly, **if you are a new
 contributor (less than 3 PRs merged) we ask that you complete the following
 tasks and include the filled-out checklist in your PR description.**

 Complete the following tasks before sending your PR, and replace `[ ]` with
 `[x]` to indicate you have done them.
+-->

+# New contributor declaration
 - [ ] I am not making a trivial change, such as fixing a typo in a comment.

 - [ ] I have written a PR description following these

.github/workflows/integration-tests.yml

Lines changed: 0 additions & 4 deletions
@@ -141,10 +141,6 @@ jobs:
       - name: Check pre-commit
         run: |
           python3 -m pip install --upgrade pre-commit
-          # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed
-          python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true
-          # If first run of yapf worked and made changes reset the tree to the original state
-          git reset --hard
           python3 -m pre_commit run --all-files --verbose
       - name: Print diff of changes if pre-commit failed
         if: failure()

.github/workflows/integration-tests.yml.in

Lines changed: 0 additions & 4 deletions
@@ -155,10 +155,6 @@ jobs:
       - name: Check pre-commit
         run: |
           python3 -m pip install --upgrade pre-commit
-          # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed
-          python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true
-          # If first run of yapf worked and made changes reset the tree to the original state
-          git reset --hard
           python3 -m pre_commit run --all-files --verbose

       - name: Print diff of changes if pre-commit failed

.github/workflows/llvm-build.yml

Lines changed: 5 additions & 3 deletions
@@ -245,12 +245,14 @@ jobs:

           # Create temporary container to copy cache and installed artifacts.
           CONTAINER_ID=$(docker create llvm-build)
+
+          # We remove the existing directories, otherwise docker cp will
+          # create a subdirectory inside the existing directory.
+          rm -rf "${{ env.SCCACHE_DIR }}" "${{ env.llvm_install_dir }}"
+
           docker cp "${CONTAINER_ID}:/install" "${{ env.llvm_install_dir }}"
           tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"

-          # We remove the existing directory, otherwise docker will
-          # create a subdirectory inside the existing directory.
-          rm -rf "${{ env.SCCACHE_DIR }}"
           docker cp "${CONTAINER_ID}:/sccache" "${{ env.SCCACHE_DIR }}"
           sudo chown -R "$(id -u -n):$(id -g -n)" "${{ env.SCCACHE_DIR }}"
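The nesting behavior this fix works around can be reproduced with plain `cp -R`, which follows the same rule as `docker cp` for directory targets: copying a directory onto an existing directory path nests the source one level deeper instead of replacing it. A minimal sketch (temporary paths are illustrative):

```shell
# Demonstrate the directory-nesting pitfall the workflow comment describes.
work=$(mktemp -d)
mkdir -p "$work/src"
touch "$work/src/file"

# Destination does not exist yet: "dest" becomes a copy of "src".
cp -R "$work/src" "$work/dest"
ls "$work/dest/file" >/dev/null

# Destination now exists: "src" is nested inside it instead of replacing it.
cp -R "$work/src" "$work/dest"
ls "$work/dest/src/file" >/dev/null
```

This is why the workflow deletes both `${{ env.SCCACHE_DIR }}` and `${{ env.llvm_install_dir }}` before either `docker cp`, so the copied directories land at the expected paths rather than one level down.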

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 2 additions & 2 deletions
@@ -100,11 +100,11 @@ def TT_FpToFpOp : TT_Op<"fp_to_fp", [Elementwise,
   }];

   let arguments = (
-    ins TT_FloatTensor:$src,
+    ins TT_FloatLike:$src,
     OptionalAttr<TT_RoundingModeAttr>:$rounding
   );

-  let results = (outs TT_FloatTensor:$result);
+  let results = (outs TT_FloatLike:$result);

   let assemblyFormat = "$src attr-dict (`,` `rounding` `=` $rounding^)? `:` type($src) `->` type($result)";
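Switching the operand and result from `TT_FloatTensor` to `TT_FloatLike` lets `tt.fp_to_fp` accept scalar floats in addition to float tensors. A hypothetical IR sketch, following the `assemblyFormat` above (the `rounding` value shown is illustrative; valid values are defined by `TT_RoundingModeAttr`):

```mlir
// Tensor form, as before (rounding required for a downcast).
%a = tt.fp_to_fp %t, rounding = rtne : tensor<16xf32> -> tensor<16xf16>
// Scalar form, newly accepted via TT_FloatLike.
%b = tt.fp_to_fp %s, rounding = rtne : f32 -> f16
```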

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 14 additions & 6 deletions
@@ -734,26 +734,34 @@ OpFoldResult FpToFpOp::fold(FoldAdaptor adaptor) {
   auto srcVal = getSrc();
   auto dstTy = getType();

-  const llvm::fltSemantics &semantic =
-      llvm::cast<FloatType>(dstTy.getElementType()).getFloatSemantics();
+  auto resElemType = cast<FloatType>(getElementTypeOrSelf(getType()));
+  const llvm::fltSemantics &semantic = resElemType.getFloatSemantics();

   if (matchPattern(srcVal, m_PosZeroFloat())) {
     llvm::APFloat posZero =
         llvm::APFloat::getZero(semantic, /*negative=*/false);
-    return DenseFPElementsAttr::get(dstTy, posZero);
+    if (auto tensorTy = dyn_cast<RankedTensorType>(dstTy))
+      return DenseElementsAttr::get(tensorTy, posZero);
+    return Builder(getContext()).getFloatAttr(resElemType, posZero);
   }

   if (matchPattern(srcVal, m_NegZeroFloat())) {
     llvm::APFloat negZero = llvm::APFloat::getZero(semantic, /*negative=*/true);
-    return DenseFPElementsAttr::get(dstTy, negZero);
+    if (auto tensorTy = dyn_cast<RankedTensorType>(dstTy))
+      return DenseElementsAttr::get(tensorTy, negZero);
+    return Builder(getContext()).getFloatAttr(resElemType, negZero);
   }

   return {};
 }

 LogicalResult FpToFpOp::verify() {
-  auto dstType = getType().getElementType();
-  auto srcType = getSrc().getType().getElementType();
+  auto dstType = getType();
+  auto srcType = getSrc().getType();
+  if (auto dstTensorType = dyn_cast<RankedTensorType>(dstType))
+    dstType = dstTensorType.getElementType();
+  if (auto srcTensorType = dyn_cast<RankedTensorType>(srcType))
+    srcType = srcTensorType.getElementType();
   if ((dstType.getIntOrFloatBitWidth() < srcType.getIntOrFloatBitWidth()) &&
       (!getRounding().has_value())) {
     return emitError("Rounding mode is required for FP downcast");
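The generalized verifier rule can be restated in a short Python sketch (all names here are illustrative, not triton APIs): take the element type "or self" when the value may be either a tensor or a scalar, then require an explicit rounding mode whenever the destination element width is smaller than the source's.

```python
# A sketch of the FpToFpOp::verify logic, assuming a toy type encoding:
# a bare int is a scalar float's bit width, and a (shape, elem_width)
# tuple stands in for a ranked tensor type.

def element_bitwidth(ty):
    """Element bit width of `ty` -- the 'element type or self' step."""
    if isinstance(ty, tuple):      # tensor: (shape, elem_width)
        return ty[1]
    return ty                      # scalar: elem_width itself

def verify_fp_to_fp(src_ty, dst_ty, rounding=None):
    """Raise if a downcast is attempted without a rounding mode."""
    if element_bitwidth(dst_ty) < element_bitwidth(src_ty) and rounding is None:
        raise ValueError("Rounding mode is required for FP downcast")
```

For example, `verify_fp_to_fp(32, 16)` raises, while `verify_fp_to_fp(((16,), 32), ((16,), 16), rounding="rtne")` passes: same rule, now applied uniformly to scalars and tensors.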

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 29 additions & 17 deletions
@@ -1,10 +1,7 @@
 #include "mlir/IR/BuiltinTypes.h"
-#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "triton/Dialect/TritonNvidiaGPU/IR/Types.h"
-#include "llvm/Support/raw_ostream.h"

 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
@@ -39,19 +36,6 @@ LogicalResult UpcastMXFPOp::verify() {
     return emitOpError("NYI: fpType must be E2M1, E4M3, or E5M2");
   }

-  // Change to support fp8 types
-  const auto elems_packed = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
-
-  if (xShape.back() != (32 / elems_packed) * scaleShape.back()) {
-    return emitOpError("last dimension of first operand must be 16 times "
-                       "larger than that of the second operand");
-  }
-
-  if (!std::equal(xShape.begin(), xShape.end() - 1, scaleShape.begin())) {
-    return emitOpError(
-        "all dimensions except the last must match between operands");
-  }
-
   auto layoutX = xTy.getEncoding();
   auto layoutScale = scaleTy.getEncoding();
   if (bool(layoutX) != bool(layoutScale)) {
@@ -82,6 +66,28 @@ LogicalResult UpcastMXFPOp::verify() {
     }
   }

+  // Change to support fp8 types
+  const auto elemsPacked = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
+  // Figure out the K dimension for the input A/B. For A/B scale, the K
+  // dimension is always the last dimension.
+  const int opIdx = dotEncoding.getOpIdx();
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+
+  if (xShape[kIdx] != (32 / elemsPacked) * scaleShape.back()) {
+    return emitOpError("K dimension of first operand must be 16 times "
+                       "larger than last/K dimension of the second operand");
+  }
+
+  // Check other dimensions match too. For input A/B, we need to figure out the
+  // index for the M/N dimension. For scale, it's always {(batch), M/N, K}.
+  const int mnIdx = (opIdx == 0 ? 0 : 1) + hasBatch;
+  if (hasBatch && xShape[0] != scaleShape[0])
+    return emitOpError("batch dimension must match between operands");
+  if (xShape[mnIdx] != scaleShape[hasBatch]) {
+    return emitOpError("M/N dimension must match between operands");
+  }
+
   return success();
 }

@@ -100,14 +106,20 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
   RankedTensorType retTy;

   auto newShape = SmallVector<int64_t>(xShape);
-  newShape.back() *= 2;
   if (!encoding) {
+    newShape.back() *= 2;
     retTy = RankedTensorType::get(xShape, FloatType::getBF16(ctx));
   } else {
     auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
     auto newVEncoding = DotOperandEncodingAttr::get(
         ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
         oldEncoding.getKWidth() * 2);
+    // Figure out the K dimension for the input A/B, given that the return
+    // type is upcasted A/B type so we need to update the proper dim size.
+    const int opIdx = oldEncoding.getOpIdx();
+    const bool hasBatch = xShape.size() == 3;
+    const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+    newShape[kIdx] *= 2;
     retTy = RankedTensorType::get(newShape, FloatType::getBF16(ctx),
                                   newVEncoding);
   }
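The shape rules these two hunks introduce can be summarized in a Python sketch (names are illustrative, not triton APIs): the A/B operand is laid out as {(batch), M/N, K} for opIdx 0 and {(batch), K, M/N} for opIdx 1, the scale is always {(batch), M/N, K-groups} with one scale per 32 logical values, and E2M1 packs two fp4 values per stored element. For clarity this sketch doubles the result's K extent only for E2M1, where unpacking doubles the logical element count.

```python
def upcast_mxfp_shapes(x_shape, scale_shape, op_idx, fp_type):
    """Check mxfp operand/scale shapes and return the upcast result shape."""
    elems_packed = 2 if fp_type == "E2M1" else 1    # E2M1: two fp4 per element
    has_batch = len(x_shape) == 3
    k_idx = (1 if op_idx == 0 else 0) + has_batch   # K position in A/B
    mn_idx = (0 if op_idx == 0 else 1) + has_batch  # M/N position in A/B

    # One scale element covers a group of 32 logical values along K.
    assert x_shape[k_idx] == (32 // elems_packed) * scale_shape[-1], "K mismatch"
    if has_batch:
        assert x_shape[0] == scale_shape[0], "batch mismatch"
    assert x_shape[mn_idx] == scale_shape[int(has_batch)], "M/N mismatch"

    # Unpacking E2M1 doubles the logical K extent of the result;
    # fp8 inputs keep their shape in this sketch.
    new_shape = list(x_shape)
    if fp_type == "E2M1":
        new_shape[k_idx] *= 2
    return new_shape
```

For instance, an opIdx-0 E2M1 operand of shape (128, 64) with scales of shape (128, 4) verifies (64 packed elements hold 128 fp4 values, i.e. 4 groups of 32) and upcasts to shape (128, 128).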

0 commit comments
