[TensorDesc] Add fallback for reduction ops (#6829)

peterbell10 · web-flow · commit 915cc70dd51e · 2025-05-15T17:20:30.000+01:00
The fallback supports most of the dtype/kind combinations that the native
path does; the exception is min/max on float types, which are only
implemented in the frontend at the moment.
diff --git a/lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp b/lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/Support/LogicalResult.h"
+#include "llvm/Support/raw_ostream.h"
 #include <mlir/Dialect/Arith/IR/Arith.h>
 #include <mlir/Dialect/Func/Transforms/FuncConversions.h>
 #include <mlir/IR/Builders.h>
@@ -253,8 +254,6 @@ struct RewriteLoadPattern : OpConversionPattern<triton::DescriptorLoadOp> {
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = op.getLoc();
     const auto blockShape = op.getDesc().getType().getBlockType().getShape();
-    const auto rank = blockShape.size();
-
     auto descTy = op.getDesc().getType();
     auto desc = unpackDescriptor(descTy, adaptor.getDesc());
     auto offsets = castToI64(rewriter, op.getIndices());
@@ -279,7 +278,6 @@ struct RewriteStorePattern : OpConversionPattern<triton::DescriptorStoreOp> {
     auto loc = op.getLoc();
     auto descTy = op.getDesc().getType();
     const auto blockShape = descTy.getBlockType().getShape();
-    const auto rank = blockShape.size();
     auto desc = unpackDescriptor(descTy, adaptor.getDesc());
     auto offsets = castToI64(rewriter, op.getIndices());
 
@@ -360,6 +358,68 @@ struct RewriteScatterPattern
   }
 };
 
+/// Maps a descriptor reduce kind to the atomic RMW opcode the pointer fallback
+/// emits for `ty`'s element type; std::nullopt when no mapping exists here.
+std::optional<RMWOp> translateReduceKind(DescriptorReduceKind kind,
+                                         TensorDescType ty) {
+  const auto elemTy = ty.getBlockType().getElementType();
+  // min/max only map for integers with explicit signedness; the unsigned test
+  // runs first, and any other element type is reported as unsupported.
+  const auto pickBySign = [&](RMWOp u, RMWOp s) -> std::optional<RMWOp> {
+    if (elemTy.isUnsignedInteger())
+      return u;
+    if (elemTy.isSignedInteger())
+      return s;
+    return std::nullopt;
+  };
+  switch (kind) {
+  case DescriptorReduceKind::ADD:
+    return elemTy.isInteger() ? RMWOp::ADD : RMWOp::FADD;
+  case DescriptorReduceKind::MIN:
+    return pickBySign(RMWOp::UMIN, RMWOp::MIN);
+  case DescriptorReduceKind::MAX:
+    return pickBySign(RMWOp::UMAX, RMWOp::MAX);
+  case DescriptorReduceKind::AND:
+    return RMWOp::AND;
+  case DescriptorReduceKind::OR:
+    return RMWOp::OR;
+  case DescriptorReduceKind::XOR:
+    return RMWOp::XOR;
+  default:
+    return std::nullopt;
+  }
+}
+
+/// Lowers triton::DescriptorReduceOp to a masked triton::AtomicRMWOp on
+/// pointers built from the unpacked descriptor; errors on unsupported types.
+struct RewriteReducePattern : OpConversionPattern<triton::DescriptorReduceOp> {
+  using OpConversionPattern<triton::DescriptorReduceOp>::OpConversionPattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(triton::DescriptorReduceOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+    auto descTy = op.getDesc().getType();
+    const auto blockShape = descTy.getBlockType().getShape();
+    auto desc = unpackDescriptor(descTy, adaptor.getDesc());
+    auto offsets = castToI64(rewriter, op.getIndices());
+    auto rmwOp = translateReduceKind(op.getKind(), descTy);
+    if (!rmwOp)
+      return op->emitError("Cannot fallback on descriptor atomic op, "
+                           "unsupported for type ")
+             << descTy.getBlockType().getElementType();
+
+    rewriter.create<triton::AtomicRMWOp>(
+        loc, descTy.getSignlessBlockType(), *rmwOp,
+        generatePtr(rewriter, loc, blockShape, desc, offsets), op.getSrc(),
+        generateMask(rewriter, loc, blockShape, desc, offsets),
+        MemSemantic::RELEASE, MemSyncScope::GPU);
+    // Erase through the rewriter so the conversion driver tracks the removal.
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 /**
  * @brief This implements the pass for converting triton tensor descriptor
  * loads/stores into indexed loads/stores.
@@ -428,9 +488,10 @@ class TritonRewriteTensorDescriptorToPointerPass
     mlir::scf::populateSCFStructuralTypeConversions(converter, patterns);
     triton::populateArithTypeConversions(converter, patterns);
 
-    patterns.add<RewriteMakeTensorDesc, RewriteLoadPattern, RewriteStorePattern,
-                 RewriteGatherPattern, RewriteScatterPattern>(converter,
-                                                              &getContext());
+    patterns
+        .add<RewriteMakeTensorDesc, RewriteLoadPattern, RewriteStorePattern,
+             RewriteGatherPattern, RewriteScatterPattern, RewriteReducePattern>(
+            converter, &getContext());
 
     ConversionConfig config;
     config.buildMaterializations = false;
diff --git a/python/test/unit/cuda/test_tensor_descriptor.py b/python/test/unit/cuda/test_tensor_descriptor.py
@@ -1,124 +1,10 @@
 import pytest
 import torch
-import numpy as np
 
 import triton
-from triton.compiler.errors import CompilationError
 import triton.language as tl
-from triton._internal_testing import is_interpreter, numpy_random, to_triton, requires_tma, unwrap_tensor, tma_dtypes, to_numpy
+from triton._internal_testing import is_interpreter, numpy_random, to_triton, requires_tma, unwrap_tensor, tma_dtypes
 from triton.tools.tensor_descriptor import TensorDescriptor
-from typing import Optional
-
-SUPPORTED_REDUCE_DTYPES = {
-    "add": {tl.uint32, tl.int32, tl.uint64, tl.float32, tl.float16, tl.bfloat16},
-    "min": {tl.uint32, tl.int32, tl.uint64, tl.int64, tl.float16, tl.bfloat16},
-    "max": {tl.uint32, tl.int32, tl.uint64, tl.int64, tl.float16, tl.bfloat16},
-    "and": {tl.uint32, tl.int32, tl.uint64, tl.int64},
-    "or": {tl.uint32, tl.int32, tl.uint64, tl.int64},
-    "xor": {tl.uint32, tl.int32, tl.uint64, tl.int64},
-}
-
-
-def min_op(a, b):
-    out = np.minimum(to_numpy(a), to_numpy(b))
-    return unwrap_tensor(to_triton(out, device=a.device))
-
-
-def max_op(a, b):
-    out = np.maximum(to_numpy(a), to_numpy(b))
-    return unwrap_tensor(to_triton(out, device=a.device))
-
-
-REDUCE_OP = {
-    "add": lambda a, b: unwrap_tensor(a) + unwrap_tensor(b),
-    "min": min_op,
-    "max": max_op,
-    "and": lambda a, b: torch.bitwise_and(unwrap_tensor(a), unwrap_tensor(b)),
-    "or": lambda a, b: torch.bitwise_or(unwrap_tensor(a), unwrap_tensor(b)),
-    "xor": lambda a, b: torch.bitwise_xor(unwrap_tensor(a), unwrap_tensor(b)),
-}
-
-
-@requires_tma
-# TODO: interpreter support
-# @pytest.mark.interpreter
-@pytest.mark.parametrize("kind", ["add", "min", "max", "and", "or", "xor"])
-@pytest.mark.parametrize("dtype_str", tma_dtypes)
-@pytest.mark.parametrize("num_ctas", [1, 2])
-@pytest.mark.parametrize("descriptor", ["host", "device"])
-@pytest.mark.parametrize("M_BLOCK,N_BLOCK", [(2, 16), (8, 16), (8, 32), (8, 128), (512, 32), (1, 1024)])
-def test_tensor_descriptor_reduce(kind, descriptor, dtype_str, num_ctas, M_BLOCK, N_BLOCK):
-
-    @triton.jit(debug=True)
-    def kernel(out_desc, out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr, kind: tl.constexpr):
-        moffset = tl.program_id(0) * M_BLOCK
-        noffset = tl.program_id(1) * N_BLOCK
-
-        midx = moffset + tl.arange(0, M_BLOCK)[:, None]
-        nidx = noffset + tl.arange(0, N_BLOCK)[None, :]
-        idx = midx * N + nidx
-
-        val = tl.load(a_ptr + idx)
-
-        if out_desc is None:
-            desc = tl.make_tensor_descriptor(
-                out_ptr,
-                shape=[M, N],
-                strides=[N, 1],
-                block_shape=[M_BLOCK, N_BLOCK],
-            )
-        else:
-            desc = out_desc
-
-        assert desc.shape[0] == M
-        assert desc.shape[1] == N
-        assert desc.strides[0] == N
-        assert desc.strides[1] == 1
-        assert desc.block_shape == [M_BLOCK, N_BLOCK]
-        if kind == "add":
-            desc.atomic_add([moffset, noffset], val)
-        elif kind == "min":
-            desc.atomic_min([moffset, noffset], val)
-        elif kind == "max":
-            desc.atomic_max([moffset, noffset], val)
-        elif kind == "and":
-            desc.atomic_and([moffset, noffset], val)
-        elif kind == "or":
-            desc.atomic_or([moffset, noffset], val)
-        else:
-            tl.static_assert(kind == "xor")
-            desc.atomic_xor([moffset, noffset], val)
-
-    M, N = M_BLOCK * 2, N_BLOCK * 2
-    rs = np.random.RandomState(seed=17)
-    inp = to_triton(numpy_random((M, N), dtype_str, rs), device="cuda", dst_type=dtype_str)
-    out = to_triton(numpy_random((M, N), dtype_str, rs), device="cuda", dst_type=dtype_str)
-
-    grid_m = M // M_BLOCK
-    grid_n = N // N_BLOCK
-
-    if descriptor == "host":
-        out_desc = TensorDescriptor.from_tensor(out, [M_BLOCK, N_BLOCK])
-    else:
-
-        def alloc_fn(size: int, align: int, stream: Optional[int]):
-            assert size == 128 * (grid_m * grid_n) * num_ctas
-            assert align == 128
-            assert stream == 0
-            return torch.empty(size, dtype=torch.int8, device="cuda")
-
-        triton.set_allocator(alloc_fn)
-        out_desc = None
-
-    supported = getattr(tl, dtype_str) in SUPPORTED_REDUCE_DTYPES[kind]
-    if not supported:
-        with pytest.raises(CompilationError):
-            kernel[(grid_m, grid_n)](out_desc, out, inp, M, N, M_BLOCK, N_BLOCK, kind, num_ctas=num_ctas)
-        return
-
-    expect = REDUCE_OP[kind](inp, out)
-    kernel[(grid_m, grid_n)](out_desc, out, inp, M, N, M_BLOCK, N_BLOCK, kind, num_ctas=num_ctas)
-    torch.testing.assert_close(expect, unwrap_tensor(out), check_dtype=False)
 
 
 @requires_tma
diff --git a/python/test/unit/language/test_tensor_descriptor.py b/python/test/unit/language/test_tensor_descriptor.py
@@ -4,10 +4,12 @@
 
 import triton
 import triton.language as tl
-from triton._internal_testing import is_interpreter, numpy_random, to_triton, unwrap_tensor, tma_dtypes
+from triton._internal_testing import is_interpreter, numpy_random, to_triton, unwrap_tensor, tma_dtypes, to_numpy
 from triton.tools.mxfp import MXFP4Tensor, MXScaleTensor
 from typing import Optional
-from triton._internal_testing import is_cuda, is_hip
+from triton._internal_testing import is_cuda, is_hip, is_hip_cdna3
+from triton.tools.tensor_descriptor import TensorDescriptor
+from triton import CompilationError
 
 
 @pytest.mark.interpreter
@@ -1434,3 +1436,140 @@ def alloc_fn(size: int, align: int, steam):
 
     ref = torch_scatter_rows(input, idx, y, BLOCK_Y, X, Y)
     torch.testing.assert_close(ref, output, atol=0, rtol=0)
+
+
+NATIVE_SUPPORTED_REDUCE_DTYPES = {
+    "add": {tl.uint32, tl.int32, tl.uint64, tl.float32, tl.float16, tl.bfloat16},
+    "min": {tl.uint32, tl.int32, tl.uint64, tl.int64, tl.float16, tl.bfloat16},
+    "max": {tl.uint32, tl.int32, tl.uint64, tl.int64, tl.float16, tl.bfloat16},
+    "and": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+    "or": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+    "xor": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+}
+FALLBACK_SUPPORTED_REDUCE_DTYPES = {
+    "add": {tl.uint32, tl.int32, tl.uint64, tl.float32, tl.float16, tl.bfloat16},
+    "min": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+    "max": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+    "and": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+    "or": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+    "xor": {tl.uint32, tl.int32, tl.uint64, tl.int64},
+}
+
+
+def min_op(a, b):
+    out = np.minimum(to_numpy(a), to_numpy(b))
+    return unwrap_tensor(to_triton(out, device=a.device))
+
+
+def max_op(a, b):
+    out = np.maximum(to_numpy(a), to_numpy(b))
+    return unwrap_tensor(to_triton(out, device=a.device))
+
+
+REDUCE_OP = {
+    "add": lambda a, b: unwrap_tensor(a) + unwrap_tensor(b),
+    "min": min_op,
+    "max": max_op,
+    "and": lambda a, b: torch.bitwise_and(unwrap_tensor(a), unwrap_tensor(b)),
+    "or": lambda a, b: torch.bitwise_or(unwrap_tensor(a), unwrap_tensor(b)),
+    "xor": lambda a, b: torch.bitwise_xor(unwrap_tensor(a), unwrap_tensor(b)),
+}
+
+REDUCE_SKIP_HIP_CDNA3 = [
+    ("min", "int32", 1, 1024),
+    ("max", "int32", 1, 1024),
+    ("add", "bfloat16", 1, 1024),
+]
+
+
+# Checks descriptor atomic reductions against REDUCE_OP. TODO: interpreter support
+# @pytest.mark.interpreter
+@pytest.mark.parametrize("kind", ["add", "min", "max", "and", "or", "xor"])
+@pytest.mark.parametrize("dtype_str", tma_dtypes)
+@pytest.mark.parametrize("num_ctas", [1, 2])
+@pytest.mark.parametrize("descriptor", ["host", "device"])
+@pytest.mark.parametrize("M_BLOCK,N_BLOCK", [(2, 16), (8, 16), (8, 32), (8, 128), (512, 32), (1, 1024)])
+def test_tensor_descriptor_reduce(kind, descriptor, dtype_str, num_ctas, M_BLOCK, N_BLOCK):
+    is_native = is_cuda() and torch.cuda.get_device_capability()[0] >= 9  # CUDA with capability major >= 9
+    if not is_native:  # the skips below apply only to the non-native path
+        if num_ctas != 1:
+            pytest.skip("Multi-CTA not supported")
+        if descriptor == "host":
+            pytest.skip("NYI: Host side tensor descriptor fallback")
+        if is_hip_cdna3() and (kind, dtype_str, M_BLOCK, N_BLOCK) in REDUCE_SKIP_HIP_CDNA3:
+            pytest.skip("Broken on rocm")
+
+    @triton.jit(debug=True)
+    def kernel(out_desc, out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr, kind: tl.constexpr):
+        moffset = tl.program_id(0) * M_BLOCK
+        noffset = tl.program_id(1) * N_BLOCK
+
+        midx = moffset + tl.arange(0, M_BLOCK)[:, None]
+        nidx = noffset + tl.arange(0, N_BLOCK)[None, :]
+        idx = midx * N + nidx
+
+        val = tl.load(a_ptr + idx)
+
+        if out_desc is None:
+            desc = tl.make_tensor_descriptor(
+                out_ptr,
+                shape=[M, N],
+                strides=[N, 1],
+                block_shape=[M_BLOCK, N_BLOCK],
+            )
+        else:
+            desc = out_desc
+
+        assert desc.shape[0] == M
+        assert desc.shape[1] == N
+        assert desc.strides[0] == N
+        assert desc.strides[1] == 1
+        assert desc.block_shape == [M_BLOCK, N_BLOCK]
+        if kind == "add":
+            desc.atomic_add([moffset, noffset], val)
+        elif kind == "min":
+            desc.atomic_min([moffset, noffset], val)
+        elif kind == "max":
+            desc.atomic_max([moffset, noffset], val)
+        elif kind == "and":
+            desc.atomic_and([moffset, noffset], val)
+        elif kind == "or":
+            desc.atomic_or([moffset, noffset], val)
+        else:
+            tl.static_assert(kind == "xor")
+            desc.atomic_xor([moffset, noffset], val)
+
+    M, N = M_BLOCK * 2, N_BLOCK * 2  # two blocks per dimension, so the launch grid below is 2x2
+    rs = np.random.RandomState(seed=17)
+    inp = to_triton(numpy_random((M, N), dtype_str, rs), device="cuda", dst_type=dtype_str)
+    out = to_triton(numpy_random((M, N), dtype_str, rs), device="cuda", dst_type=dtype_str)
+
+    grid_m = M // M_BLOCK
+    grid_n = N // N_BLOCK
+
+    if descriptor == "host":
+        out_desc = TensorDescriptor.from_tensor(out, [M_BLOCK, N_BLOCK])
+    else:
+
+        def alloc_fn(size: int, align: int, stream: Optional[int]):
+            assert size == 128 * (grid_m * grid_n) * num_ctas  # 128 int8 elements per program per CTA
+            assert align == 128
+            assert stream == 0
+            return torch.empty(size, dtype=torch.int8, device="cuda")
+
+        triton.set_allocator(alloc_fn)
+        out_desc = None  # kernel then builds the descriptor itself via tl.make_tensor_descriptor
+
+    dtype = getattr(tl, dtype_str)
+    native_supported = dtype in NATIVE_SUPPORTED_REDUCE_DTYPES[kind]
+    fallback_supported = dtype in FALLBACK_SUPPORTED_REDUCE_DTYPES[kind]
+    supported = native_supported if is_native else fallback_supported
+    if not supported:  # expected exception type depends on whether the native table lists the dtype
+        exc_type = CompilationError if not native_supported else RuntimeError
+        with pytest.raises(exc_type):
+            kernel[(grid_m, grid_n)](out_desc, out, inp, M, N, M_BLOCK, N_BLOCK, kind, num_ctas=num_ctas)
+        return
+
+    expect = REDUCE_OP[kind](inp, out)  # reference is computed before the kernel launch below
+    kernel[(grid_m, grid_n)](out_desc, out, inp, M, N, M_BLOCK, N_BLOCK, kind, num_ctas=num_ctas)
+    torch.testing.assert_close(expect, unwrap_tensor(out), check_dtype=False)