Commit 9aa2c86

[TensorDesc] Add fallback for gather and scatter (#6822)
1 parent 968ec0a commit 9aa2c86

3 files changed: +270 -170 lines changed

lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp

Lines changed: 124 additions & 25 deletions
@@ -104,9 +104,9 @@ Value getExpandedOffsetWithRange(OpBuilder &builder, const Location &loc,
   return expandOffsets(builder, loc, blockShape, offsets, dim);
 }
 
-Value generatePtr(OpBuilder &builder, const Location &loc,
-                  ArrayRef<std::int64_t> blockShape, Descriptor &desc,
-                  ValueRange offsets) {
+Value generatePtrFromOffsetRanges(OpBuilder &builder, Location loc,
+                                  ArrayRef<int64_t> blockShape,
+                                  Descriptor &desc, ValueRange offsets) {
   assert(blockShape.size() == desc.shape.size());
   assert(blockShape.size() == offsets.size());
   auto indexTensorType =
@@ -117,15 +117,12 @@ Value generatePtr(OpBuilder &builder, const Location &loc,
   // Generate offsets per dimension
   Value ptr = builder.create<triton::SplatOp>(loc, ptrTensorType, desc.base);
   for (unsigned i = 0; i < blockShape.size(); ++i) {
-    auto offsetWithRange =
-        getExpandedOffsetWithRange(builder, loc, blockShape, offsets[i], i);
-
     // We must splat strides into the expanded shape not a row for retaining
     // the divisibility information given by strides
     Value splatStride = builder.create<triton::SplatOp>(
-        loc, offsetWithRange.getType(), desc.strides[i]);
+        loc, offsets[i].getType(), desc.strides[i]);
     Value offsetWithStride =
-        builder.create<arith::MulIOp>(loc, offsetWithRange, splatStride);
+        builder.create<arith::MulIOp>(loc, offsets[i], splatStride);
     Value broadcasted = builder.create<triton::BroadcastOp>(
         loc, indexTensorType, offsetWithStride);
 
@@ -137,32 +134,47 @@ Value generatePtr(OpBuilder &builder, const Location &loc,
   return ptr;
 }
 
-Value generateMask(OpBuilder &builder, const Location &loc,
-                   ArrayRef<std::int64_t> blockShape, Descriptor &desc,
-                   ValueRange offsets) {
+Value generatePtr(OpBuilder &builder, const Location &loc,
+                  ArrayRef<std::int64_t> blockShape, Descriptor &desc,
+                  ValueRange offsets) {
   assert(blockShape.size() == desc.shape.size());
   assert(blockShape.size() == offsets.size());
+  SmallVector<Value> offsetRanges;
+  for (unsigned i = 0; i < blockShape.size(); ++i) {
+    auto offsetWithRange =
+        getExpandedOffsetWithRange(builder, loc, blockShape, offsets[i], i);
+    offsetRanges.push_back(offsetWithRange);
+  }
+
+  return generatePtrFromOffsetRanges(builder, loc, blockShape, desc,
+                                     offsetRanges);
+}
+
+Value generateMaskFromOffsetRanges(OpBuilder &builder, const Location &loc,
+                                   ArrayRef<std::int64_t> blockShape,
+                                   Descriptor &desc, ValueRange offsetRanges) {
+  assert(blockShape.size() == desc.shape.size());
+  assert(blockShape.size() == offsetRanges.size());
 
   // Generate mask per dimension
   auto maskTensorType = RankedTensorType::get(blockShape, builder.getI1Type());
   Value mask;
   for (std::size_t i = 0; i < blockShape.size(); ++i) {
-    auto offsetWithRange =
-        getExpandedOffsetWithRange(builder, loc, blockShape, offsets[i], i);
+    auto offsetWithRange = offsetRanges[i];
 
     // Compare with lower bound
     Value lowerBound = builder.create<mlir::arith::ConstantIntOp>(
         loc, 0, builder.getI64Type());
     Value splatLowerBound = builder.create<triton::SplatOp>(
-        loc, offsetWithRange.getType(), lowerBound);
+        loc, offsetRanges[i].getType(), lowerBound);
     Value cmpLower = builder.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::sge, offsetWithRange, splatLowerBound);
+        loc, arith::CmpIPredicate::sge, offsetRanges[i], splatLowerBound);
 
     // Compare with upper bound
     Value splatUpperBound = builder.create<triton::SplatOp>(
-        loc, offsetWithRange.getType(), desc.shape[i]);
+        loc, offsetRanges[i].getType(), desc.shape[i]);
     Value cmpUpper = builder.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::slt, offsetWithRange, splatUpperBound);
+        loc, arith::CmpIPredicate::slt, offsetRanges[i], splatUpperBound);
 
     // And and broadcast
     Value andResult = builder.create<arith::AndIOp>(loc, cmpLower, cmpUpper);
@@ -180,15 +192,35 @@ Value generateMask(OpBuilder &builder, const Location &loc,
   return mask;
 }
 
-Value generateOther(OpBuilder &builder, const Location &loc,
-                    TensorDescType descTy) {
-  auto scalarTy = descTy.getSignlessBlockType().getElementType();
-  auto blockTy =
-      RankedTensorType::get(descTy.getBlockType().getShape(), scalarTy);
+Value generateMask(OpBuilder &builder, const Location &loc,
+                   ArrayRef<std::int64_t> blockShape, Descriptor &desc,
+                   ValueRange offsets) {
+  assert(blockShape.size() == desc.shape.size());
+  assert(blockShape.size() == offsets.size());
+  SmallVector<Value> offsetRanges;
+  for (unsigned i = 0; i < blockShape.size(); ++i) {
+    auto offsetWithRange =
+        getExpandedOffsetWithRange(builder, loc, blockShape, offsets[i], i);
+    offsetRanges.push_back(offsetWithRange);
+  }
+
+  return generateMaskFromOffsetRanges(builder, loc, blockShape, desc,
+                                      offsetRanges);
+}
+
+Value generateOther(OpBuilder &builder, Location loc, Type scalarTy,
+                    ArrayRef<int64_t> blockShape) {
+  auto blockTy = RankedTensorType::get(blockShape, scalarTy);
   auto attr = builder.getZeroAttr(blockTy);
   return builder.create<arith::ConstantOp>(loc, attr);
 }
 
+Value generateOther(OpBuilder &builder, Location loc, TensorDescType descTy) {
+  auto blockTy = descTy.getSignlessBlockType();
+  return generateOther(builder, loc, blockTy.getElementType(),
+                       blockTy.getShape());
+}
+
 SmallVector<mlir::Value> castToI64(OpBuilder &builder,
                                    mlir::ValueRange values) {
   auto i64Type = builder.getI64Type();
@@ -261,6 +293,73 @@ struct RewriteStorePattern : OpConversionPattern<triton::DescriptorStoreOp> {
   }
 };
 
+std::pair<Value, Value>
+generateGatherScatterPtrMask(OpBuilder &builder, Location loc,
+                             ArrayRef<int64_t> blockShape, Descriptor &desc,
+                             Value xOffsets, Value yOffset) {
+  Value xOffsetRange =
+      expandOffsets(builder, loc, blockShape, xOffsets, /*dim=*/0);
+  yOffset = castToI64(builder, {yOffset})[0];
+  auto xOffsetI64Ty = RankedTensorType::get(
+      cast<RankedTensorType>(xOffsetRange.getType()).getShape(),
+      yOffset.getType());
+  xOffsetRange =
+      builder.create<arith::ExtSIOp>(loc, xOffsetI64Ty, xOffsetRange);
+  auto yOffsetRange =
+      getExpandedOffsetWithRange(builder, loc, blockShape, yOffset, /*dim=*/1);
+  auto ptr = generatePtrFromOffsetRanges(builder, loc, blockShape, desc,
+                                         {xOffsetRange, yOffsetRange});
+  auto mask = generateMaskFromOffsetRanges(builder, loc, blockShape, desc,
+                                           {xOffsetRange, yOffsetRange});
+  return {ptr, mask};
+}
+
+struct RewriteGatherPattern : OpConversionPattern<triton::DescriptorGatherOp> {
+  using OpConversionPattern<triton::DescriptorGatherOp>::OpConversionPattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(triton::DescriptorGatherOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+    auto descTy = op.getDesc().getType();
+    const auto blockShape = op.getResult().getType().getShape();
+    auto desc = unpackDescriptor(descTy, adaptor.getDesc());
+    auto [ptr, mask] = generateGatherScatterPtrMask(
+        rewriter, loc, blockShape, desc, op.getXOffsets(), op.getYOffset());
+    auto other = generateOther(rewriter, loc,
+                               descTy.getSignlessBlockType().getElementType(),
+                               blockShape);
+    auto newLoad = rewriter.replaceOpWithNewOp<triton::LoadOp>(
+        op, ptr, mask, other, triton::CacheModifier::NONE,
+        triton::EvictionPolicy::NORMAL, false);
+    newLoad->setAttrs(filterSegmentSizes(op->getAttrs()));
+
+    return llvm::success();
+  }
+};
+
+struct RewriteScatterPattern
+    : OpConversionPattern<triton::DescriptorScatterOp> {
+  using OpConversionPattern<triton::DescriptorScatterOp>::OpConversionPattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(triton::DescriptorScatterOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = op.getLoc();
+    auto descTy = op.getDesc().getType();
+    const auto blockShape = op.getSrc().getType().getShape();
+    auto desc = unpackDescriptor(descTy, adaptor.getDesc());
+    auto [ptr, mask] = generateGatherScatterPtrMask(
+        rewriter, loc, blockShape, desc, op.getXOffsets(), op.getYOffset());
+    auto newStore = rewriter.replaceOpWithNewOp<triton::StoreOp>(
+        op, ptr, op.getSrc(), mask, triton::CacheModifier::NONE,
+        triton::EvictionPolicy::NORMAL);
+    newStore->setAttrs(filterSegmentSizes(op->getAttrs()));
+
+    return llvm::success();
+  }
+};
+
 /**
  * @brief This implements the pass for converting triton tensor descriptor
  * loads/stores into indexed loads/stores.
@@ -329,9 +428,9 @@ class TritonRewriteTensorDescriptorToPointerPass
     mlir::scf::populateSCFStructuralTypeConversions(converter, patterns);
     triton::populateArithTypeConversions(converter, patterns);
 
-    patterns
-        .add<RewriteMakeTensorDesc, RewriteLoadPattern, RewriteStorePattern>(
-            converter, &getContext());
+    patterns.add<RewriteMakeTensorDesc, RewriteLoadPattern, RewriteStorePattern,
+                 RewriteGatherPattern, RewriteScatterPattern>(converter,
+                                                              &getContext());
 
     ConversionConfig config;
     config.buildMaterializations = false;
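For intuition, the pointer and mask that generateGatherScatterPtrMask produces for a 2-D descriptor boil down to the arithmetic below. This is a simplified NumPy sketch of the lowering, not code from the pass; base, shape, strides, and block_shape stand in for the unpacked descriptor fields, and the function name is illustrative.

import numpy as np

def gather_scatter_ptr_mask(base, shape, strides, block_shape, x_offsets, y_offset):
    # x_offsets: one row index per gathered/scattered row (length block_shape[0]).
    # y_offset: scalar column start; columns y_offset .. y_offset + block_shape[1] - 1 are touched.
    bx, by = block_shape
    x = np.asarray(x_offsets, dtype=np.int64).reshape(bx, 1)                 # expandOffsets along dim 0
    y = (np.int64(y_offset) + np.arange(by, dtype=np.int64)).reshape(1, by)  # offset + iota along dim 1
    # generatePtrFromOffsetRanges: base + x*stride0 + y*stride1, broadcast to the block shape.
    ptr = base + x * strides[0] + y * strides[1]
    # generateMaskFromOffsetRanges: every offset must lie inside the descriptor's global shape.
    mask = (x >= 0) & (x < shape[0]) & (y >= 0) & (y < shape[1])
    return ptr, mask

RewriteGatherPattern feeds this pointer/mask pair into a masked triton::LoadOp (with a zero "other" value from generateOther), while RewriteScatterPattern feeds it into a masked triton::StoreOp.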

python/test/unit/cuda/test_tensor_descriptor.py

Lines changed: 0 additions & 145 deletions
@@ -121,151 +121,6 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
     torch.testing.assert_close(expect, unwrap_tensor(out), check_dtype=False)
 
 
-@triton.jit
-def tma_gather_rows_kernel(out_ptr, in_ptr, idx_ptr, y, X: tl.constexpr, Y: tl.constexpr, BLOCK_X: tl.constexpr,
-                           BLOCK_Y: tl.constexpr):
-    idx = tl.load(idx_ptr + tl.arange(0, BLOCK_X))
-    desc = tl.make_tensor_descriptor(in_ptr, [X, Y], [Y, 1], [1, BLOCK_Y])
-    out = desc.gather(idx, y)
-    tl.store(out_ptr + tl.arange(0, BLOCK_X)[:, None] * BLOCK_Y + tl.arange(0, BLOCK_Y)[None, :], out)
-
-
-def torch_gather_rows(input, idx, y, block_y):
-    out = torch.empty(0, device=input.device, dtype=input.dtype)
-    for i in idx:
-        x = input[i][y:y + block_y]
-        out = torch.cat((out, x.reshape(1, x.shape[0])), dim=0)
-    return out
-
-
-@pytest.mark.interpreter
-@pytest.mark.parametrize("X, Y", [(128, 128), (64, 256)])
-@pytest.mark.parametrize("BLOCK_X, BLOCK_Y", [(32, 32), (64, 128), (16, 128), (512, 16)])
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int8])
-@pytest.mark.parametrize("y", [0, 32, 48])
-@pytest.mark.skipif(not is_interpreter() and torch.cuda.get_device_capability()[0] != 10,
-                    reason="TMA Gather only works on cloud Blackwell Chips")
-def test_tma_gather(X, Y, BLOCK_X, BLOCK_Y, dtype, y, device):
-    if BLOCK_X > X or y + BLOCK_Y > Y:
-        pytest.skip()
-
-    torch.manual_seed(42)
-    if dtype != torch.int8:
-        input = torch.rand((X, Y), dtype=dtype, device=device)
-    else:
-        input = torch.arange(X * Y, dtype=dtype, device=device).reshape(X, Y)
-    output = torch.empty((BLOCK_X, BLOCK_Y), dtype=dtype, device=device)
-
-    idx = torch.randint(BLOCK_X, (BLOCK_X, ), dtype=torch.int32, device=device)
-
-    def alloc_fn(size: int, align: int, steam):
-        return torch.empty(size, dtype=torch.int8, device=device)
-
-    triton.set_allocator(alloc_fn)
-
-    tma_gather_rows_kernel[(1, )](output, input, idx, y, X, Y, BLOCK_X, BLOCK_Y)
-
-    ref = torch_gather_rows(input, idx, y, BLOCK_Y)
-    torch.testing.assert_close(ref, output, atol=0, rtol=0)
-
-
-@triton.jit
-def tma_gather_dot_pipeline(  #
-        a_ptr, b_ptr, output_ptr,  #
-        stride_am, stride_ak,  #
-        stride_bk, stride_bn,  #
-        stride_cm, stride_cn,  #
-        K: tl.constexpr,  #
-        BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,  #
-):
-    a_desc = tl.make_tensor_descriptor(a_ptr, [BLOCK_M, K], [K, 1], [1, BLOCK_K])
-    b_desc = tl.make_tensor_descriptor(b_ptr, [K, BLOCK_N], [BLOCK_N, 1], [1, BLOCK_N])
-
-    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=output_ptr.dtype.element_ty)
-    for k in range(0, K, BLOCK_K):
-        a = a_desc.gather(tl.arange(0, BLOCK_M), k)
-        b = b_desc.gather(tl.arange(0, BLOCK_K) + k, 0)
-        accumulator = tl.dot(a, b, acc=accumulator)
-
-    offs_cm = tl.arange(0, BLOCK_M)
-    offs_cn = tl.arange(0, BLOCK_N)
-    output_ptrs = output_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    tl.store(output_ptrs, accumulator)
-
-
-@pytest.mark.interpreter
-@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(16, 16, 16)])
-@pytest.mark.parametrize("K", [128])
-@pytest.mark.skipif(not is_interpreter() and torch.cuda.get_device_capability()[0] != 10,
-                    reason="TMA Gather only works on cloud Blackwell Chips")
-def test_tma_gather_dot_pipeline(BLOCK_M, BLOCK_N, BLOCK_K, K, device):
-
-    def alloc_fn(size: int, align: int, steam):
-        return torch.empty(size, dtype=torch.int8, device=device)
-
-    triton.set_allocator(alloc_fn)
-
-    a = torch.arange(BLOCK_M * K, device=device).reshape(BLOCK_M, K).float()
-    b = torch.arange(K * BLOCK_N, device=device).reshape(K, BLOCK_N).float()
-
-    c = a @ b
-
-    output = torch.zeros((BLOCK_M, BLOCK_N), dtype=torch.float32, device=device)
-    if not is_interpreter():
-        kernel = tma_gather_dot_pipeline.warmup(a, b, output, a.stride(0), a.stride(1), b.stride(0), b.stride(1),
-                                                output.stride(0), output.stride(1), K, BLOCK_M, BLOCK_N, BLOCK_K,
-                                                grid=(1, ))
-        assert kernel.asm["ttgir"].count("ttng.async_tma_gather") == 6
-    tma_gather_dot_pipeline[(1, 1, 1)](a, b, output, a.stride(0), a.stride(1), b.stride(0), b.stride(1),
-                                       output.stride(0), output.stride(1), K, BLOCK_M, BLOCK_N, BLOCK_K)
-
-    torch.testing.assert_close(c, output)
-
-
-def torch_scatter_rows(input, idx, y, block_y, X, Y):
-    out = torch.zeros((X, Y), dtype=input.dtype, device=input.device)
-    for i, j in enumerate(idx):
-        out[j][y:y + block_y] = input[i]
-    return out
-
-
-@triton.jit
-def tma_scatter_rows_kernel(out_ptr, in_ptr, idx_ptr, y, X: tl.constexpr, Y: tl.constexpr, BLOCK_X: tl.constexpr,
-                            BLOCK_Y: tl.constexpr):
-    idx = tl.load(idx_ptr + tl.arange(0, BLOCK_X))
-    data = tl.load(in_ptr + tl.arange(0, BLOCK_X)[:, None] * BLOCK_Y + tl.arange(0, BLOCK_Y)[None, :])
-    desc = tl.make_tensor_descriptor(out_ptr, [X, Y], [Y, 1], [1, BLOCK_Y])
-    desc.scatter(data, idx, y)
-
-
-@pytest.mark.interpreter
-@pytest.mark.parametrize("X, Y", [(128, 128), (64, 256)])
-@pytest.mark.parametrize("BLOCK_X, BLOCK_Y", [(32, 32), (64, 128), (16, 128), (512, 16)])
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int8])
-@pytest.mark.parametrize("y", [0, 32, 48])
-@pytest.mark.skipif(not is_interpreter() and torch.cuda.get_device_capability()[0] != 10,
-                    reason="TMA Gather only works on cloud Blackwell Chips")
-def test_tma_scatter(X, Y, BLOCK_X, BLOCK_Y, dtype, y):
-    if BLOCK_X > X or y + BLOCK_Y > Y:
-        pytest.skip()
-
-    torch.manual_seed(42)
-    input = torch.arange(BLOCK_X * BLOCK_Y, dtype=dtype, device='cuda').reshape(BLOCK_X, BLOCK_Y)
-    output = torch.zeros((X, Y), dtype=dtype, device='cuda')
-
-    idx = torch.randperm(BLOCK_X, dtype=torch.int32, device='cuda')
-
-    def alloc_fn(size: int, align: int, steam):
-        return torch.empty(size, dtype=torch.int8, device='cuda')
-
-    triton.set_allocator(alloc_fn)
-
-    tma_scatter_rows_kernel[(1, )](output, input, idx, y, X, Y, BLOCK_X, BLOCK_Y)
-
-    ref = torch_scatter_rows(input, idx, y, BLOCK_Y, X, Y)
-    torch.testing.assert_close(ref, output, atol=0, rtol=0)
-
-
 @requires_tma
 @pytest.mark.interpreter()
 @pytest.mark.parametrize("dtype_str", tma_dtypes)
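The deleted kernels above exercised the descriptor gather/scatter API directly (desc.gather(idx, y) and desc.scatter(data, idx, y)). With the new fallback patterns, targets without native TMA gather/scatter get the same semantics through an ordinary masked load or store. Below is a rough Triton-level sketch of what the gather fallback amounts to for a row-major [X, Y] descriptor with strides [Y, 1], mirroring the deleted tma_gather_rows_kernel; the kernel name is hypothetical, and the actual rewrite happens on the Triton IR rather than in Python.

import triton
import triton.language as tl

@triton.jit
def gather_rows_fallback_sketch(out_ptr, in_ptr, idx_ptr, y, X: tl.constexpr, Y: tl.constexpr,
                                BLOCK_X: tl.constexpr, BLOCK_Y: tl.constexpr):
    # Equivalent of desc.gather(idx, y) without TMA: explicit pointers plus a bounds mask.
    idx = tl.load(idx_ptr + tl.arange(0, BLOCK_X))
    rows = idx.to(tl.int64)[:, None]                            # x offsets, one per gathered row
    cols = (y + tl.arange(0, BLOCK_Y)).to(tl.int64)[None, :]    # y offset + iota
    ptrs = in_ptr + rows * Y + cols                             # base + x*stride0 + y*stride1
    mask = (rows >= 0) & (rows < X) & (cols >= 0) & (cols < Y)  # stay inside the [X, Y] shape
    out = tl.load(ptrs, mask=mask, other=0)                     # masked load with zero padding
    tl.store(out_ptr + tl.arange(0, BLOCK_X)[:, None] * BLOCK_Y + tl.arange(0, BLOCK_Y)[None, :], out)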

0 commit comments
