Skip to content

Commit fa8b7bb

Browse files
authored
[AMD] Support ConvertLayout in CanonicalizePointers (#6142)
The `CanonicalizePointers` pass was missing a rewrite pattern for `ConvertLayout`, so it could not rewrite that op to operate on the fat pointer's offset. Without this change the pass fails whenever the pointer of a `tt.load` has been transformed by a `ConvertLayout`. Added a lit test for this case and a general correctness test for indirect loads and stores.
1 parent 9ca8bd3 commit fa8b7bb

File tree

3 files changed

+122
-1
lines changed

3 files changed

+122
-1
lines changed

python/test/unit/language/test_core.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7211,3 +7211,47 @@ def aliasing_kernel(buffer, buffer2):
72117211
buffer = torch.zeros(1, device=device)
72127212
aliasing_kernel[(1, )](buffer, buffer)
72137213
assert buffer[0] == 1
7214+
7215+
7216+
@pytest.mark.interpreter
@pytest.mark.parametrize("dtype", list(dtypes) + ["bfloat16"])
def test_indirect_load(dtype, device):
    """Gather pattern: each loaded element's source position comes from a
    separately loaded offset tensor instead of the linear program index."""

    @triton.jit
    def indirect_load(offset_ptr, x_ptr, output_ptr, SIZE: tl.constexpr):
        idx = tl.arange(0, SIZE)
        gather_idx = tl.load(offset_ptr + idx)
        vals = tl.load(x_ptr + gather_idx)
        tl.store(output_ptr + idx, vals)

    SIZE = 512
    x = numpy_random(SIZE, dtype_str=dtype)
    x_tri = to_triton(x, device)
    # A reversed index range makes the kernel read the input back-to-front,
    # so the expected output is simply the flipped input.
    indices = torch.arange(SIZE, device=device, dtype=torch.int32).flip(0)
    out_tri = torch.empty(SIZE, device=device)
    indirect_load[(1, 1)](indices, x_tri, out_tri, SIZE)

    np.testing.assert_allclose(np.flip(x), to_numpy(out_tri))
7236+
7237+
7238+
@pytest.mark.interpreter
@pytest.mark.parametrize("dtype", list(dtypes) + ["bfloat16"])
def test_indirect_store(dtype, device):
    """Scatter pattern: each stored element's destination position comes from
    a separately loaded offset tensor instead of the linear program index."""

    @triton.jit
    def indirect_store(offset_ptr, x_ptr, output_ptr, SIZE: tl.constexpr):
        idx = tl.arange(0, SIZE)
        scatter_idx = tl.load(offset_ptr + idx)
        vals = tl.load(x_ptr + idx)
        tl.store(output_ptr + scatter_idx, vals)

    SIZE = 512
    x = numpy_random(SIZE, dtype_str=dtype)
    x_tri = to_triton(x, device)
    # A reversed index range makes the kernel write the input back-to-front,
    # so the expected output is simply the flipped input.
    indices = torch.arange(SIZE, device=device, dtype=torch.int32).flip(0)
    out_tri = torch.empty(SIZE, device=device)
    indirect_store[(1, 1)](indices, x_tri, out_tri, SIZE)

    np.testing.assert_allclose(np.flip(x), to_numpy(out_tri))

test/TritonGPU/amd/amd-canonicalize-pointers.mlir

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,40 @@ module attributes {"ttg.num-warps" = 4 : i32} {
132132

133133
// -----
134134

135+
#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  // Indirect load whose pointer passes through a ttg.convert_layout before
  // the final tt.load. The CHECK lines below verify that the pass converts
  // the integer offset tensor instead of the fat pointer itself.
  tt.func public @convertLayoutOp(%arg0: !tt.ptr<f32>, %arg1: !tt.ptr<i32>, %arg2: tensor<1024xi32, #blocked>) -> tensor<1024xf32, #blocked1> {
    %0 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
    %1 = tt.splat %arg1 : !tt.ptr<i32> -> tensor<1024x!tt.ptr<i32>, #blocked>
    %2 = tt.addptr %1, %arg2 : tensor<1024x!tt.ptr<i32>, #blocked>, tensor<1024xi32, #blocked>
    %3 = tt.load %2 : tensor<1024x!tt.ptr<i32>, #blocked>
    %4 = tt.addptr %0, %3 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
    %5 = ttg.convert_layout %4 : tensor<1024x!tt.ptr<f32>, #blocked> -> tensor<1024x!tt.ptr<f32>, #blocked1>
    %6 = tt.load %5 : tensor<1024x!tt.ptr<f32>, #blocked1>
    tt.return %6 : tensor<1024xf32, #blocked1>
  }
}

// CHECK: #[[$ATTR_0:.+]] = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
// CHECK: #[[$ATTR_1:.+]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>

// CHECK-LABEL: tt.func public @convertLayoutOp(
// CHECK-SAME: %[[VAL_0:.*]]: !tt.ptr<f32>, %[[VAL_1:.*]]: !tt.ptr<i32>, %[[VAL_2:.*]]: tensor<1024xi32, #[[$ATTR_0]]>) -> tensor<1024xf32, #[[$ATTR_1]]> {
// CHECK: %[[VAL_3:.*]] = tt.splat %[[VAL_1]] : !tt.ptr<i32> -> tensor<1024x!tt.ptr<i32>, #[[$ATTR_0]]>
// CHECK: %[[VAL_4:.*]] = tt.addptr %[[VAL_3]], %[[VAL_2]] : tensor<1024x!tt.ptr<i32>, #[[$ATTR_0]]>, tensor<1024xi32, #[[$ATTR_0]]>
// CHECK: %[[VAL_5:.*]] = tt.load %[[VAL_4]] : tensor<1024x!tt.ptr<i32>, #[[$ATTR_0]]>
// CHECK: %[[VAL_6:.*]] = arith.extsi %[[VAL_5]] : tensor<1024xi32, #[[$ATTR_0]]> to tensor<1024xi64, #[[$ATTR_0]]>
// CHECK: %[[VAL_7:.*]] = ttg.convert_layout %[[VAL_6]] : tensor<1024xi64, #[[$ATTR_0]]> -> tensor<1024xi64, #[[$ATTR_1]]>
// CHECK: %[[VAL_8:.*]] = arith.trunci %[[VAL_7]] : tensor<1024xi64, #[[$ATTR_1]]> to tensor<1024xi32, #[[$ATTR_1]]>
// CHECK: %[[VAL_9:.*]] = tt.splat %[[VAL_0]] : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #[[$ATTR_1]]>
// CHECK: %[[VAL_10:.*]] = tt.addptr %[[VAL_9]], %[[VAL_8]] : tensor<1024x!tt.ptr<f32>, #[[$ATTR_1]]>, tensor<1024xi32, #[[$ATTR_1]]>
// CHECK: %[[VAL_11:.*]] = tt.load %[[VAL_10]] : tensor<1024x!tt.ptr<f32>, #[[$ATTR_1]]>
// CHECK: tt.return %[[VAL_11]] : tensor<1024xf32, #[[$ATTR_1]]>
// CHECK: }
166+
167+
// -----
168+
135169
module attributes {"ttg.num-warps" = 4 : i32} {
136170
tt.func @forOp(%arg0: !tt.ptr<f32>, %arg1: tensor<1024xf32>) -> tensor<1024xf32> {
137171
%c1024_i32 = arith.constant 1024 : i32

third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,48 @@ class ConvertExpandDims
10791079
}
10801080
};
10811081

1082+
/// convert integer offset, keep base
1083+
class ConvertConvertLayoutOp
1084+
: public PointerCanonicalizationPattern<tt::gpu::ConvertLayoutOp> {
1085+
public:
1086+
using PointerCanonicalizationPattern::PointerCanonicalizationPattern;
1087+
1088+
LogicalResult
1089+
matchAndRewrite_(tt::gpu::ConvertLayoutOp cvtOp, OneToNOpAdaptor adaptor,
1090+
ConversionPatternRewriter &rewriter) const override {
1091+
ValueRange remappedOperands = adaptor.getSrc();
1092+
if (remappedOperands.size() != 2) {
1093+
// some prior op materialized the fat ptr, e.g.:
1094+
// %3 = tt.bitcast %2
1095+
// %4 = tt.splat %3
1096+
return success();
1097+
}
1098+
Value fatPtrBase = remappedOperands[0];
1099+
Value fatPtrOffset = remappedOperands[1];
1100+
if (!llvm::isa<tt::PointerType>(fatPtrBase.getType())) {
1101+
return rewriter.notifyMatchFailure(cvtOp,
1102+
"non tt.ptr base unimplemented");
1103+
}
1104+
auto offsetTensorTy = dyn_cast<RankedTensorType>(fatPtrOffset.getType());
1105+
if (!offsetTensorTy) {
1106+
return rewriter.notifyMatchFailure(
1107+
cvtOp, "non RankedTensorType offset unimplemented");
1108+
}
1109+
1110+
RankedTensorType outType = cvtOp.getResult().getType();
1111+
auto newOffsetType = RankedTensorType::get(outType.getShape(),
1112+
offsetTensorTy.getElementType(),
1113+
outType.getEncoding());
1114+
tt::gpu::ConvertLayoutOp cvtOffset =
1115+
rewriter.create<tt::gpu::ConvertLayoutOp>(cvtOp.getLoc(), newOffsetType,
1116+
fatPtrOffset);
1117+
rewriter.replaceOpWithMultiple(cvtOp, {{fatPtrBase, cvtOffset}});
1118+
fatPtrs[{fatPtrBase, cvtOffset}] = fatPtrs.at({fatPtrBase, fatPtrOffset});
1119+
1120+
return success();
1121+
}
1122+
};
1123+
10821124
template <typename SourceOp, int PtrLikeIdx = 0>
10831125
class MaterializeFatPointer : public PointerCanonicalizationPattern<SourceOp> {
10841126
public:
@@ -1452,7 +1494,8 @@ void TritonAMDGPUCanonicalizePointersPass::runOnOperation() {
14521494
RewritePatternSet patterns(&getContext());
14531495
patterns.add<
14541496
ConvertFuncOpArgsUnrealizedCasts, ConvertBroadcastOp, ConvertSplatOp,
1455-
ConvertAddPtrOp, MaterializeFatPointer<tt::AtomicCASOp>,
1497+
ConvertConvertLayoutOp, ConvertAddPtrOp,
1498+
MaterializeFatPointer<tt::AtomicCASOp>,
14561499
MaterializeFatPointer<tt::AtomicRMWOp>,
14571500
MaterializeFatPointer<tt::BitcastOp>, MaterializeFatPointer<tt::LoadOp>,
14581501
MaterializeFatPointer<triton::gpu::AsyncCopyGlobalToLocalOp>,

0 commit comments

Comments
 (0)