Commit 3cb3e69
[Backend] Support broadcasted layouts in warp shuffle gather codegen (#5395)
The original implementation sidesteps the issue of dealing with broadcasted layouts. While implementing layout selection for gather in the middle end, I found that it's pretty common for tensors to be too small to avoid broadcasting: e.g., with 4 warps of 32 threads each, a tensor needs at least 128 elements. It turns out that enabling broadcasting support is pretty trivial: in a broadcasted layout, a broadcasted thread can index into any other "group" of threads mapped to the same gather column, and the codegen does not vary across broadcasted warps, so we can use the pseudo-inverse of the source layout regardless of which preimages it picks.
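For intuition, here is a rough sketch of such a too-small tensor (modeled on the constructor style of Triton's LinearLayout unit tests; the helper lambda, function name, and exact signatures are assumptions for illustration, not code from this commit):

    #include <cassert>
    #include "triton/Tools/LinearLayout.h"

    using namespace mlir;
    using namespace mlir::triton;

    // 16 elements on 4 warps of 32 threads: 128 hardware slots, so the
    // layout must broadcast.
    void pseudoinvertExample(MLIRContext &ctx) {
      auto S = [&](StringRef name) { return StringAttr::get(&ctx, name); };

      // Lane bits 0-3 address the 16 elements; lane bit 4 and both warp bits
      // have zero bases, i.e. each half-warp and each warp holds a duplicate
      // copy of the data.
      LinearLayout src({{S("lane"), {{1}, {2}, {4}, {8}, {0}}},
                        {S("warp"), {{0}, {0}}}},
                       {S("dim0")});

      // Zero bases make the layout non-injective, so invert() would assert...
      assert(!src.isInvertible());
      // ...but pseudoinvert() still yields a right inverse B with
      // src(B(x)) == x, mapping each element back to one representative
      // (lane, warp) among its duplicates.
      LinearLayout inv = src.pseudoinvert();
      (void)inv;
    }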
1 parent 6f5baf6

6 files changed: +86 -14
include/triton/Tools/LinearLayout.h

Lines changed: 4 additions & 0 deletions
@@ -679,6 +679,10 @@ class LinearLayout {
 
   // Get the layout that is the inverse of this layout.
   [[nodiscard]] LinearLayout invert() const;
+  // Compute and return a pseudoinverse of this layout. This is a layout such
+  // that `B = A.pseudoinvert()` implies `A(B(x)) = x`. If `A` is invertible,
+  // then this returns `A^-1`.
+  [[nodiscard]] LinearLayout pseudoinvert() const;
 
   // For each in-dim, returns a bitmask of the "free variables" in the layout
   // function.
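As a concrete gloss of the new contract (ours, not from the header): a linear layout is a bit-matrix over GF(2), and broadcasting shows up as zero columns. With one broadcast input bit, A corresponds to M_A = [ I_4 | 0 ], a 4x5 matrix whose fifth column is zero; one pseudoinverse is M_B = [ I_4 ; 0 ] (I_4 stacked on a zero row), since M_A * M_B = I_4 and hence A(B(x)) = x for every x. In the other direction, B(A(y)) recovers y only with the broadcast bit cleared, which is why this is a pseudoinverse rather than an inverse.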

lib/Analysis/Utility.cpp

Lines changed: 2 additions & 7 deletions
@@ -491,13 +491,8 @@ bool GatherLoweringHelper::isWarpLocal() {
   // in the index and source tensors are the same. This means we don't need to
   // xor shuffle across threads before emitting index shuffles; we push warp
   // shuffling to layout conversions.
-  if (srcLayout->sublayout(kLane, otherDims) !=
-      idxLayout->sublayout(kLane, otherDims))
-    return false;
-
-  // Otherwise, the source layout has to be invertible. This primarily means
-  // the codegen path doesn't support broadcasted source layouts.
-  return srcLayout->isInvertible();
+  return srcLayout->sublayout(kLane, otherDims) ==
+         idxLayout->sublayout(kLane, otherDims);
 }
 
 unsigned getNumScratchElements(ArrayRef<unsigned> shape) {

lib/Conversion/TritonGPUToLLVM/GatherOpToLLVM.cpp

Lines changed: 4 additions & 3 deletions
@@ -240,9 +240,10 @@ void GatherOpConversion::emitWarpLocalGather(
   // `llvm.select` using `src_reg` to get the right one. `K` is the number of
   // elements per column owned by a thread.
 
-  // Fully invert the source layout. We know it is invertible because
-  // `isWarpLocal` checked this.
-  LinearLayout invSrcLayout = srcLayout.invert();
+  // Invert the source layout. It doesn't matter whether it is fully invertible
+  // with respect to anything except the register input dimension, since we
+  // know the others don't vary in ways that matter for codegen.
+  LinearLayout invSrcLayout = srcLayout.pseudoinvert();
 
   // Sanity check: the warp must be invariant to the index because otherwise the
   // gather would need to read across warps!
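One way to read the relaxed requirement (our gloss): if the source layout broadcasts over warps, then src(reg, lane, w) == src(reg, lane, w') for all warps w and w', so every warp holds the same gather column and emits identical shuffles; the pseudoinverse may pick any fixed representative warp without changing the generated code. Likewise, a broadcast lane bit means two lanes hold the same element, so an index shuffle can read from either. Only the register dimension must be honestly inverted, because the recovered register id feeds the `llvm.select` chain that picks among a thread's `K` shuffle results.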

lib/Tools/LinearLayout.cpp

Lines changed: 5 additions & 1 deletion
@@ -957,9 +957,13 @@ LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
 }
 
 LinearLayout LinearLayout::invert() const {
-  // A^-1(x) = A^-1(I(x)), thus A.invert() = I.invertAndCompose(A)
   assert(isInvertible() &&
          "A linear layout must be surjective and square to be invertible");
+  return pseudoinvert();
+}
+
+LinearLayout LinearLayout::pseudoinvert() const {
+  // A^-1(x) = A^-1(I(x)), thus A.invert() = I.invertAndCompose(A)
   LinearLayout identity = LinearLayout::empty();
   for (auto outDim : getOutDimNames()) {
     identity *= LinearLayout::identity1D(getOutDimSize(outDim), outDim, outDim);
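Why the identity-compose trick yields a pseudoinverse (a sketch under our reading of `invertAndCompose`'s contract): C = I.invertAndCompose(A) satisfies A(C(x)) = I(x) = x for all x, so C is a right inverse of A. When A is square and injective, C is exactly A^-1, which is why `invert()` can now simply assert invertibility and delegate. When A has zero (broadcast) basis vectors, C picks one preimage per output, which is all `pseudoinvert()` promises.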

test/Conversion/allocate_shared_memory.mlir

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 // RUN: triton-opt %s --allocate-shared-memory | FileCheck %s
 
-#blocked = #ttg.blocked<{sizePerThread = [32, 32], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
 
 // CHECK-LABEL: module
 // CHECK-SAME: ttg.shared = 131072 : i32

test/Conversion/gather_to_llvm.mlir

Lines changed: 70 additions & 2 deletions
@@ -16,7 +16,10 @@
 #crazy_2d_src = #ttg.linear<{register = [[0, 2], [2, 0]], lane = [[0, 8], [8, 0], [1, 0], [4, 0], [16, 0]], warp = [[0, 1], [0, 4]], block = []}>
 #crazy_2d_idx = #ttg.linear<{register = [[2, 0], [0, 2]], lane = [[0, 8], [16, 0], [1, 0], [8, 0], [4, 0]], warp = [[0, 1], [0, 4]], block = []}>
 
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
+#broadcasted_lane_1d = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#broadcasted_warp_2d = #ttg.blocked<{sizePerThread = [2, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
 
 // Each source element is mapped to a single thread, so we expect one index shuffle.
 // CHECK-LABEL: @gather_warp_local_trivial
@@ -222,6 +225,59 @@ tt.func private @gather_2d_crazy(%arg0: tensor<32x16xi32, #crazy_2d_idx>, %arg1:
   tt.return %0 : tensor<32x16xf32, #crazy_2d_idx>
 }
 
+// There are 16 elements in the tensor. For each warp, each half-warp is mapped
+// to the 16 elements, so it doesn't matter if the second half [16, 32) indexes
+// into [0, 16), since they contain the same data.
+// CHECK-LABEL: @gather_broadcasted_lane_1d
+tt.func private @gather_broadcasted_lane_1d(%arg0: tensor<16xi32, #broadcasted_lane_1d>, %arg1: tensor<16xf32, #broadcasted_lane_1d>) -> tensor<16xf32, #broadcasted_lane_1d> {
+  // CHECK-NEXT: [[SRC:%.*]] = extractvalue { float } %1, 0
+  // CHECK-NEXT: [[IDX:%.*]] = extractvalue { i32 } %0, 0
+
+  // CHECK-NEXT: [[LANEID:%.*]] = and i32 [[IDX]], 15
+  // CHECK-NEXT: [[VALUE:%.*]] = bitcast float [[SRC]] to i32
+  // CHECK-NEXT: [[RES_i32:%.*]] = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 [[VALUE]], i32 [[LANEID]], i32 31)
+  %0 = tt.gather %arg1[%arg0] {axis = 0 : i32} : (tensor<16xf32, #broadcasted_lane_1d>, tensor<16xi32, #broadcasted_lane_1d>) -> tensor<16xf32, #broadcasted_lane_1d>
+
+  // CHECK-NEXT: [[RES:%.*]] = bitcast i32 [[RES_i32]] to float
+  // CHECK-NEXT: ret float [[RES]]
+  tt.return %0 : tensor<16xf32, #broadcasted_lane_1d>
+}
+
+// Single gather column with 64 elements, all of which have to fit into a single
+// warp, so the whole column is broadcasted across the 4 warps. Each warp
+// processes the same data, so the warp doesn't matter.
+// CHECK-LABEL: @gather_broadcasted_warp_2d
+tt.func private @gather_broadcasted_warp_2d(%arg0: tensor<64x1xi32, #broadcasted_warp_2d>, %arg1: tensor<64x1xf32, #broadcasted_warp_2d>) -> tensor<64x1xf32, #broadcasted_warp_2d> {
+  // CHECK-NEXT: [[SRC0:%.*]] = extractvalue { float, float } %1, 0
+  // CHECK-NEXT: [[SRC1:%.*]] = extractvalue { float, float } %1, 1
+  // CHECK-NEXT: [[IDX0:%.*]] = extractvalue { i32, i32 } %0, 0
+  // CHECK-NEXT: [[IDX1:%.*]] = extractvalue { i32, i32 } %0, 1
+
+  // CHECK-NEXT: [[REGID0:%.*]] = and i32 [[IDX0]], 1
+  // CHECK-NEXT: [[TMP:%.*]] = lshr i32 [[IDX0]], 1
+  // CHECK-NEXT: [[LANEID0:%.*]] = and i32 [[TMP]], 31
+
+  // CHECK-NEXT: [[VALUE0:%.*]] = bitcast float [[SRC0]] to i32
+  // CHECK-NEXT: [[RES0_i32:%.*]] = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 [[VALUE0]], i32 [[LANEID0]], i32 31)
+  // CHECK-NEXT: [[VALUE1:%.*]] = bitcast float [[SRC1]] to i32
+  // CHECK-NEXT: [[RES1_i32:%.*]] = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 [[VALUE1]], i32 [[LANEID0]], i32 31)
+
+  // CHECK-NEXT: [[PICK0:%.*]] = icmp eq i32 [[REGID0]], 0
+  // CHECK-NEXT: select i1 [[PICK0]], i32 [[RES0_i32]], i32 [[RES1_i32]]
+
+  // CHECK: [[REGID1:%.*]] = and i32 [[IDX1]], 1
+  // CHECK-NEXT: [[TMP:%.*]] = lshr i32 [[IDX1]], 1
+  // CHECK-NEXT: [[LANEID1:%.*]] = and i32 [[TMP]], 31
+
+  // CHECK-NEXT: [[RES0_i32:%.*]] = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 [[VALUE0]], i32 [[LANEID1]], i32 31)
+  // CHECK-NEXT: [[RES1_i32:%.*]] = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 [[VALUE1]], i32 [[LANEID1]], i32 31)
+
+  // CHECK-NEXT: [[PICK1:%.*]] = icmp eq i32 [[REGID1]], 0
+  // CHECK-NEXT: select i1 [[PICK1]], i32 [[RES0_i32]], i32 [[RES1_i32]]
+  %0 = tt.gather %arg1[%arg0] {axis = 0 : i32} : (tensor<64x1xf32, #broadcasted_warp_2d>, tensor<64x1xi32, #broadcasted_warp_2d>) -> tensor<64x1xf32, #broadcasted_warp_2d>
+  tt.return %0 : tensor<64x1xf32, #broadcasted_warp_2d>
+}
+
 // Keep LLVM from DCE'ing the above functions. Use volatile stores to stop LLVM
 // from removing unused function results.
 tt.func @anchor(%ptr: !llvm.ptr,
@@ -235,7 +291,11 @@
                 %arg7: tensor<32x2xi32, #span_2d_cols>,
                 %arg8: tensor<32x2xf32, #span_2d_cols>,
                 %arg9: tensor<32x16xi32, #crazy_2d_idx>,
-                %arg10: tensor<32x16xf32, #crazy_2d_src>) {
+                %arg10: tensor<32x16xf32, #crazy_2d_src>,
+                %arg11: tensor<16xi32, #broadcasted_lane_1d>,
+                %arg12: tensor<16xf32, #broadcasted_lane_1d>,
+                %arg13: tensor<64x1xi32, #broadcasted_warp_2d>,
+                %arg14: tensor<64x1xf32, #broadcasted_warp_2d>) {
 
   %0 = tt.call @gather_warp_local_trivial(%arg0, %arg1) : (tensor<32xi32, #trivial_layout>, tensor<32xf32, #trivial_layout>) -> tensor<32xf32, #trivial_layout>
   %1 = builtin.unrealized_conversion_cast %0 : tensor<32xf32, #trivial_layout> to !llvm.struct<(f32)>
@@ -265,6 +325,14 @@
   %13 = builtin.unrealized_conversion_cast %12 : tensor<32x16xf32, #crazy_2d_idx> to !llvm.struct<(f32, f32, f32, f32)>
   llvm.store volatile %13, %ptr : !llvm.struct<(f32, f32, f32, f32)>, !llvm.ptr
 
+  %14 = tt.call @gather_broadcasted_lane_1d(%arg11, %arg12) : (tensor<16xi32, #broadcasted_lane_1d>, tensor<16xf32, #broadcasted_lane_1d>) -> tensor<16xf32, #broadcasted_lane_1d>
+  %15 = builtin.unrealized_conversion_cast %14 : tensor<16xf32, #broadcasted_lane_1d> to !llvm.struct<(f32)>
+  llvm.store volatile %15, %ptr : !llvm.struct<(f32)>, !llvm.ptr
+
+  %16 = tt.call @gather_broadcasted_warp_2d(%arg13, %arg14) : (tensor<64x1xi32, #broadcasted_warp_2d>, tensor<64x1xf32, #broadcasted_warp_2d>) -> tensor<64x1xf32, #broadcasted_warp_2d>
+  %17 = builtin.unrealized_conversion_cast %16 : tensor<64x1xf32, #broadcasted_warp_2d> to !llvm.struct<(f32, f32)>
+  llvm.store volatile %17, %ptr : !llvm.struct<(f32, f32)>, !llvm.ptr
+
   tt.return
 }
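The index arithmetic in the new CHECK lines is the pseudoinverse at work (our gloss): in the 1-D test the 16-element column occupies lane bits 0-3, so the shuffle source is simply `idx & 15`, and lane bit 4 (the broadcast half-warp bit) never appears. In the 2-D test each thread owns two of the 64 column elements, so the inverse splits the index into `reg = idx & 1` and `lane = (idx >> 1) & 31`; the two shuffle results are chosen between by `icmp`/`select` on the register bit, and the warp bits are absent entirely.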
