Commit 766f7fa
Fix assertion in setOptimizedGatherLayout for 1D tensor (#6959)
Fixes #6958. `setOptimizedGatherLayout` only works properly for a GatherOp whose input tensor has rank >= 2; the assertion is hit when the input operand is a 1D tensor. Return failure early for 1D tensors and skip the subsequent optimization, which does not apply to them.

Signed-off-by: Lu,Chengjun <[email protected]>
1 parent d25fc5f commit 766f7fa
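For context, a `tl.gather` with a 1D source previously crashed compilation on the `setOptimizedGatherLayout` assertion. Below is a minimal standalone sketch of the failing case, modeled on the 1D test this commit adds; the kernel name, `SRC`/`IDX` parameters, and the CUDA device are illustrative assumptions, not part of the commit:

import torch
import triton
import triton.language as tl


@triton.jit
def gather_1d(src_ptr, idx_ptr, out_ptr, SRC: tl.constexpr, IDX: tl.constexpr):
    # Load the whole 1D source and index tensors, gather along axis 0,
    # and store the result.
    src = tl.load(src_ptr + tl.arange(0, SRC))
    idx = tl.load(idx_ptr + tl.arange(0, IDX))
    tl.store(out_ptr + tl.arange(0, IDX), tl.gather(src, idx, 0))


src = torch.randn(32, device="cuda")
idx = torch.randint(0, 32, (64, ), device="cuda", dtype=torch.int32)
out = torch.empty(64, device="cuda", dtype=src.dtype)
gather_1d[(1, )](src, idx, out, SRC=32, IDX=64)
# Before this fix, compiling the kernel hit the assertion; with the fix the
# layout optimization is skipped and the result matches torch.gather.
torch.testing.assert_close(out, torch.gather(src, 0, idx.to(torch.int64)))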

3 files changed, +47 -7 lines changed

lib/Dialect/TritonGPU/Transforms/OptimizeThreadLocality.cpp

Lines changed: 6 additions & 3 deletions
@@ -99,7 +99,7 @@ static RankedTensorType replaceEncoding(RankedTensorType oldType,
 
 // This function considers a gather op in isolation and attempts to determine
 // whether an optimized layout can be applied to the source and index tensors.
-static void setOptimizedGatherLayout(GatherOp op, RewriterBase &b) {
+static LogicalResult setOptimizedGatherLayout(GatherOp op, RewriterBase &b) {
   RankedTensorType srcType = op.getSrc().getType();
   RankedTensorType idxType = op.getIndices().getType();
 
@@ -137,6 +137,8 @@ static void setOptimizedGatherLayout(GatherOp op, RewriterBase &b) {
   // for `sizePerThread[axis]`.
   unsigned axis = op.getAxis();
   unsigned rank = srcType.getRank();
+  if (rank == 1)
+    return failure();
   SmallVector<unsigned> threadsPerWarp(rank);
   SmallVector<unsigned> warpsPerCTA(rank);
   SmallVector<unsigned> order;
@@ -223,6 +225,8 @@ static void setOptimizedGatherLayout(GatherOp op, RewriterBase &b) {
 
   // Make sure we did this right.
   assert(GatherLoweringHelper(op).isWarpLocal());
+
+  return success();
 }
 
 namespace {
@@ -233,8 +237,7 @@ struct OptimizeGatherLayoutPattern : public mlir::OpRewritePattern<GatherOp> {
                 PatternRewriter &rewriter) const override {
     if (op.getEfficientLayout())
       return failure();
-    setOptimizedGatherLayout(op, rewriter);
-    return success();
+    return setOptimizedGatherLayout(op, rewriter);
   }
 };
 } // namespace
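Propagating the LogicalResult out of the pattern follows the usual MLIR rewrite-pattern contract: a pattern that leaves the IR unchanged must report failure() so the greedy driver knows no progress was made, rather than claiming success and risking being re-applied to the same op indefinitely.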

python/test/unit/language/test_core.py

Lines changed: 23 additions & 4 deletions
@@ -7184,8 +7184,24 @@ def gather_test_kernel(src_ptr, idx_ptr, out_ptr, axis: tl.constexpr, src_dim0:
     tl.store(out_ptr + out_offs, out)
 
 
+@triton.jit
+def gather_test_kernel_1d(src_ptr, idx_ptr, out_ptr, axis: tl.constexpr, src_dim0: tl.constexpr, idx_dim0: tl.constexpr,
+                          out_dim0: tl.constexpr):
+    src_offs = tl.arange(0, src_dim0)
+    src = tl.load(src_ptr + src_offs)
+
+    idx_offs = tl.arange(0, idx_dim0)
+    idx = tl.load(idx_ptr + idx_offs)
+
+    out = tl.gather(src, idx, axis)
+
+    out_offs = tl.arange(0, out_dim0)
+    tl.store(out_ptr + out_offs, out)
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize("src_shape, indices_shape, axis", [
+    ([32], [64], 0),
     ([4, 4], [8, 4], 0),
     ([128, 64], [256, 64], 0),
     ([128, 64], [128, 128], 1),
@@ -7195,10 +7211,13 @@ def test_gather(src_shape, indices_shape, axis, device):
     def triton_gather(src: torch.Tensor, axis: int, indices: torch.Tensor):
         output = torch.empty(indices.shape, dtype=src.dtype, device=src.device)
 
-        gather_test_kernel[(1, )](src, indices, output, axis, src.shape[0],
-                                  src.shape[1], src.stride(0), src.stride(1), indices.shape[0], indices.shape[1],
-                                  indices.stride(0), indices.stride(1), output.shape[0], output.shape[1],
-                                  output.stride(0), output.stride(1))
+        if len(src_shape) == 1:
+            gather_test_kernel_1d[(1, )](src, indices, output, axis, src.shape[0], indices.shape[0], output.shape[0])
+        else:
+            gather_test_kernel[(1, )](src, indices, output, axis, src.shape[0], src.shape[1], src.stride(0),
+                                      src.stride(1), indices.shape[0], indices.shape[1], indices.stride(0),
+                                      indices.stride(1), output.shape[0], output.shape[1], output.stride(0),
+                                      output.stride(1))
 
         return output

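The new ([32], [64], 0) shape runs through the same test_gather harness as the 2D cases; something like `pytest python/test/unit/language/test_core.py -k test_gather` (standard pytest name filtering) should exercise it alongside the existing shapes.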
test/TritonGPU/optimize-locality.mlir

Lines changed: 18 additions & 0 deletions
@@ -769,3 +769,21 @@ tt.func @set_warp_shuffle_layout_large_source(%arg0: tensor<256x256xf32, #blocked
 }
 
 }
+
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+
+// CHECK: [[LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+
+// CHECK: skip_optimize_on_1d_tensor
+tt.func @skip_optimize_on_1d_tensor(%arg0: tensor<256xf32, #blocked>, %arg1: tensor<8xi32, #blocked>) -> tensor<8xf32, #blocked> {
+  // CHECK: tt.gather {{.*}} [[LAYOUT]]>
+  %0 = tt.gather %arg0[%arg1] {axis = 0 : i32} : (tensor<256xf32, #blocked>, tensor<8xi32, #blocked>) -> tensor<8xf32, #blocked>
+  tt.return %0 : tensor<8xf32, #blocked>
+}
+
+}
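This hunk relies on the lit RUN line at the top of optimize-locality.mlir, which sits outside the diff context; presumably it pipes the file through triton-opt with -split-input-file and the thread-locality optimization pass before FileCheck, which is why the new case is separated with `// -----`. The CHECK lines then assert that the 1D tt.gather keeps its original #blocked layout instead of receiving an optimized gather layout.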
