Commit 82fae4e
[ANALYSIS] Don't consider descending sequences as contiguous in AxisInfoAnalysis (#4871)
Contiguity is used to issue wide load operations instead of multiple narrower loads. This always assumes that the address of the first element in a sequence can be used to load the whole sequence. If the sequence is descending, this produces a wrong wide load. This patch fixes that by not preserving the contiguity of the RHS for the SubIOp operation.

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)

---------

Signed-off-by: Ilya Enkovich <[email protected]>
1 parent e66f5ab commit 82fae4e
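To illustrate the failure mode described in the commit message (a hand-written sketch in plain Python, not the compiler's actual codegen): with a descending index sequence, the first element sits at the highest address, so a wide load anchored at that address covers different elements than the individual scalar loads would.

    # Illustration only: a descending offset sequence such as the one in the
    # regression test below, in_ptr + (512 - x0).
    offsets = [512 - i for i in range(4)]           # [512, 511, 510, 509] -- descending
    wide_load = [offsets[0] + i for i in range(4)]  # a 4-wide load anchored at the first
                                                    # element would read [512, 513, 514, 515]
    assert wide_load != offsets  # the vectorized load fetches the wrong elements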

File tree

3 files changed: +24 −2 lines changed

- lib/Analysis/AxisInfo.cpp
- python/test/regression/test_functional_regressions.py
- test/Analysis/test-alignment.mlir
lib/Analysis/AxisInfo.cpp

Lines changed: 5 additions & 0 deletions
@@ -278,6 +278,11 @@ class AddSubOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
 private:
   int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
                         int dim) override {
+    // Contiguity assumes an increasing sequence. So for SubIOp contiguous
+    // RHS doesn't produce a contiguous result.
+    if (isa<arith::SubIOp>(op))
+      return gcd(lhs.getContiguity(dim), rhs.getConstancy(dim));
+
     return std::max(gcd(lhs.getConstancy(dim), rhs.getContiguity(dim)),
                     gcd(lhs.getContiguity(dim), rhs.getConstancy(dim)));
   }
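As a rough illustration of the rule added in this hunk (a Python model only, not the real AxisInfo implementation), here is how the old and new contiguity computations differ on the operands used in the lit test below: a make_range-style tensor (contiguity 128, constancy 1) and a splat constant (contiguity 1, constancy 128).

    from math import gcd

    # Per-dimension (contiguity, constancy) pairs, modeled after the lit test below.
    range_128 = (128, 1)   # e.g. tt.make_range 0..128: fully contiguous
    splat_1   = (1, 128)   # e.g. arith.constant dense<1>: fully constant

    def sub_contiguity_old(lhs, rhs):
        # Previous rule: symmetric in lhs and rhs.
        return max(gcd(lhs[1], rhs[0]), gcd(lhs[0], rhs[1]))

    def sub_contiguity_new(lhs, rhs):
        # New rule for subi: a contiguous RHS yields a descending result,
        # so only the lhs-contiguous / rhs-constant case contributes.
        return gcd(lhs[0], rhs[1])

    print(sub_contiguity_old(splat_1, range_128))  # 128 -- wrongly treated as contiguous
    print(sub_contiguity_new(splat_1, range_128))  # 1   -- matches the updated CHECK line
    print(sub_contiguity_new(range_128, splat_1))  # 128 -- range minus constant stays contiguous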

python/test/regression/test_functional_regressions.py

Lines changed: 15 additions & 0 deletions
@@ -224,3 +224,18 @@ def grid(META):
                  BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K, type=type,  #
                  num_stages=num_stages)
     torch.testing.assert_close(torch_output, triton_output, rtol=1e-2, atol=1e-2)
+
+
+def test_reverse_range(device):
+
+    @triton.jit
+    def kernel(in_ptr, out_ptr):
+        x0 = tl.arange(0, 512)
+        tmp0 = tl.load(in_ptr + (512 - x0))
+        tl.store(out_ptr + x0, tmp0)
+
+    data = torch.randn((516, ), dtype=torch.float32, device=device)
+    res = torch.empty((512, ), dtype=torch.float32, device=device)
+    kernel[(1, )](data, res)
+    ref = torch.flip(data[1:513], [0])
+    assert (res == ref).all()
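For reference, the torch.flip expression above follows from the index arithmetic: the kernel reads indices 512 down to 1, which is exactly data[1:513] reversed. A small standalone check of that equivalence (independent of Triton, assuming torch is available):

    import torch

    # Sanity check of the reference value used in the test: descending indices
    # 512, 511, ..., 1 select the same elements as data[1:513] flipped.
    data = torch.randn((516, ), dtype=torch.float32)
    idx = 512 - torch.arange(512)
    assert torch.equal(data[idx], torch.flip(data[1:513], [0]))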

test/Analysis/test-alignment.mlir

Lines changed: 4 additions & 2 deletions
@@ -97,10 +97,12 @@ tt.func @sub() {
   %1 = arith.constant dense<1> : tensor<128xi32>
   // CHECK-NEXT: contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>
   %2 = arith.subi %0, %1 : tensor<128xi32>
+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  %3 = arith.subi %1, %0 : tensor<128xi32>
   // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 129
-  %3 = arith.constant dense<129> : tensor<128xi32>
+  %4 = arith.constant dense<129> : tensor<128xi32>
   // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
-  %4 = arith.subi %3, %1 : tensor<128xi32>
+  %5 = arith.subi %4, %1 : tensor<128xi32>
   tt.return
 }

0 commit comments