intel
diff --git a/‎include/triton/Analysis/AxisInfo.h‎
Lines changed: 58 additions & 3 deletions b/‎include/triton/Analysis/AxisInfo.h‎
Lines changed: 58 additions & 3 deletions
diff --git a/‎test/Analysis/intel/test-axis-info.mlir‎
Lines changed: 27 additions & 6 deletions b/‎test/Analysis/intel/test-axis-info.mlir‎
Lines changed: 27 additions & 6 deletions
diff --git a/‎third_party/intel/lib/Analysis/AxisInfo.cpp‎
Lines changed: 72 additions & 15 deletions b/‎third_party/intel/lib/Analysis/AxisInfo.cpp‎
Lines changed: 72 additions & 15 deletions
@@ -25,20 +25,73 @@ class AxisInfo {
   typedef SmallVector<int64_t> DimVectorT;
 
 public:
-  AxisInfo() : AxisInfo({}, {}, {}) {}
+  AxisInfo() : AxisInfo({}, {}, {}, {}, std::nullopt) {}
 
   AxisInfo(ArrayRef<int64_t> contiguity, ArrayRef<int64_t> divisibility,
            ArrayRef<int64_t> constancy)
       : AxisInfo(contiguity, divisibility, constancy, std::nullopt) {}
 
   AxisInfo(ArrayRef<int64_t> contiguity, ArrayRef<int64_t> divisibility,
            ArrayRef<int64_t> constancy, std::optional<int64_t> constantValue)
-      : contiguity(contiguity), divisibility(divisibility),
+      : AxisInfo(AxisInfo::DimVectorT(contiguity.size(), -1), contiguity,
+                 divisibility, constancy, constantValue) {
+    for (size_t i = 0; i < contiguity.size(); ++i) {
+      if (contiguity[i] > 1) {
+        stride[i] = 1;
+      }
+    }
+  }
+
+  AxisInfo(ArrayRef<int64_t> stride, ArrayRef<int64_t> contiguity,
+           ArrayRef<int64_t> divisibility, ArrayRef<int64_t> constancy,
+           std::optional<int64_t> constantValue)
+      : stride(stride), contiguity(contiguity), divisibility(divisibility),
         constancy(constancy), constantValue(constantValue) {
+    assert(stride.size() == contiguity.size());
     assert(divisibility.size() == contiguity.size());
     assert(constancy.size() == contiguity.size());
   }
 
+  // TODO: Support strides that are not compile-time constants.
+  // stride[d] is the constant difference between consecutive elements along
+  // dimension d, within each run of contiguityWithStride[d] elements (described
+  // further below). The value -1 represents an unknown stride.
+  // For example, the 2D array
+  //
+  //   [[10, 11, 12, 13, 18, 19, 20, 21],
+  //    [20, 21, 22, 23, 28, 29, 30, 31]]
+  //
+  // has stride [10, 1], and
+  //
+  //   [[12, 16, 20, 24],
+  //    [13, 17, 21, 25],
+  //    [14, 18, 22, 26],
+  //    [15, 19, 23, 27],
+  //    [18, 22, 26, 30],
+  //    [19, 23, 27, 31]]
+  //
+  // has stride [1, 4].
+  int64_t getStride(size_t dim) const { return stride[dim]; } // stride along `dim`
+  const DimVectorT &getStride() const { return stride; } // strides of all dims
+
+  // TODO: Add contiguity with stride (not implemented yet; the text below
+  // documents the intended meaning).
+  // contiguityWithStride[d] is the length of the shortest run of consecutive
+  // elements along dimension d that are separated by the same stride. For
+  // example,
+  // the 2D array
+  //
+  //   [[10, 11, 12, 13, 18, 19, 20, 21],
+  //    [20, 21, 22, 23, 28, 29, 30, 31]]
+  //
+  // has contiguityWithStride [2, 4], and
+  //
+  //   [[12, 16, 20, 24],
+  //    [13, 17, 21, 25],
+  //    [14, 18, 22, 26],
+  //    [15, 19, 23, 27],
+  //    [18, 22, 26, 30],
+  //    [19, 23, 27, 31]]
+  //
+  // has contiguityWithStride [2, 4].
+
   // contiguity[d] is the length of the shortest sequence of contiguous integers
   // along dimension d.
   //
@@ -134,7 +187,8 @@ class AxisInfo {
       llvm::interleaveComma(vec, os);
       os << "]";
     };
-    print("contiguity", contiguity);
+    print("stride", stride);
+    print(", contiguity", contiguity);
     print(", divisibility", divisibility);
     print(", constancy", constancy);
     os << ", constant_value = ";
@@ -145,6 +199,7 @@ class AxisInfo {
   }
 
 private:
+  DimVectorT stride;
   DimVectorT contiguity;
   DimVectorT divisibility;
   DimVectorT constancy;
 
@@ -702,13 +702,13 @@ tt.func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
   %7 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
   %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
   %9 = tt.splat %n_elements : i32 -> tensor<64xi32>
-  // CHECK: arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
+  // CHECK: arith.cmpi slt, %{{.*}} => stride = [-1], contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
   %mask = arith.cmpi slt, %4, %9 : tensor<64xi32>
   %11 = tt.load %6, %mask : tensor<64x!tt.ptr<f32>>
   %12 = tt.load %8, %mask : tensor<64x!tt.ptr<f32>>
   %13 = arith.addf %11, %12 : tensor<64xf32>
   %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
-  // CHECK: tt.addptr %{{.*}} => contiguity = [64], divisibility = [16], constancy = [1], constant_value = <none>
+  // CHECK: tt.addptr %{{.*}} => stride = [1], contiguity = [64], divisibility = [16], constancy = [1], constant_value = <none>
   %15 = tt.addptr %14, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
   tt.store %15, %13, %mask : tensor<64x!tt.ptr<f32>>
   tt.return
@@ -731,7 +731,7 @@ tt.func @vecadd_mask_align_1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %
   %7 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
   %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
   %9 = tt.splat %n_elements : i32 -> tensor<64xi32>
-  // CHECK: arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
+  // CHECK: arith.cmpi slt, %{{.*}} => stride = [-1], contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
   %10 = arith.cmpi slt, %4, %9 : tensor<64xi32>
   %11 = tt.load %6, %10 : tensor<64x!tt.ptr<f32>>
   %12 = tt.load %8, %10 : tensor<64x!tt.ptr<f32>>
@@ -885,11 +885,32 @@ tt.func public @make_tensor_ptr(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f8E5M2> {tt.
   %c1_i64 = arith.constant 1 : i64
   %c32_i64 = arith.constant 32 : i64
   %c128_i64 = arith.constant 128 : i64
-  // CHECK: tt.make_tensor_ptr %arg0, {{.*}} => contiguity = [128, 32], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // CHECK: tt.make_tensor_ptr %arg0, {{.*}} => stride = [1, 1], contiguity = [128, 32], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
   %0 = tt.make_tensor_ptr %arg0, [%c128_i64, %c32_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : !tt.ptr<tensor<128x32xf16>>
-  // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [64, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = <none>
+  // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => stride = [1, -1], contiguity = [64, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = <none>
   %1 = tt.make_tensor_ptr %arg1, [%c32_i64, %c32_i64], [%c1_i64, %arg2], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x16xf8E5M2>>
-  // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [32, 64], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => stride = [1, 1], contiguity = [32, 64], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
   %2 = tt.make_tensor_ptr %arg1, [%arg2, %c128_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf8E5M2>>
   tt.return
 }
+
+// -----
+
+// CHECK-LABEL: @ptr_offset
+tt.func public @ptr_offset(%arg0: i32) {
+  // CHECK: stride = [0, 0], contiguity = [1, 1], divisibility = [512, 512], constancy = [128, 1], constant_value = 512
+  %cst = arith.constant dense<512> : tensor<128x1xi32>
+  // CHECK: stride = [0], contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
+  %0 = tt.splat %arg0 : i32 -> tensor<128xi32>
+  // CHECK: stride = [1], contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
+  %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
+  // CHECK: stride = [1], contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>
+  %2 = arith.addi %0, %1 : tensor<128xi32>
+  // CHECK: stride = [1, 0], contiguity = [128, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
+  %3 = tt.expand_dims %2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
+  // CHECK: stride = [512, 0], contiguity = [1, 1], divisibility = [512, 512], constancy = [1, 1], constant_value = <none>
+  %4 = arith.muli %3, %cst : tensor<128x1xi32>
+  // CHECK: stride = [512, 0], contiguity = [1, 1], divisibility = [512, 512], constancy = [1, 64], constant_value = <none>
+  %5 = tt.broadcast %4 : tensor<128x1xi32> -> tensor<128x64xi32>
+  tt.return
+}
@@ -107,28 +107,37 @@ class BinaryOpVisitorImpl : public AxisInfoVisitorImpl<OpTy> {
     const auto &rhsInfo = operands[1]->getValue();
     auto rank = lhsInfo.getRank();
     assert(operands.size() == 2 && "Expected two operands");
+    AxisInfo::DimVectorT stride;
     AxisInfo::DimVectorT contiguity;
     AxisInfo::DimVectorT divisibility;
     AxisInfo::DimVectorT constancy;
     auto constantValue = getConstantValue(op, lhsInfo, rhsInfo);
     for (auto d = 0; d < rank; ++d) {
       if (constantValue.has_value()) {
+        stride.push_back(0);
         contiguity.push_back(1);
         constancy.push_back(
             std::max(lhsInfo.getConstancy(d), rhsInfo.getConstancy(d)));
         divisibility.push_back(
             highestPowOf2Divisor<int64_t>(constantValue.value()));
       } else {
+        stride.push_back(getStride(op, lhsInfo, rhsInfo, d));
         contiguity.push_back(getContiguity(op, lhsInfo, rhsInfo, d));
         constancy.push_back(getConstancy(op, lhsInfo, rhsInfo, d));
         divisibility.push_back(getDivisibility(op, lhsInfo, rhsInfo, d));
       }
     }
-    return AxisInfo(std::move(contiguity), std::move(divisibility),
-                    std::move(constancy), constantValue);
+    return AxisInfo(std::move(stride), std::move(contiguity),
+                    std::move(divisibility), std::move(constancy),
+                    constantValue);
   }
 
 protected:
+  // Default stride rule for a binary op: the stride along `dim` is reported as
+  // unknown (-1). Visitors for specific ops override this hook.
+  virtual int64_t getStride(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
+                            int dim) {
+    constexpr int64_t unknownStride = -1;
+    return unknownStride;
+  }
+
   virtual int64_t getContiguity(OpTy op, const AxisInfo &lhs,
                                 const AxisInfo &rhs, int dim) {
     return 1;
@@ -252,7 +261,7 @@ class ConstantOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
         value = intAttr.getValue().getZExtValue();
       else
         value = boolAttr.getValue() ? 1 : 0;
-      return AxisInfo(/*contiguity=*/{1},
+      return AxisInfo(/*stride=*/{0}, /*contiguity=*/{1},
                       /*divisibility=*/{highestPowOf2Divisor(value)},
                       /*constancy=*/{1},
                       /*knownConstantValue=*/{value});
@@ -263,6 +272,7 @@ class ConstantOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
       int64_t value = splatAttr.template getSplatValue<APInt>().getZExtValue();
       TensorType ty = cast<TensorType>(splatAttr.getType());
       return AxisInfo(
+          /*stride=*/AxisInfo::DimVectorT(ty.getRank(), 0),
           /*contiguity=*/AxisInfo::DimVectorT(ty.getRank(), 1),
           /*divisibility=*/
           AxisInfo::DimVectorT(ty.getRank(), highestPowOf2Divisor(value)),
@@ -302,6 +312,15 @@ class AddSubOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
   using BinaryOpVisitorImpl<OpTy>::BinaryOpVisitorImpl;
 
 private:
+  // Stride of `lhs + rhs` (or `lhs - rhs` for arith::SubIOp) along `dim`.
+  // Returns -1 when the stride is unknown.
+  int64_t getStride(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
+                    int dim) override {
+    const int64_t lhsStride = lhs.getStride(dim);
+    const int64_t rhsStride = rhs.getStride(dim);
+    // An unknown operand stride makes the result stride unknown as well.
+    if (lhsStride < 0 || rhsStride < 0)
+      return -1;
+    if (!isa<arith::SubIOp>(op))
+      return lhsStride + rhsStride;
+    // Subtraction: a negative difference is folded into -1 (unknown).
+    const int64_t difference = lhsStride - rhsStride;
+    return difference >= 0 ? difference : int64_t(-1);
+  }
+
   int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
                         int dim) override {
     // Contiguity assumes an increasing sequence. So for SubIOp contiguous
@@ -373,6 +392,17 @@ class MulIOpAxisInfoVisitor final : public BinaryOpVisitorImpl<arith::MulIOp> {
   using BinaryOpVisitorImpl<arith::MulIOp>::BinaryOpVisitorImpl;
 
 private:
+  // Stride of `lhs * rhs` along `dim`. Returns -1 when the stride is unknown
+  // and 0 when the product does not change along `dim`.
+  int64_t getStride(arith::MulIOp op, const AxisInfo &lhs, const AxisInfo &rhs,
+                    int dim) override {
+    const int64_t lhsStride = lhs.getStride(dim);
+    const int64_t rhsStride = rhs.getStride(dim);
+    const auto lhsConst = lhs.getConstantValue();
+    const auto rhsConst = rhs.getConstantValue();
+    // Scaling a strided sequence by a known constant scales its stride.
+    if (lhsStride > 0 && rhsConst.has_value())
+      return lhsStride * rhsConst.value();
+    if (rhsStride > 0 && lhsConst.has_value())
+      return lhsConst.value() * rhsStride;
+    // The product is uniform along `dim` only if BOTH operands are uniform
+    // there (stride 0 or a known constant). A uniform operand multiplied by a
+    // varying one still varies, so a single zero stride is not sufficient.
+    const bool lhsUniform = lhsStride == 0 || lhsConst.has_value();
+    const bool rhsUniform = rhsStride == 0 || rhsConst.has_value();
+    if (lhsUniform && rhsUniform)
+      return 0;
+    return -1;
+  }
+
   int64_t getContiguity(arith::MulIOp op, const AxisInfo &lhs,
                         const AxisInfo &rhs, int dim) override {
     // lhs * 1 = lhs
@@ -425,6 +455,22 @@ class DivOpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
   using BinaryOpVisitorImpl<OpTy>::BinaryOpVisitorImpl;
 
 private:
+  // Stride of `lhs / rhs` along `dim`. Returns -1 when the stride is unknown
+  // and 0 when the quotient does not change along `dim`.
+  int64_t getStride(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
+                    int dim) override {
+    // A result that is contiguous along `dim` is a unit-stride sequence.
+    if (getContiguity(op, lhs, rhs, dim) > 1)
+      return 1;
+    const int64_t lhsStride = lhs.getStride(dim);
+    const int64_t rhsStride = rhs.getStride(dim);
+    const auto lhsConst = lhs.getConstantValue();
+    const auto rhsConst = rhs.getConstantValue();
+    // Strided dividend over a constant divisor that evenly divides the stride.
+    // The divisor's VALUE must be checked against zero (not has_value()),
+    // otherwise a zero constant reaches the `%` below, which is undefined.
+    if (lhsStride > 0 && rhsConst.has_value() && rhsConst.value() != 0 &&
+        lhsStride % rhsConst.value() == 0)
+      return lhsStride / rhsConst.value();
+    // NOTE(review): a constant divided by a strided sequence is not strided in
+    // general (e.g. 12 / [1, 2, 3, 4] = [12, 6, 4, 3]); this rule is kept as
+    // written in the original change -- confirm it is intended.
+    if (rhsStride > 0 && lhsConst.has_value() &&
+        lhsConst.value() % rhsStride == 0)
+      return lhsConst.value() / rhsStride;
+    // The quotient is uniform along `dim` only if both operands are uniform
+    // there (stride 0 or a known constant); a uniform dividend over a varying
+    // divisor still varies.
+    const bool lhsUniform = lhsStride == 0 || lhsConst.has_value();
+    const bool rhsUniform = rhsStride == 0 || rhsConst.has_value();
+    if (lhsUniform && rhsUniform)
+      return 0;
+    return -1;
+  }
+
   int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
                         int dim) override {
     // lhs / 1 = lhs
@@ -559,16 +605,18 @@ class SplatOpAxisInfoVisitor final
     Type _retTy = *op->result_type_begin();
     TensorType retTy = cast<TensorType>(_retTy);
     AxisInfo opInfo = operands[0]->getValue();
+    AxisInfo::DimVectorT stride;
     AxisInfo::DimVectorT contiguity;
     AxisInfo::DimVectorT divisibility;
     AxisInfo::DimVectorT constancy;
     for (int d = 0; d < retTy.getRank(); ++d) {
+      stride.push_back(0);
       contiguity.push_back(1);
       divisibility.push_back(opInfo.getDivisibility(0));
       constancy.push_back(retTy.getShape()[d]);
     }
-    return AxisInfo(std::move(contiguity), std::move(divisibility),
-                    std::move(constancy),
+    return AxisInfo(std::move(stride), std::move(contiguity),
+                    std::move(divisibility), std::move(constancy),
                     operands[0]->getValue().getConstantValue());
   }
 };
@@ -613,6 +661,7 @@ class ExpandDimsOpAxisInfoVisitor final
   getAxisInfo(triton::ExpandDimsOp op,
               ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
     AxisInfo opInfo = operands[0]->getValue();
+    AxisInfo::DimVectorT stride = opInfo.getStride();
     AxisInfo::DimVectorT contiguity = opInfo.getContiguity();
     AxisInfo::DimVectorT divisibility = opInfo.getDivisibility();
     AxisInfo::DimVectorT constancy = opInfo.getConstancy();
@@ -631,11 +680,12 @@ class ExpandDimsOpAxisInfoVisitor final
                 opInfo.getContiguity(d) > 1 ? 1 : opInfo.getDivisibility(d));
       }
     }
+    stride.insert(stride.begin() + op.getAxis(), 0);
     contiguity.insert(contiguity.begin() + op.getAxis(), 1);
     divisibility.insert(divisibility.begin() + op.getAxis(), newDivisibility);
     constancy.insert(constancy.begin() + op.getAxis(), 1);
-    return AxisInfo(std::move(contiguity), std::move(divisibility),
-                    std::move(constancy),
+    return AxisInfo(std::move(stride), std::move(contiguity),
+                    std::move(divisibility), std::move(constancy),
                     operands[0]->getValue().getConstantValue());
   }
 };
@@ -655,17 +705,19 @@ class BroadcastOpAxisInfoVisitor final
     ArrayRef<int64_t> retShape = retTy.getShape();
     ArrayRef<int64_t> opShape = opTy.getShape();
     AxisInfo opInfo = operands[0]->getValue();
+    AxisInfo::DimVectorT stride;
     AxisInfo::DimVectorT contiguity;
     AxisInfo::DimVectorT divisibility;
     AxisInfo::DimVectorT constancy;
     for (int d = 0; d < retTy.getRank(); ++d) {
+      stride.push_back(opInfo.getStride(d));
       contiguity.push_back(opShape[d] == 1 ? 1 : opInfo.getContiguity(d));
       divisibility.push_back(opInfo.getDivisibility(d));
       constancy.push_back(opShape[d] == 1 ? retShape[d]
                                           : opInfo.getConstancy(d));
     }
-    return AxisInfo(std::move(contiguity), std::move(divisibility),
-                    std::move(constancy),
+    return AxisInfo(std::move(stride), std::move(contiguity),
+                    std::move(divisibility), std::move(constancy),
                     operands[0]->getValue().getConstantValue());
   }
 };
@@ -1048,15 +1100,18 @@ class MakeTensorPtrOpAxisInfoVisitor final
     if (rank > 2)
       return AxisInfo();
 
-    SmallVector<AxisInfo> strideInfo;
+    SmallVector<AxisInfo, 2> strideInfo;
     for (int i = rank + 1; i <= rank * 2; ++i)
       strideInfo.emplace_back(operands[i]->getValue());
 
     AxisInfo ptrInfo = operands[0]->getValue();
     int64_t ptrDivisibility = ptrInfo.getDivisibility(0);
 
-    AxisInfo::DimVectorT contiguity, constancy, divisibility;
+    AxisInfo::DimVectorT stride, contiguity, constancy, divisibility;
     for (int dim = 0; dim < rank; ++dim) {
+      stride.push_back(strideInfo[dim].getConstantValue().has_value()
+                           ? strideInfo[dim].getConstantValue().value()
+                           : -1);
       contiguity.push_back(
           strideInfo[dim].getConstantValue() == 1 ? blkShape[dim] : 1);
       divisibility.push_back(
@@ -1069,8 +1124,9 @@ class MakeTensorPtrOpAxisInfoVisitor final
       constancy.push_back(1);
     }
 
-    auto axisInfo = AxisInfo(std::move(contiguity), std::move(divisibility),
-                             std::move(constancy));
+    auto axisInfo =
+        AxisInfo(std::move(stride), std::move(contiguity),
+                 std::move(divisibility), std::move(constancy), std::nullopt);
 
     LLVM_DEBUG({
       std::string axisStr;
@@ -1176,8 +1232,9 @@ LogicalResult AxisInfoAnalysis::visitOperation(
     auto vals = cast<DenseElementsAttr>(attr).getValues<int>();
     newConstancy = AxisInfo::DimVectorT(vals.begin(), vals.end());
   }
-  curr = AxisInfo(std::move(newContiguity), std::move(newDivisibility),
-                  std::move(newConstancy), curr.getConstantValue());
+  curr = AxisInfo(curr.getStride(), std::move(newContiguity),
+                  std::move(newDivisibility), std::move(newConstancy),
+                  curr.getConstantValue());
   // join all lattice elements
   for (auto *result : results)
     propagateIfChanged(result, result->join(curr));