intel · etiotto · Nov 5, 2024 · Oct 9, 2024 · Oct 9, 2024 · Oct 10, 2024
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -235,7 +235,8 @@ def make_ttgir(mod, metadata, opt, properties):
         intel.passes.ttgpuir.add_accelerate_matmul(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_materialize_block_pointer(pm)
-        intel.passes.ttgpuir.add_rewrite_tensor_pointer(pm)
+        if os.getenv("TRITON_INTEL_REWRITE_TENSOR_POINTER", "0") == "1":
+            intel.passes.ttgpuir.add_rewrite_tensor_pointer(pm)
         intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False)
 
         intel.passes.ttgpuir.add_coalesce(pm)

diff --git a/third_party/intel/include/Analysis/AxisInfo.h b/third_party/intel/include/Analysis/AxisInfo.h
@@ -1,169 +1,24 @@
 #ifndef TRITON_INTEL_ANALYSIS_AXISINFO_H
 #define TRITON_INTEL_ANALYSIS_AXISINFO_H
 
-#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include "mlir/Support/LLVM.h"
-#include "triton/Analysis/Utility.h"
-#include "triton/Dialect/Triton/IR/Dialect.h"
-#include "triton/Dialect/Triton/IR/Utility.h"
-#include "triton/Dialect/TritonGPU/IR/Dialect.h"
-
-#include <optional>
+#include "triton/Analysis/AxisInfo.h"
 
 namespace mlir::triton::intel {
 
-//===----------------------------------------------------------------------===//
-// AxisInfo
-//===----------------------------------------------------------------------===//
-
-/// This lattice value represents known information on the axes of a lattice.
-class AxisInfo {
-public:
-  typedef SmallVector<int64_t> DimVectorT;
-
-public:
-  AxisInfo() : AxisInfo({}, {}, {}) {}
-
-  AxisInfo(const DimVectorT &contiguity, const DimVectorT &divisibility,
-           const DimVectorT &constancy)
-      : AxisInfo(contiguity, divisibility, constancy, std::nullopt) {}
-
-  AxisInfo(const DimVectorT &contiguity, const DimVectorT &divisibility,
-           const DimVectorT &constancy, std::optional<int64_t> constantValue)
-      : contiguity(contiguity), divisibility(divisibility),
-        constancy(constancy), constantValue(constantValue) {
-    assert(divisibility.size() == contiguity.size());
-    assert(constancy.size() == contiguity.size());
-  }
-
-  // contiguity[d] is the length of the shortest sequence of contiguous integers
-  // along dimension d.
-  //
-  // If we have an array of N elements with a contiguity value C, then the array
-  // can be divided into a list of N/C sequences of C contiguous elements.
-  // Since we have N = 2^k, C must be a power of two.
-  //
-  // For example, the 2D array
-  //
-  //   [[10, 11, 12, 13, 18, 19, 20, 21],
-  //    [20, 21, 22, 23, 28, 29, 30, 31]]
-  //
-  // has contiguity [1, 4], and
-  //
-  //   [[12, 16, 20, 24],
-  //    [13, 17, 21, 25],
-  //    [14, 18, 22, 26],
-  //    [15, 19, 23, 27],
-  //    [18, 22, 26, 30],
-  //    [19, 23, 27, 31]]
-  //
-  // has contiguity [2, 1].
-  int64_t getContiguity(size_t dim) const { return contiguity[dim]; }
-  const DimVectorT &getContiguity() const { return contiguity; }
-
-  // divisibility[d] is the largest power of two that divides the first element
-  // of all groups of length contiguity[d] along dimension d.
-  //
-  // For example,
-  //
-  //   [[10, 11, 12, 13, 18, 19, 20, 21],
-  //    [20, 21, 22, 23, 28, 29, 30, 31]]
-  //
-  //  has divisibility [1, 2], and
-  //
-  //    [[12, 16, 20, 24],
-  //     [13, 17, 21, 25],
-  //     [14, 18, 22, 26],
-  //     [15, 19, 23, 27]]
-  //
-  // has divisibility [4, 1].
-  //
-  // On the other hand,
-  //
-  //   [0, 1, 2, 0, 4, 5, 6, 7]
-  //
-  // has divisibility 1 because its contiguity is 1.
-  int64_t getDivisibility(size_t dim) const { return divisibility[dim]; }
-  const DimVectorT &getDivisibility() const { return divisibility; }
-
-  // constancy[d] is the length of the shortest sequence of repeating integers
-  // along dimension d.
-  //
-  // This is particularly useful to infer the contiguity of operations (e.g.
-  // add) involving a constant.
-  //
-  // If we have an array of N elements, with a constancy value C, then the array
-  // can be divided into a list of N/C sequences of C elements with the same
-  // value.  Since we have N = 2^k, C must be a power of two.
-  //
-  // For example
-  //
-  //   [[8, 8, 8, 8, 12, 12, 12, 12],
-  //    [16, 16, 16, 16, 20, 20, 20, 20]]
-  //
-  // has constancy [1, 4].
-  int64_t getConstancy(size_t dim) const { return constancy[dim]; }
-  const DimVectorT &getConstancy() const { return constancy; }
-
-  int getRank() const { return contiguity.size(); }
-
-  std::optional<int64_t> getConstantValue() const { return constantValue; }
-
-  template <class T>
-  static void
-  initPessimisticStateFromFunc(int argNumber, T funcOp, DimVectorT *contiguity,
-                               DimVectorT *divisibility, DimVectorT *constancy);
-
-  bool operator==(const AxisInfo &other) const {
-    return contiguity == other.contiguity &&
-           divisibility == other.divisibility && constancy == other.constancy &&
-           constantValue == other.constantValue;
-  }
-
-  static AxisInfo getPessimisticValueState(Value value);
-
-  // The gcd of both arguments for each dimension
-  static AxisInfo join(const AxisInfo &lhs, const AxisInfo &rhs);
-
-  void print(raw_ostream &os) const {
-    auto print = [&](StringRef name, DimVectorT vec) {
-      os << name << " = [";
-      llvm::interleaveComma(vec, os);
-      os << "]";
-    };
-    print("contiguity", contiguity);
-    print(", divisibility", divisibility);
-    print(", constancy", constancy);
-    os << ", constant_value = ";
-    if (constantValue)
-      os << *constantValue;
-    else
-      os << "<none>";
-  }
-
-private:
-  DimVectorT contiguity;
-  DimVectorT divisibility;
-  DimVectorT constancy;
-
-  // The constant value of the lattice if we can infer it.
-  std::optional<int64_t> constantValue;
-};
-
 // Module level axis info analysis based on the call graph, assuming that we do
 // not have recursive functions.
 //
 // Since each function will be called multiple times, we need to calculate the
 // axis info based on the axis info of all the callers.  In the future, we can
 // perform optimization using function cloning so that each call site will have
 // unique axis info.
-using AxisInfoMapT = DenseMap<Value, AxisInfo>;
-class ModuleAxisInfoAnalysis : public CallGraph<AxisInfoMapT> {
+// NOTE(review): AxisInfoMapT presumably comes from the upstream header.
+class ModuleAxisInfoAnalysis : public triton::ModuleAxisInfoAnalysis {
 public:
   explicit ModuleAxisInfoAnalysis(ModuleOp moduleOp)
-      : CallGraph<AxisInfoMapT>(moduleOp) {
+      : triton::ModuleAxisInfoAnalysis(moduleOp) {
+    funcMap.clear();
+
     SmallVector<FunctionOpInterface> funcs;
     for (auto root : getRoots()) {
       walk<WalkOrder::PreOrder, WalkOrder::PostOrder>(
@@ -187,10 +42,11 @@ class ModuleAxisInfoAnalysis : public CallGraph<AxisInfoMapT> {
     }
   }
 
-  AxisInfo *getAxisInfo(Value value) {
+  AxisInfo *getAxisInfo(Value value) const {
     auto funcOp =
         value.getParentRegion()->getParentOfType<FunctionOpInterface>();
-    auto *axisInfoMap = getFuncData(funcOp);
+    auto *axisInfoMap =
+        const_cast<ModuleAxisInfoAnalysis *>(this)->getFuncData(funcOp);
     if (!axisInfoMap) {
       return nullptr;
     }
@@ -201,9 +57,9 @@ class ModuleAxisInfoAnalysis : public CallGraph<AxisInfoMapT> {
     return &(it->second);
   }
 
-  unsigned getPtrContiguity(Value ptr);
-  unsigned getPtrAlignment(Value ptr);
-  unsigned getMaskAlignment(Value mask);
+  unsigned getPtrContiguity(Value ptr) const;
+  unsigned getPtrAlignment(Value ptr) const;
+  unsigned getMaskAlignment(Value mask) const;
 
 private:
   void initialize(FunctionOpInterface funcOp);

diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/Utils.h b/third_party/intel/include/Dialect/TritonIntelGPU/IR/Utils.h
@@ -28,7 +28,7 @@ inline unsigned getNumElementsPerThread(
           ? cast<RankedTensorType>(cast<PointerType>(valTy).getPointeeType())
           : cast<RankedTensorType>(valTy);
   auto shapePerCTA = getShapePerCTA(ty);
-  mlir::triton::intel::AxisInfo &valInfo = *axisInfoAnalysis.getAxisInfo(val);
+  mlir::triton::AxisInfo &valInfo = *axisInfoAnalysis.getAxisInfo(val);
 
   unsigned elemNumBits = getElementBitWidth(ty);
   unsigned elemNumBytes = std::max(elemNumBits / 8, 1u);