Skip to content

Commit 15bfd0c

Browse files
authored
[BACKEND] Support of ConvertLayoutOp from blocked to blocked and SliceLayout with blocked parent (#658)
1 parent 13669b4 commit 15bfd0c

File tree

17 files changed

+1025
-191
lines changed

17 files changed

+1025
-191
lines changed

bin/triton-translate.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,6 @@ OwningOpRef<ModuleOp> loadMLIRModule(llvm::StringRef inputFilename,
6464
return nullptr;
6565
}
6666

67-
mlir::PassManager pm(module->getContext());
68-
applyPassManagerCLOptions(pm);
69-
70-
pm.addPass(createConvertTritonGPUToLLVMPass());
71-
72-
if (failed(pm.run(module->getOperation()))) {
73-
llvm::errs() << "Pass execution failed";
74-
return nullptr;
75-
}
76-
7767
return module;
7868
}
7969

include/triton/Analysis/Allocation.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@ namespace mlir {
1414

1515
namespace triton {
1616
class AllocationAnalysis;
17-
}
17+
18+
SmallVector<unsigned>
19+
getScratchConfigForCvtLayout(triton::gpu::ConvertLayoutOp op, unsigned &inVec,
20+
unsigned &outVec);
21+
22+
} // namespace triton
1823

1924
/// Modified from llvm-15.0: llvm/ADT/AddressRanges.h
2025
/// A class that represents an interval, specified using a start and an end

include/triton/Analysis/Utility.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
#define TRITON_ANALYSIS_UTILITY_H
33

44
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
5+
#include <algorithm>
6+
#include <numeric>
57
#include <string>
8+
69
namespace mlir {
710

811
bool isSharedEncoding(Value value);
@@ -11,6 +14,12 @@ bool maybeSharedAllocationOp(Operation *op);
1114

1215
std::string getValueOperandName(Value value, AsmState &state);
1316

17+
template <typename Int> Int product(llvm::ArrayRef<Int> arr) {
18+
return std::accumulate(arr.begin(), arr.end(), 1, std::multiplies{});
19+
}
20+
21+
template <typename Int> Int ceil(Int m, Int n) { return (m + n - 1) / n; }
22+
1423
} // namespace mlir
1524

1625
#endif // TRITON_ANALYSIS_UTILITY_H

include/triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@ class TritonLLVMConversionTarget : public ConversionTarget {
1818
mlir::LLVMTypeConverter &typeConverter);
1919
};
2020

// Conversion target used when lowering function-level ops (signatures,
// calls) to the LLVM dialect, kept separate from TritonLLVMConversionTarget
// above. Constructor is declared here and defined out of line.
class TritonLLVMFunctionConversionTarget : public ConversionTarget {
  // Held by reference: the converter must outlive this target.
  mlir::LLVMTypeConverter &typeConverter;

public:
  explicit TritonLLVMFunctionConversionTarget(
      MLIRContext &ctx, mlir::LLVMTypeConverter &typeConverter);
};
2129
namespace triton {
2230

2331
// Names for identifying different NVVM annotations. It is used as attribute

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,16 @@
1616
#define GET_OP_CLASSES
1717
#include "triton/Dialect/TritonGPU/IR/Ops.h.inc"
1818

19+
namespace mlir {
20+
namespace triton {
21+
namespace gpu {
22+
23+
unsigned getElemsPerThread(Attribute layout, ArrayRef<int64_t> shape);
24+
25+
unsigned getShapePerCTA(const Attribute &layout, unsigned d);
26+
27+
} // namespace gpu
28+
} // namespace triton
29+
} // namespace mlir
30+
1931
#endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ Then, attaching $\mathcal{L} to a tensor $T$ would mean that:
3131

3232
Right now, Triton implements two classes of layouts: shared, and distributed.
3333
}];
34+
35+
code extraBaseClassDeclaration = [{
36+
unsigned getElemsPerThread(ArrayRef<int64_t> shape) const;
37+
}];
3438
}
3539

3640
//===----------------------------------------------------------------------===//
@@ -64,6 +68,8 @@ A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ... [phase 1] /
6468
"unsigned":$vec, "unsigned":$perPhase, "unsigned":$maxPhase,
6569
ArrayRefParameter<"unsigned", "order of axes by the rate of changing">:$order
6670
);
71+
72+
let extraClassDeclaration = extraBaseClassDeclaration;
6773
}
6874

6975
//===----------------------------------------------------------------------===//
@@ -93,6 +99,8 @@ Then the data of A would be distributed as follow between the 16 CUDA threads:
9399
L(A) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
94100
{4,12}, {5,13}, {6,14}, {7,15}, {4,12}, {5, 13}, {6, 14}, {7, 15} ]
95101
}];
102+
103+
let extraClassDeclaration = extraBaseClassDeclaration;
96104
}
97105

98106
//===----------------------------------------------------------------------===//
@@ -171,11 +179,10 @@ for
171179
}]>
172180
];
173181

174-
let extraClassDeclaration = [{
182+
let extraClassDeclaration = extraBaseClassDeclaration # [{
175183
SliceEncodingAttr squeeze(int axis);
176184
}];
177185

178-
179186
let parameters = (
180187
ins
181188
ArrayRefParameter<"unsigned">:$sizePerThread,
@@ -282,6 +289,8 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
282289
"unsigned":$version,
283290
ArrayRefParameter<"unsigned">:$warpsPerCTA
284291
);
292+
293+
let extraClassDeclaration = extraBaseClassDeclaration;
285294
}
286295

287296
def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
@@ -311,6 +320,8 @@ def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
311320
// TODO: constraint here to only take distributed encodings
312321
"Attribute":$parent
313322
);
323+
324+
let extraClassDeclaration = extraBaseClassDeclaration;
314325
}
315326

316327

include/triton/tools/sys/getenv.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#ifndef TDL_TOOLS_SYS_GETENV_HPP
2323
#define TDL_TOOLS_SYS_GETENV_HPP
2424

25+
#include <algorithm>
2526
#include <cstdlib>
2627
#include <string>
2728

@@ -37,6 +38,14 @@ inline std::string getenv(const char *name) {
3738
return result;
3839
}
3940

// Returns true iff the environment variable `env` is set to a truthy value.
// Accepted truthy spellings (case-insensitive): "on", "true", "1".
// An unset variable, empty value, or any other value yields false.
inline bool getBoolEnv(const std::string &env) {
  const char *s = std::getenv(env.c_str());
  std::string str(s ? s : "");
  // Lower-case over ASCII explicitly instead of std::tolower: this header
  // includes <algorithm>/<cstdlib>/<string> but not <cctype>, so calling
  // std::tolower relied on a transitive include. The result is identical for
  // the ASCII spellings compared below.
  std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) {
    return (c >= 'A' && c <= 'Z') ? char(c - 'A' + 'a') : char(c);
  });
  return (str == "on" || str == "true" || str == "1");
}
4049
} // namespace tools
4150

4251
} // namespace triton

lib/Analysis/Allocation.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,66 @@
88

99
#include <algorithm>
1010
#include <limits>
11+
#include <numeric>
12+
13+
using ::mlir::triton::gpu::BlockedEncodingAttr;
14+
using ::mlir::triton::gpu::MmaEncodingAttr;
15+
using ::mlir::triton::gpu::SharedEncodingAttr;
1116

1217
namespace mlir {
1318

1419
//===----------------------------------------------------------------------===//
1520
// Shared Memory Allocation Analysis
1621
//===----------------------------------------------------------------------===//
1722
namespace triton {
23+
24+
SmallVector<unsigned>
25+
getScratchConfigForCvtLayout(triton::gpu::ConvertLayoutOp op, unsigned &inVec,
26+
unsigned &outVec) {
27+
auto srcTy = op.src().getType().cast<RankedTensorType>();
28+
auto dstTy = op.result().getType().cast<RankedTensorType>();
29+
Attribute srcLayout = srcTy.getEncoding();
30+
Attribute dstLayout = dstTy.getEncoding();
31+
assert(srcLayout && dstLayout &&
32+
"Unexpect layout in getScratchConfigForCvtLayout()");
33+
unsigned rank = dstTy.getRank();
34+
SmallVector<unsigned> paddedRepShape(rank);
35+
// TODO: move to TritonGPUAttrDefs.h.inc
36+
auto getShapePerCTA = [&](const Attribute &layout, unsigned d) -> unsigned {
37+
if (auto blockedLayout = layout.dyn_cast<BlockedEncodingAttr>()) {
38+
return blockedLayout.getSizePerThread()[d] *
39+
blockedLayout.getThreadsPerWarp()[d] *
40+
blockedLayout.getWarpsPerCTA()[d];
41+
} else {
42+
assert(0 && "Unimplemented usage of getShapePerCTA");
43+
return 0;
44+
}
45+
};
46+
if (srcLayout.isa<BlockedEncodingAttr>() &&
47+
dstLayout.isa<BlockedEncodingAttr>()) {
48+
auto srcBlockedLayout = srcLayout.cast<BlockedEncodingAttr>();
49+
auto dstBlockedLayout = dstLayout.cast<BlockedEncodingAttr>();
50+
auto inOrd = srcBlockedLayout.getOrder();
51+
auto outOrd = dstBlockedLayout.getOrder();
52+
// TODO: Fix the legacy issue that ourOrd[0] == 0 always means
53+
// that we cannot do vectorization.
54+
inVec = outOrd[0] == 0 ? 1
55+
: inOrd[0] == 0 ? 1
56+
: srcBlockedLayout.getSizePerThread()[inOrd[0]];
57+
outVec =
58+
outOrd[0] == 0 ? 1 : dstBlockedLayout.getSizePerThread()[outOrd[0]];
59+
unsigned pad = std::max(inVec, outVec);
60+
for (unsigned d = 0; d < rank; ++d) {
61+
paddedRepShape[d] = std::max(
62+
std::min<unsigned>(srcTy.getShape()[d], getShapePerCTA(srcLayout, d)),
63+
std::min<unsigned>(dstTy.getShape()[d],
64+
getShapePerCTA(dstLayout, d)));
65+
}
66+
paddedRepShape[outOrd[0]] += pad;
67+
}
68+
return paddedRepShape;
69+
}
70+
1871
class AllocationAnalysis {
1972
public:
2073
AllocationAnalysis(Operation *operation, Allocation *allocation)
@@ -73,6 +126,27 @@ class AllocationAnalysis {
73126
tensorType.getElementTypeBitWidth() / 8;
74127
allocation->addBuffer<BufferT::BufferKind::Scratch>(op, bytes);
75128
}
129+
} else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
130+
auto srcTy = cvtLayout.src().getType().cast<RankedTensorType>();
131+
auto dstTy = cvtLayout.result().getType().cast<RankedTensorType>();
132+
auto srcEncoding = srcTy.getEncoding();
133+
auto dstEncoding = dstTy.getEncoding();
134+
if (srcEncoding.isa<SharedEncodingAttr>() ||
135+
dstEncoding.isa<SharedEncodingAttr>()) {
136+
// Only blocked -> blocked conversion requires for scratch allocation
137+
return;
138+
}
139+
// ConvertLayoutOp with both input/output non-shared_layout
140+
// TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
141+
// also possible to realize it with other approaches in restricted
142+
// conditions, such as warp-shuffle
143+
unsigned inVec = 0;
144+
unsigned outVec = 0;
145+
auto smemShape = getScratchConfigForCvtLayout(cvtLayout, inVec, outVec);
146+
unsigned elems = std::accumulate(smemShape.begin(), smemShape.end(), 1,
147+
std::multiplies{});
148+
auto bytes = elems * srcTy.getElementTypeBitWidth() / 8;
149+
allocation->addBuffer<BufferT::BufferKind::Scratch>(op, bytes);
76150
}
77151
}
78152

lib/Analysis/AxisInfo.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "mlir/Analysis/DataFlowAnalysis.h"
2+
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
23
#include "llvm/Support/raw_ostream.h"
34
#include <iostream>
45

@@ -46,6 +47,11 @@ AxisInfo AxisInfo::getPessimisticValueState(Value value) {
4647
fun.getArgAttr(blockArg.getArgNumber(), "tt.divisibility");
4748
if (attr)
4849
divHint = attr.cast<IntegerAttr>().getValue().getZExtValue();
50+
} else if (auto fun = dyn_cast<LLVM::LLVMFuncOp>(op)) {
51+
Attribute attr =
52+
fun.getArgAttr(blockArg.getArgNumber(), "tt.divisibility");
53+
if (attr)
54+
divHint = attr.cast<IntegerAttr>().getValue().getZExtValue();
4955
}
5056
}
5157
DimVectorT contiguity(rank, 1);
@@ -203,6 +209,13 @@ ChangeResult AxisInfoAnalysis::visitOperation(
203209
}
204210
curr = AxisInfo(contiguity, divisibility, constancy);
205211
}
212+
// UnrealizedConversionCast
213+
// This is needed by TritonGPUToLLVM, to get AxisInfo when the graph is
214+
// in the process of a PartialConversion, where UnrealizedConversionCast
215+
// may exist
216+
if (llvm::isa<mlir::UnrealizedConversionCastOp>(op)) {
217+
curr = operands[0]->getValue();
218+
}
206219
if (curr.getRank() == 0) {
207220
return markAllPessimisticFixpoint(op->getResults());
208221
}

0 commit comments

Comments
 (0)