Commit ef210bc

Revert "Revert "[BACKEND] Localize the use and definition of getShapePerCTATile in the AMD backend and aim for elimination (#7740)"" (#4992)
This reverts commit 59ac5e8.

Signed-off-by: Anatoly Myachev <[email protected]>

1 parent: a57ac88 · commit: ef210bc

File tree

17 files changed (+457, -661 lines)


include/triton/Analysis/Allocation.h

Lines changed: 0 additions & 45 deletions
@@ -20,54 +20,9 @@ using AllocationAnalysisScratchSizeFn = std::function<unsigned(Operation *)>;
 
 unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op);
 
-// To convert a tensor from one layout to another, we need to allocate a
-// temporary buffer (i.e., scratch buffer) in shared memory. The conversion may
-// require multiple iterations, with each iteration involving multiple
-// vectorized loads/stores. The scratch buffer has a shape (`repShape`) that
-// represents the maximum size accessed in each dimension during each iteration.
-// It is padded (`paddedRepShape`) to avoid bank conflicts and is accessed in a
-// specific `order`.
-struct ScratchConfig {
-  SmallVector<unsigned> repShape;
-  SmallVector<unsigned> paddedRepShape;
-  SmallVector<unsigned> order;
-  unsigned inVec;
-  unsigned outVec;
-
-  ScratchConfig(SmallVector<unsigned> repShape,
-                SmallVector<unsigned> paddedRepShape, unsigned inVec = 1,
-                unsigned outVec = 1)
-      : repShape(repShape), paddedRepShape(paddedRepShape), inVec(inVec),
-        outVec(outVec) {}
-
-  void print(llvm::raw_ostream &os) const {
-    os << "repShape: [";
-    llvm::interleaveComma(repShape, os);
-    os << "]";
-    os << ", paddedRepShape: [";
-    llvm::interleaveComma(paddedRepShape, os);
-    os << "]";
-    os << ", order: [";
-    llvm::interleaveComma(order, os);
-    os << "]";
-    os << ", inVec: " << inVec << ", outVec: " << outVec << "\n";
-  }
-};
-
-// For a layout conversion between `srcTy` and `dstTy`, return the vector length
-// that can be used for the stores to and loads from shared memory,
-// respectively.
-std::pair</*inVec*/ unsigned, /*outVec*/ unsigned>
-getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy);
-
-ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
-                                     RankedTensorType dstTy);
-
 unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
                                        RankedTensorType dstTy);
 
-unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
-                                     RankedTensorType dstTy);
 } // namespace triton
 
 /// Modified from llvm-15.0: llvm/ADT/AddressRanges.h
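
The removed header comment describes the padded scratch-buffer scheme: a layout conversion goes through shared memory in one or more iterations, `repShape` is the per-iteration extent in each dimension, and `paddedRepShape` adds padding to avoid bank conflicts. Below is a minimal sketch of that bookkeeping in plain C++, with no MLIR dependencies; the struct, field names, and numbers are illustrative only, mirroring the removed ScratchConfig rather than reproducing it.

// A minimal sketch, assuming the padded-scratch scheme described above.
#include <algorithm>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

struct ScratchConfigSketch {
  std::vector<unsigned> repShape;       // max extent touched per dim, per iteration
  std::vector<unsigned> paddedRepShape; // repShape plus padding on the inner dim
  unsigned inVec = 1;                   // vector width of stores into shared memory
  unsigned outVec = 1;                  // vector width of loads out of shared memory
};

// Scratch element count is the product of the (padded) shape.
unsigned numScratchElems(const std::vector<unsigned> &shape) {
  return std::accumulate(shape.begin(), shape.end(), 1u,
                         std::multiplies<unsigned>());
}

int main() {
  ScratchConfigSketch cfg{{64, 64}, {64, 64}, 4, 4};
  // Pad the fastest-accessed dimension (assumed last here) by the larger
  // vector width, in the spirit of the removed getScratchConfigForCvt.
  cfg.paddedRepShape.back() += std::max(cfg.inVec, cfg.outVec);
  std::printf("unpadded: %u elements, padded: %u elements\n",
              numScratchElems(cfg.repShape),
              numScratchElems(cfg.paddedRepShape));
  return 0;
}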

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 20 deletions
@@ -627,26 +627,6 @@ void makeAllWarpGroupsIsolatedFromAbove(Operation *op);
 // Set the correct loop annotation on LLVM branch ops.
 void fixUpLoopAnnotation(ModuleOp mod);
 
-/// Converts ConverLayoutOp to llvm using padded pattern.
-/// This pattern adds unused memory locations after every rows of tensor fastest
-/// changing dimension:
-/// e0 e1 e2 e3 p p \
-/// e4 e5 e6 e7 p p \
-/// ...
-/// e e e e p p
-/// Dimension order is chosen in order to use wide output reads.
-///
-/// \param op operation to convert
-/// \param src llvm structure containing operation input
-/// \param targetInfo
-/// \param typeConverter
-/// \param rewriter
-/// \returns llvm structure containing converted output
-Value transferWithinBlockPadding(triton::gpu::ConvertLayoutOp op, Value src,
-                                 const TargetInfoBase &targetInfo,
-                                 const LLVMTypeConverter *typeConverter,
-                                 RewriterBase &rewriter);
-
 LogicalResult
 transferWithinBlockSwizzling(triton::gpu::ConvertLayoutOp op, Value src,
                              const TargetInfoBase &targetInfo,
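
The removed doc comment describes the padded conversion pattern: every row of the tensor's fastest-changing dimension is followed by unused padding slots, so the padded row stride is the row length plus the padding. A minimal sketch of the resulting addressing follows, in plain C++ with illustrative names and values; it is not the actual transferWithinBlockPadding implementation.

// A minimal sketch of padded row-major addressing, assuming the scheme above.
#include <cstdio>

// Offset of logical element (row, col) in a row-major buffer padded by `pad`.
unsigned paddedOffset(unsigned row, unsigned col, unsigned rowLen, unsigned pad) {
  return row * (rowLen + pad) + col;
}

int main() {
  const unsigned rowLen = 4, pad = 2;
  // Reproduces the layout from the removed comment:
  //   e0 e1 e2 e3 p p
  //   e4 e5 e6 e7 p p
  for (unsigned r = 0; r < 2; ++r)
    for (unsigned c = 0; c < rowLen; ++c)
      std::printf("e%u -> offset %u\n", r * rowLen + c,
                  paddedOffset(r, c, rowLen, pad));
  return 0;
}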

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 0 additions & 10 deletions
@@ -210,16 +210,6 @@ SmallVector<unsigned> getCTASplitNum(Attribute layout);
 
 SmallVector<unsigned> getCTAOrder(Attribute layout);
 
-/* The difference between ShapePerCTATile and ShapePerCTA:
- * (1) ShapePerCTATile is defined by SizePerThread * ThreadsPerWarp *
- * WarpsPerCTA in each dimension and is independent from the tensor shape.
- * (2) ShapePerCTA is defined by shape / CTASplitNum in each dimension.
- * (3) In the implementation of emitIndices, ShapePerCTATile will
- * be replicated or wrapped to fit ShapePerCTA.
- */
-// [FIXME LL] Kill this function
-SmallVector<unsigned> getShapePerCTATile(RankedTensorType layout);
-
 // Returns the "logical" shape per CTA.
 // When shape and CTASplitNum have different number of dimensions, we assume
 // only the last N between common dimensions are split.
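
The removed comment contrasts ShapePerCTATile (SizePerThread * ThreadsPerWarp * WarpsPerCTA per dimension, independent of the tensor shape) with ShapePerCTA (shape / CTASplitNum per dimension). A minimal sketch of both computations with plain vectors follows; the example numbers are illustrative and these are not the MLIR helpers being removed.

// A minimal sketch of the two per-dimension shape computations described above.
#include <cstdio>
#include <vector>

// ShapePerCTATile: sizePerThread * threadsPerWarp * warpsPerCTA per dimension.
std::vector<unsigned> shapePerCTATile(const std::vector<unsigned> &sizePerThread,
                                      const std::vector<unsigned> &threadsPerWarp,
                                      const std::vector<unsigned> &warpsPerCTA) {
  std::vector<unsigned> out(sizePerThread.size());
  for (size_t d = 0; d < out.size(); ++d)
    out[d] = sizePerThread[d] * threadsPerWarp[d] * warpsPerCTA[d];
  return out;
}

// ShapePerCTA: the tensor shape divided by CTASplitNum per dimension.
std::vector<unsigned> shapePerCTA(const std::vector<unsigned> &shape,
                                  const std::vector<unsigned> &ctaSplitNum) {
  std::vector<unsigned> out(shape.size());
  for (size_t d = 0; d < out.size(); ++d)
    out[d] = shape[d] / ctaSplitNum[d];
  return out;
}

int main() {
  auto tile = shapePerCTATile({1, 4}, {8, 4}, {2, 2}); // {16, 32}
  auto cta = shapePerCTA({128, 64}, {1, 1});           // {128, 64}
  std::printf("ShapePerCTATile: %ux%u, ShapePerCTA: %ux%u\n", tile[0], tile[1],
              cta[0], cta[1]);
  return 0;
}

When the tile is smaller than the per-CTA shape, it is replicated (or wrapped) to cover it, which is the emitIndices behaviour the removed comment refers to.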

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 0 additions & 1 deletion
@@ -751,7 +751,6 @@ def LinearEncodingAttr : DistributedEncoding<"LinearEncoding", "linear_encoding"
   SmallVector<unsigned> getWarpsPerCTA() const;
 
   // [FIXME LL] Supports legacy behaviour. We should remove these functions
-  SmallVector<unsigned> getShapePerCTATile() const;
   SmallVector<unsigned> getSizePerThread() const;
 }];

lib/Analysis/Allocation.cpp

Lines changed: 1 addition & 118 deletions
@@ -29,15 +29,6 @@ namespace mlir {
 //===----------------------------------------------------------------------===//
 namespace triton {
 
-// Max shmem LDS/STS instruction in bits
-constexpr int kMaxShmemVecBitLength = 128;
-
-unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
-                                     RankedTensorType dstTy) {
-  auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
-  return getNumScratchElements(scratchConfig.paddedRepShape);
-}
-
 unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
                                        RankedTensorType dstTy) {
   auto *ctx = srcTy.getContext();
@@ -51,40 +42,6 @@ unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
   return smem.getTotalOutDimSize() / reps;
 }
 
-static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
-                                               RankedTensorType dstTy) {
-  Attribute srcLayout = srcTy.getEncoding();
-  Attribute dstLayout = dstTy.getEncoding();
-
-  if (!cvtNeedsSharedMemory(srcTy, dstTy)) {
-    return {};
-  }
-
-  if (shouldUseDistSmem(srcLayout, dstLayout)) {
-    // TODO: padding to avoid bank conflicts
-    return convertType<unsigned, int64_t>(gpu::getShapePerCTA(srcTy));
-  }
-
-  assert(srcLayout && dstLayout && "Unexpected layout in getRepShapeForCvt()");
-
-  auto srcShapePerCTA = gpu::getShapePerCTA(srcTy);
-  auto dstShapePerCTA = gpu::getShapePerCTA(dstTy);
-  auto srcShapePerCTATile = gpu::getShapePerCTATile(srcTy);
-  auto dstShapePerCTATile = gpu::getShapePerCTATile(dstTy);
-
-  assert(srcTy.getRank() == dstTy.getRank() &&
-         "src and dst must have the same rank");
-
-  unsigned rank = dstTy.getRank();
-  SmallVector<unsigned> repShape(rank);
-  for (unsigned d = 0; d < rank; ++d) {
-    repShape[d] =
-        std::max(std::min<unsigned>(srcShapePerCTA[d], srcShapePerCTATile[d]),
-                 std::min<unsigned>(dstShapePerCTA[d], dstShapePerCTATile[d]));
-  }
-  return repShape;
-}
-
 // Both `atomic_cas` and `atomic_rmw` may need scratch memory to store values
 // because Triton's block-based programming model ensures that
 // all threads sharing the same partition of the tensor see the same values,
@@ -99,7 +56,7 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
             return variableMask.second != 0;
           })) {
         // The tensor has broadcasted dimensions
-        smemShape = gpu::getShapePerCTATile(tensorTy);
+        smemShape = convertType<unsigned>(gpu::getShapePerCTA(tensorTy));
       }
   } else {
     // If the result is a scalar, we need to allocate a single element.
@@ -109,80 +66,6 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
   return smemShape;
 }
 
-std::pair<unsigned, unsigned>
-getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy) {
-  Attribute srcLayout = srcTy.getEncoding();
-  Attribute dstLayout = dstTy.getEncoding();
-
-  auto srcLinAttr = gpu::toLinearEncoding(srcTy);
-  auto dstLinAttr = gpu::toLinearEncoding(dstTy);
-  auto inOrd = srcLinAttr.getOrder();
-  auto outOrd = dstLinAttr.getOrder();
-
-  unsigned rank = srcTy.getRank();
-
-  unsigned srcContigPerThread = srcLinAttr.getContigPerThread()[inOrd[0]];
-  unsigned dstContigPerThread = dstLinAttr.getContigPerThread()[outOrd[0]];
-  // TODO: Fix the legacy issue that outOrd[0] == 0 always means
-  // that we cannot do vectorization.
-  unsigned innerDim = rank - 1;
-  unsigned inVec = outOrd[0] != innerDim  ? 1
-                   : inOrd[0] != innerDim ? 1
-                                          : srcContigPerThread;
-  unsigned outVec = outOrd[0] != innerDim ? 1 : dstContigPerThread;
-
-  if (isa<gpu::NvidiaMmaEncodingAttr>(srcLayout) &&
-      isa<gpu::BlockedEncodingAttr>(dstLayout)) {
-    // when storing from mma layout and loading in blocked layout vectorizing
-    // the load back gives better performance even if there is a
-    // transposition.
-    outVec = dstContigPerThread;
-  }
-  return {inVec, outVec};
-}
-
-ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
-                                     RankedTensorType dstTy) {
-  // Initialize vector sizes and stride
-  auto repShape = getRepShapeForCvt(srcTy, dstTy);
-  if (repShape.empty())
-    return ScratchConfig({}, {});
-  ScratchConfig scratchConfig(repShape, repShape);
-  auto rank = repShape.size();
-  Attribute srcLayout = srcTy.getEncoding();
-  Attribute dstLayout = dstTy.getEncoding();
-
-  assert(cvtNeedsSharedMemory(srcTy, dstTy));
-  auto outOrd = gpu::getOrder(dstTy);
-  scratchConfig.order = outOrd;
-
-  std::tie(scratchConfig.inVec, scratchConfig.outVec) =
-      getScratchCvtInOutVecLengths(srcTy, dstTy);
-  // We can't write a longer vector than the shape of shared memory.
-  // This shape might be smaller than the tensor shape in case we decided to
-  // do the conversion in multiple iterations.
-  unsigned contiguousShapeDim = scratchConfig.repShape[scratchConfig.order[0]];
-  scratchConfig.inVec = std::min(scratchConfig.inVec, contiguousShapeDim);
-  scratchConfig.outVec = std::min(scratchConfig.outVec, contiguousShapeDim);
-  // Clamp the vector length to kMaxShmemVecBitLength / element bitwidth as this
-  // is the max vectorisation
-  auto inBitWidth = getBitwidth(srcTy);
-  auto outBitWidth = getBitwidth(dstTy);
-  scratchConfig.inVec =
-      std::min(scratchConfig.inVec, kMaxShmemVecBitLength / inBitWidth);
-  scratchConfig.outVec =
-      std::min(scratchConfig.outVec, kMaxShmemVecBitLength / outBitWidth);
-
-  // No padding is required if the tensor is 1-D, or if all dimensions except
-  // the first accessed dimension have a size of 1.
-  if (rank <= 1 || product(repShape) == repShape[outOrd[0]])
-    return scratchConfig;
-
-  auto paddedSize = std::max(scratchConfig.inVec, scratchConfig.outVec);
-  scratchConfig.paddedRepShape[outOrd[0]] += paddedSize;
-  return scratchConfig;
-}
-
 unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
   if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
     ReduceOpHelper helper(reduceOp);
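
The removed getScratchConfigForCvt clamps the shared-memory vector widths twice: first to the contiguous extent of the scratch buffer, then to the 128-bit LDS/STS limit (kMaxShmemVecBitLength) divided by the element bitwidth. A minimal sketch of that clamping in plain C++ follows; names and values are illustrative, not the Triton implementation.

// A minimal sketch of the two-step vector-width clamping described above.
#include <algorithm>
#include <cstdio>

unsigned clampVec(unsigned vec, unsigned contiguousShapeDim,
                  unsigned elemBitWidth, unsigned maxVecBits = 128) {
  vec = std::min(vec, contiguousShapeDim);        // cannot exceed the smem row
  vec = std::min(vec, maxVecBits / elemBitWidth); // cannot exceed 128 bits
  return vec;
}

int main() {
  // 8 contiguous fp32 elements per thread clamp to 128 / 32 = 4.
  std::printf("clamped vec = %u\n", clampVec(8, 64, 32));
  return 0;
}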

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 5 additions & 6 deletions
@@ -22,15 +22,15 @@ using namespace mlir;
 using namespace mlir::triton::gpu;
 
 constexpr int kPtrBitWidth = 64;
-struct ConvertLayoutOpUsingLinearLayoutsConversion
+struct ConvertLayoutOpConversion
     : public ConvertOpToLLVMPattern<ConvertLayoutOp> {
   const TargetInfoBase &targetInfo;
 
   // Set benefit to 2 so that this pattern applies before other convert-layout
   // conversions. TODO(jlebar): Eventually we want this to be the only pattern.
-  explicit ConvertLayoutOpUsingLinearLayoutsConversion(
-      LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
-      PatternBenefit benefit = 1)
+  explicit ConvertLayoutOpConversion(LLVMTypeConverter &typeConverter,
+                                     const TargetInfoBase &targetInfo,
+                                     PatternBenefit benefit = 1)
       : ConvertOpToLLVMPattern(typeConverter, benefit), targetInfo(targetInfo) {
   }
 
@@ -590,6 +590,5 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
 void mlir::triton::populateConvertLayoutOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
     RewritePatternSet &patterns, PatternBenefit benefit) {
-  patterns.add<ConvertLayoutOpUsingLinearLayoutsConversion>(
-      typeConverter, targetInfo, benefit);
+  patterns.add<ConvertLayoutOpConversion>(typeConverter, targetInfo, benefit);
 }
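
The retained comment mentions PatternBenefit: when several rewrite patterns match the same op, the one registered with the higher benefit is tried first. A minimal plain-C++ sketch of that ordering follows; the struct and pattern names are illustrative and this is not the MLIR API.

// A minimal sketch of benefit-ordered pattern selection, assuming the
// higher-benefit-first rule described above.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

struct PatternSketch {
  std::string name;
  unsigned benefit;
};

int main() {
  std::vector<PatternSketch> patterns = {
      {"ConvertLayoutOpConversion", 2},
      {"SomeFallbackConversion", 1},
  };
  // Try patterns in descending benefit order.
  std::stable_sort(patterns.begin(), patterns.end(),
                   [](const PatternSketch &a, const PatternSketch &b) {
                     return a.benefit > b.benefit;
                   });
  for (const auto &p : patterns)
    std::printf("try %s (benefit %u)\n", p.name.c_str(), p.benefit);
  return 0;
}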

Comments (0)