
Commit 052efe4

Merge commit 'a89c5bd1c1707f003b03d214394cba495aae2320'
Signed-off-by: Anatoly Myachev <[email protected]>
2 parents: 0f4c607 + a89c5bd

File tree

14 files changed: +430 -326 lines


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 19 additions & 0 deletions
@@ -624,6 +624,25 @@ inline bool isCanonicalIndex(unsigned index, unsigned freeVarMask) {
 // group code isolated from above by invoking this function.
 void makeAllWarpGroupsIsolatedFromAbove(Operation *op);
 
+/// Converts ConvertLayoutOp to LLVM using the padded pattern.
+/// This pattern adds unused memory locations after every row of the tensor's
+/// fastest-changing dimension:
+///   e0 e1 e2 e3 p p \
+///   e4 e5 e6 e7 p p \
+///   ...
+///   e  e  e  e  p p
+/// The dimension order is chosen so that wide output reads can be used.
+///
+/// \param op operation to convert
+/// \param src llvm structure containing operation input
+/// \param targetInfo
+/// \param typeConverter
+/// \param rewriter
+/// \returns llvm structure containing converted output
+Value transferWithinBlockPadding(triton::gpu::ConvertLayoutOp op, Value src,
+                                 const TargetInfoBase &targetInfo,
+                                 const LLVMTypeConverter *typeConverter,
+                                 RewriterBase &rewriter);
 } // namespace mlir
 
 #endif
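
The padded pattern documented in the new transferWithinBlockPadding comment can be summarized with a short, self-contained sketch. This is illustrative C++ only, under the assumption of a single padded (fastest-changing) dimension; paddedOffset, rowLen, and pad are made-up names, not Triton APIs: each row of rowLen elements is followed by pad unused slots, so element (row, col) lands at row * (rowLen + pad) + col in the scratch buffer.

#include <cassert>
#include <cstddef>

// Hypothetical helper: map a dense (row, col) index to its slot in the
// padded scratch buffer described in the doc comment above.
static size_t paddedOffset(size_t row, size_t col, size_t rowLen, size_t pad) {
  return row * (rowLen + pad) + col;
}

int main() {
  const size_t rowLen = 4, pad = 2; // matches the "e0 e1 e2 e3 p p" picture
  assert(paddedOffset(0, 3, rowLen, pad) == 3); // e3 is the last dense slot of row 0
  assert(paddedOffset(1, 0, rowLen, pad) == 6); // e4 starts after the two pad slots
  return 0;
}

With rowLen = 4 and pad = 2 this reproduces the picture in the comment: e4 begins at offset 6, i.e. one padded row after e0.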

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 3 additions & 269 deletions
@@ -272,13 +272,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
                            const LinearLayout &dstLayout,
                            OpAdaptor adaptor,
                            ConversionPatternRewriter &rewriter) const {
-    MLIRContext *ctx = op.getContext();
-    auto loc = op.getLoc();
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-    auto srcTy = op.getSrc().getType();
-    auto dstTy = op.getType();
-
-    assert(cvtNeedsSharedMemory(srcTy, dstTy));
+    assert(cvtNeedsSharedMemory(op.getSrc().getType(), op.getType()));
 
     // Try to use swizzling to implement the conversion
     // HACK Remove once AMD tests pass for the swizzling path
@@ -287,52 +281,9 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       return success();
     }
 
-    SmallVector<Value> inVals =
-        unpackLLElements(loc, adaptor.getSrc(), rewriter);
-    assert(!inVals.empty());
-
-    // We munge the input values by converting i<n> (n<8) elements to i8 and
-    // pointers to i64. This is necessary because TargetInfo::loadDShared and
-    // storeDShared can't handle vectors of pointers or sub-byte elements.
-    auto elemTy = srcTy.getElementType();
-    auto isSubByteInt =
-        elemTy.isInteger() && elemTy.getIntOrFloatBitWidth() < 8;
-    auto isPtr = isa<triton::PointerType>(elemTy);
-    auto llvmElemTyOrig = getTypeConverter()->convertType(elemTy);
-    if (isSubByteInt)
-      elemTy = IntegerType::get(elemTy.getContext(), 8);
-    else if (isPtr)
-      elemTy = IntegerType::get(elemTy.getContext(), 64);
-    auto llvmElemTy = getTypeConverter()->convertType(elemTy);
-
-    // Munge input values
-    for (const auto &it : llvm::enumerate(inVals)) {
-      if (isSubByteInt) {
-        inVals[it.index()] = b.zext(llvmElemTy, it.value());
-      } else if (isPtr) {
-        inVals[it.index()] = b.ptrtoint(llvmElemTy, it.value());
-      }
-    }
-
-    // Pretty sure this is the identity function ATM
-    // It'd be better to simply call `quotient({kBlock})` and
-    // remove kBlock from transferWithinBlockImpl
-    auto srcLayoutWithinBlock = getLayoutWithinBlock(srcLayout);
-    auto dstLayoutWithinBlock = getLayoutWithinBlock(dstLayout);
-    SmallVector<Value> outVals = transferWithinBlockImpl(
-        inVals, op, srcLayoutWithinBlock, dstLayoutWithinBlock, rewriter);
-
-    // Unmunge output values
-    for (const auto &it : llvm::enumerate(outVals)) {
-      if (isSubByteInt) {
-        outVals[it.index()] = b.trunc(llvmElemTyOrig, it.value());
-      } else if (isPtr) {
-        outVals[it.index()] = b.inttoptr(llvmElemTyOrig, it.value());
-      }
-    }
+    Value result = transferWithinBlockPadding(op, adaptor.getSrc(), targetInfo,
+                                              getTypeConverter(), rewriter);
 
-    Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter,
-                                  op.getType());
     rewriter.replaceOp(op, result);
     return success();
   }
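
The block removed in the hunk above (its job now lives inside transferWithinBlockPadding) widens sub-byte integers to i8 and converts pointers to i64 before the shared-memory round trip, because TargetInfo::loadDShared and storeDShared cannot handle those element types, and then narrows the results back. The stand-alone C++ below is only an analogy of that widen/stage/narrow idea, not the MLIR lowering; every name in it is hypothetical.

#include <cassert>
#include <cstdint>

int main() {
  // Scratch storage that only supports byte and 64-bit integer elements,
  // standing in for shared-memory ops that cannot take sub-byte ints or
  // pointers directly.
  uint8_t byteScratch[1];
  uint64_t intScratch[1];

  // Sub-byte integer: widen ("zext") to a full byte, stage it, then
  // truncate back to the original width.
  uint8_t i4Val = 0xA & 0xF;                // logical 4-bit payload
  byteScratch[0] = i4Val;                   // store the widened value
  uint8_t restored4 = byteScratch[0] & 0xF; // truncate back to 4 bits
  assert(restored4 == i4Val);

  // Pointer: convert to an integer ("ptrtoint"), stage it, then convert
  // back ("inttoptr").
  int payload = 42;
  intScratch[0] = reinterpret_cast<uintptr_t>(&payload);
  int *restoredPtr = reinterpret_cast<int *>(intScratch[0]);
  assert(*restoredPtr == 42);
  return 0;
}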
@@ -343,223 +294,6 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
                            DecomposedWarpConversion decomposed,
                            OpAdaptor adaptor,
                            ConversionPatternRewriter &rewriter) const;
-
-  SmallVector<Value>
-  transferWithinBlockImpl(ArrayRef<Value> inVals, ConvertLayoutOp op,
-                          const LinearLayout &srcLayout,
-                          const LinearLayout &dstLayout,
-                          ConversionPatternRewriter &rewriter) const {
-    MLIRContext *ctx = op.getContext();
-    auto loc = op.getLoc();
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-
-    RankedTensorType srcTy = op.getSrc().getType();
-    auto srcElemTy = srcTy.getElementType();
-    const bool isInt1 = srcElemTy.isInteger(1);
-
-    StringAttr kRegister = str_attr("register");
-    StringAttr kLane = str_attr("lane");
-    StringAttr kWarp = str_attr("warp");
-    StringAttr kBlock = str_attr("block");
-    StringAttr kOffset = str_attr("offset");
-    StringAttr kIteration = str_attr("iteration");
-
-    auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
-
-    auto scratchConfig =
-        getScratchConfigForCvt(op.getSrc().getType(), op.getType());
-    auto tensorShapePerCTA = convertType<unsigned, int64_t>(getShapePerCTA(
-        op.getSrc().getType().getEncoding(), op.getType().getShape()));
-    // Input dims: [offset, iteration, block]
-    // Output dims: dimN-1, dimN-2, ..., dim0, where N is obtained from repShape
-    LinearLayout sharedLayout = chooseShemLayoutForRegToRegConversion(
-        ctx, tensorShapePerCTA, scratchConfig.repShape, scratchConfig.order);
-
-    // Layout for the store from registers to shared memory.
-    //
-    // Note: If two threads in the same warp write to the same shmem offset, the
-    // hardware resolves that without a stall or a bank conflict. Therefore we
-    // don't need to avoid duplicate writes.
-    // Input dims: [reg, lane, warp]
-    // Output dims: [offset, iteration]
-    bool isStMatrix = targetInfo.canUseStMatrix(
-        op.getSrc().getType(), scratchConfig.repShape,
-        scratchConfig.paddedRepShape, scratchConfig.order,
-        /*swizzleByteSize=*/0);
-    LinearLayout shmemStoreLayout =
-        isStMatrix ? chooseStMatrixLayout(ctx, op.getSrc().getType(),
-                                          /*swizzleByteSize=*/0)
-                   : srcLayout.invertAndCompose(sharedLayout);
-
-    const int shmemAllocatedNumElems =
-        getNumScratchElements(scratchConfig.paddedRepShape);
-    assert(shmemStoreLayout.getOutDimSize(kOffset) <= shmemAllocatedNumElems);
-
-    // Layout for the load from shmem to registers.
-    LinearLayout shmemLoadLayout = dstLayout.invertAndCompose(sharedLayout);
-
-    // Check that the `register` fully determines the `iteration`. That is,
-    // each thread does exactly the same reads and writes to shmem on each
-    // iteration, just with different input/output registers.
-    assert(
-        shmemStoreLayout.sublayoutIsZero({kLane, kWarp, kBlock}, {kIteration}));
-    assert(
-        shmemLoadLayout.sublayoutIsZero({kLane, kWarp, kBlock}, {kIteration}));
-
-    // iteration -> registers
-    SmallVector<SmallVector<int>> inRegsForIter =
-        collectRegsForIter(ctx, shmemStoreLayout);
-    SmallVector<SmallVector<int>> outRegsForIter =
-        collectRegsForIter(ctx, shmemLoadLayout);
-
-    Value smemBase =
-        LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op.getOperation());
-    auto sharedPtrTy = smemBase.getType();
-    Type elemTy = inVals[0].getType();
-    auto outSize = shmemLoadLayout.getInDimSize(kRegister);
-    auto iterations = sharedLayout.getInDimSize(kIteration);
-    assert(scratchConfig.inVec * iterations <= inVals.size());
-    assert(scratchConfig.outVec * iterations <= outSize);
-
-    // Check only one dimension has been padded.
-    // This means the difference between the padded shape and the original shape
-    // should only be in one dimension, specifically in
-    // `scratchConfig.order[0]`.
-    auto rank = scratchConfig.repShape.size();
-    for (auto i = 0; i < rank; i++) {
-      if (i == scratchConfig.order[0]) {
-        continue;
-      }
-      assert(scratchConfig.repShape[i] == scratchConfig.paddedRepShape[i]);
-    }
-    auto paddedStride = scratchConfig.repShape[scratchConfig.order[0]];
-    auto paddedSize =
-        scratchConfig.paddedRepShape[scratchConfig.order[0]] - paddedStride;
-
-    // Linear layout function is split in two parts below:
-    //
-    //   L(r, t, w, b) = L(0, t, w, b) xor L(r, 0, 0, 0)
-    //   offset        = regBase       xor regIdx
-    //
-    // It is the same hack as what we've done in the emitIndices function to get
-    // around performance issues on AMD GPUs
-    auto getVecAddr = [&](LinearLayout &layout, Value &regBase,
-                          int regSlice) -> Value {
-      auto regIdx = layout
-                        .apply({{kRegister, regSlice},
-                                {kLane, 0},
-                                {kWarp, 0},
-                                {kBlock, 0}})[0]
-                        .second;
-      Value offset = b.xor_(regBase, b.i32_val(regIdx));
-      if (paddedSize > 0) {
-        assert(llvm::isPowerOf2_32(paddedStride));
-        assert(llvm::isPowerOf2_32(paddedSize));
-        auto rshiftVal = llvm::Log2_32(paddedStride);
-        auto lshiftVal = llvm::Log2_32(paddedSize);
-        offset = b.add(
-            b.shl(b.lshr(offset, b.i32_val(rshiftVal)), b.i32_val(lshiftVal)),
-            offset);
-      }
-      auto vecAddr = b.gep(sharedPtrTy, elemTy, smemBase, offset,
-                           LLVM::GEPNoWrapFlags::inbounds);
-      return vecAddr;
-    };
-
-    auto storeBase = applyLinearLayout(loc, rewriter, shmemStoreLayout,
-                                       {{kRegister, b.i32_val(0)},
-                                        {kLane, laneId},
-                                        {kWarp, warpId},
-                                        {kBlock, b.i32_val(0)}})[0]
-                         .second;
-    auto loadBase = applyLinearLayout(loc, rewriter, shmemLoadLayout,
-                                      {{kRegister, b.i32_val(0)},
-                                       {kLane, laneId},
-                                       {kWarp, warpId},
-                                       {kBlock, b.i32_val(0)}})[0]
-                        .second;
-    // register idx -> Value
-    llvm::MapVector<int, Value> outVals;
-    for (int i = 0; i < iterations; i++) {
-      if (i != 0)
-        b.barrier();
-
-      auto &inRegs = inRegsForIter[i];
-      auto &outRegs = outRegsForIter[i];
-
-      // When using `stmatrix`, we can store `inVec` elements even if they are
-      // not contiguous
-      auto inVec = isStMatrix ? shmemStoreLayout.getNumConsecutiveInOut()
-                              : scratchConfig.inVec;
-      for (int j = 0; j < inVals.size() / iterations; j += inVec) {
-        auto inRegSlice = inRegs[j];
-        Value vecAddr = getVecAddr(shmemStoreLayout, storeBase, inRegSlice);
-        SmallVector<Value> inValsVec;
-        for (int k = 0; k < inVec; k++)
-          inValsVec.push_back(inVals[inRegSlice + k]);
-        Value valsVec = packLLVector(loc, inValsVec, rewriter);
-        if (isStMatrix) {
-          targetInfo.storeMatrixShared(rewriter, loc, vecAddr, valsVec);
-        } else {
-          targetInfo.storeDShared(rewriter, loc, vecAddr, std::nullopt, valsVec,
-                                  /*pred=*/b.true_val());
-        }
-      }
-
-      b.barrier();
-
-      for (int j = 0; j < outSize / iterations; j += scratchConfig.outVec) {
-        auto outRegSlice = outRegs[j];
-        auto vecAddr = getVecAddr(shmemLoadLayout, loadBase, outRegSlice);
-        Value valsVec =
-            targetInfo.loadDShared(rewriter, loc, vecAddr, std::nullopt,
-                                   vec_ty(elemTy, scratchConfig.outVec),
-                                   /*pred=*/b.true_val());
-        for (Value v : unpackLLVector(loc, valsVec, rewriter)) {
-          if (isInt1) {
-            // TODO(Intel): special handling for the boolean case required. Does
-            // this prevent a later optimization that we can't handle, or is
-            // there something about the layout/SLM loads and stores that
-            // requires special "transcribing" the boolean to the result of the
-            // cmp?
-            outVals[outRegSlice++] =
-                b.icmp_ne(v, rewriter.create<LLVM::ConstantOp>(
-                                 loc, i8_ty, rewriter.getI8IntegerAttr(0)));
-          } else {
-            outVals[outRegSlice++] = v;
-          }
-        }
-      }
-    }
-
-    SmallVector<Value> outValsVec;
-    for (size_t i = 0; i < outVals.size(); i++)
-      outValsVec.push_back(outVals[i]);
-    return outValsVec;
-  }
-
-  // Determine which registers are read/written in which iteration of the shmem
-  // transfer specified by `layout`.
-  SmallVector<SmallVector<int> /*registers*/>
-  collectRegsForIter(MLIRContext *ctx, const LinearLayout &layout) const {
-    StringAttr kRegister = str_attr("register");
-    StringAttr kLane = str_attr("lane");
-    StringAttr kWarp = str_attr("warp");
-    StringAttr kBlock = str_attr("block");
-    StringAttr kIteration = str_attr("iteration");
-
-    // The choice of iteration should be determined only by the register. That
-    // is, it should be correct to split the register dimension into iterations.
-    assert(layout.sublayoutIsZero({kLane, kWarp, kBlock}, {kIteration}));
-
-    LinearLayout sublayout = layout.sublayout({kRegister}, {kIteration});
-    SmallVector<SmallVector<int>> ret(sublayout.getOutDimSize(kIteration));
-    for (int reg = 0; reg < sublayout.getInDimSize(kRegister); reg++) {
-      auto idx = sublayout.apply({{kRegister, reg}});
-      ret[idx.begin()->second].push_back(reg);
-    }
-    return ret;
-  }
 };
 
 } // namespace
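
The removed getVecAddr above computes each shared-memory address in two steps: offset = regBase xor regIdx, which is valid because the layout is linear over xor (the comment's identity L(r, t, w, b) = L(0, t, w, b) xor L(r, 0, 0, 0)), followed by a padding adjustment done purely with shifts, which the code guards with power-of-two asserts on paddedStride and paddedSize. The sketch below checks both facts on a toy xor-linear layout; it is an illustration under those assumptions, with made-up basis values, not Triton code.

#include <cassert>
#include <cstdint>

// Hypothetical xor-linear layout: each set bit of an input contributes one
// basis value, and contributions combine with xor.
static uint32_t applyBasis(const uint32_t *basis, int bits, uint32_t in) {
  uint32_t out = 0;
  for (int i = 0; i < bits; ++i)
    if (in & (1u << i))
      out ^= basis[i];
  return out;
}

int main() {
  // Toy bases for the register and lane inputs (illustrative values only).
  const uint32_t regBasis[2] = {1u << 0, 1u << 2};
  const uint32_t laneBasis[2] = {1u << 1, 1u << 3};
  auto layout = [&](uint32_t reg, uint32_t lane) {
    return applyBasis(regBasis, 2, reg) ^ applyBasis(laneBasis, 2, lane);
  };

  // Padding parameters, assumed powers of two as the removed code asserts.
  const uint32_t paddedStride = 8; // dense elements per row
  const uint32_t paddedSize = 2;   // pad elements appended per row
  const uint32_t rshift = 3;       // log2(paddedStride)
  const uint32_t lshift = 1;       // log2(paddedSize)

  for (uint32_t reg = 0; reg < 4; ++reg) {
    for (uint32_t lane = 0; lane < 4; ++lane) {
      // Step 1: offset = regBase xor regIdx holds because the layout is
      // xor-linear.
      uint32_t regBase = layout(0, lane);
      uint32_t regIdx = layout(reg, 0);
      uint32_t offset = regBase ^ regIdx;
      assert(offset == layout(reg, lane));

      // Step 2: the shift-based padding adjustment equals inserting
      // paddedSize unused elements after every paddedStride dense elements.
      uint32_t padded = ((offset >> rshift) << lshift) + offset;
      assert(padded == offset + (offset / paddedStride) * paddedSize);
    }
  }
  return 0;
}

In other words, ((offset >> log2(paddedStride)) << log2(paddedSize)) + offset is just offset + (offset / paddedStride) * paddedSize, expressed with shifts because both quantities are powers of two.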
