
Commit c0e155d

Merge OpenAI Triton commit 0173f75 (#5260)
This PR changes the Triton base from bea27e3 to 0173f75 (Oct 6). Pass rate: 94.2% -> 94.21%.
2 parents (3b3a787 + f21b341), commit c0e155d

File tree

97 files changed (+4699, -2311 lines)


.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ jobs:
   integration-tests-amd:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 45
+    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' }}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
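The added continue-on-error expression marks matrix jobs whose second runner label is gfx90a (evidently the GPU architecture label) as non-blocking: GitHub Actions evaluates the expression once per matrix job, so failures on those runners no longer fail the workflow as a whole.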

CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -89,10 +89,6 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-if(NOT WIN32)
-  find_library(TERMINFO_LIBRARY tinfo)
-endif()
-
 if(TRITON_BUILD_UT)
   # This is an aggregate target for all unit tests.
   add_custom_target(TritonUnitTests)

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 26 deletions
@@ -528,32 +528,6 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
                   triton::gpu::PaddedSharedEncodingAttr layout,
                   unsigned bitwidth, Value smemOffset, bool offsetInBytes);
 
-// Emits IR to load data from shared memory into registers, or to store data
-// from registers into shared memory.
-//
-// You supply perVectorCallback, which is called once per group of register
-// elements to transfer. You can use this callback to emit IR to load or store
-// data from or to shared memory.
-//
-// elemLlvmTy should be dstTy's element type converted to an LLVM-dialect type.
-//
-// If maxVecElems is provided, we won't vectorize more than this many elements.
-//
-// Returns true on success.
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
-    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
-    const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    Value laneId, Value warpId,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
 // Close cousin of lowerLdStMatrix in MemoryOpToLLVM.cpp
 // We might want to merge them at some point, but having to support
 // ldmatrix.trans makes the code in lowerLdStMatrix a bit specific
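For reference, the deleted doc comment describes a callback-driven API: the helper computes vectorized shared-memory addresses and invokes perVectorCallback once per group of register elements. A hypothetical call site sketched from the deleted signature (variable names are illustrative, not code from this commit):

  // Sketch: load each vectorized group from shared memory into registers.
  SmallVector<Value> loadedVals;
  bool ok = emitTransferBetweenRegistersAndShared(
      registerTy, sharedTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj,
      loc, rewriter, target, [&](VectorType vecTy, Value shmemAddr) {
        // perVectorCallback: emit the actual load and keep the result; a
        // store would instead write a register vector to shmemAddr.
        loadedVals.push_back(
            rewriter.create<LLVM::LoadOp>(loc, vecTy, shmemAddr));
      });
  assert(ok && "unsupported layout for register<->shared transfer");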

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 5 additions & 1 deletion
@@ -54,6 +54,7 @@ class CoarseSchedule {
   iterator end() { return orderClusters.end(); }
   const_iterator end() const { return orderClusters.end(); }
   size_t size() const { return orderClusters.size(); }
+  void clear() { orderClusters.clear(); }
   iterator newAtBack() {
     orderClusters.push_back(orderClusters.size());
     return std::prev(orderClusters.end());
@@ -157,7 +158,10 @@ class CoarseSchedule {
   // Set <stage, cluster> based on CoarseSchedule.
   void serialize(scf::ForOp &forOp) const;
   // Create a CoarseSchedule based on forOp's <stage, cluster>.
-  LogicalResult deSerialize(scf::ForOp &forOp);
+  // If normalizeClusterId is true, clusters [minClusterId, maxClusterId] will
+  // be remapped to [0, maxClusterId - minClusterId].
+  // If false, it won't remap and clusters [0, maxClusterId] will be created.
+  LogicalResult deSerialize(scf::ForOp &forOp, bool normalizeClusterId = true);
 
   static ClusterHash hashCluster(Cluster cluster) {
     return reinterpret_cast<ClusterHash>(&*cluster);
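The remapping described in the new deSerialize comment is a shift by the minimum observed cluster id. A minimal standalone sketch of that normalization (illustrative only, not the Triton implementation):

  #include <algorithm>
  #include <cassert>
  #include <vector>

  // Remap cluster ids in [minClusterId, maxClusterId] to
  // [0, maxClusterId - minClusterId] by subtracting the minimum.
  std::vector<int> normalizeClusterIds(std::vector<int> ids) {
    assert(!ids.empty() && "expected at least one cluster id");
    int minId = *std::min_element(ids.begin(), ids.end());
    for (int &id : ids)
      id -= minId;
    return ids;
  }

With normalizeClusterId == false, ids {3, 5, 4} would leave clusters 0 through 5 in play; with normalization they become {0, 2, 1} and only three clusters are created.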

include/triton/Tools/LayoutUtils.h

Lines changed: 35 additions & 0 deletions
@@ -147,6 +147,41 @@ std::pair<int, ColumnAction>
 largestVectorisation(MLIRContext *ctx, const LinearLayout &cvt, int bitwidth,
                      std::optional<int> maybeMaxVecElems = std::nullopt);
 
+// Close cousin of doing zerosLike(tile) * divideLeft(cvt, tile)
+// This one is a tad more general in the sense that it allows to divide
+// cvt:
+//  - register=1 -> (0, 1)
+//    register=2 -> (8, 0)
+//    register=4 -> (0, 8)
+//    register=8 -> (0, 16)
+//    register=16 -> (0, 32)
+//    register=32 -> (0, 64)
+//    register=64 -> (16, 0)
+//  - lane=1 -> (0, 2)
+//    lane=2 -> (0, 4)
+//    lane=4 -> (1, 0)
+//    lane=8 -> (2, 0)
+//    lane=16 -> (4, 0)
+//  - warp=1 -> (32, 0)
+//    warp=2 -> (64, 0)
+//  - block is a size 1 dimension
+// where out dims are: [row (size 128), col (size 128)]
+// tile:
+//  - register=1 -> (0, 1)
+//    register=2 -> (8, 0)
+//  - lane=1 -> (0, 2)
+//    lane=2 -> (0, 4)
+//    lane=4 -> (1, 0)
+//    lane=8 -> (2, 0)
+//    lane=16 -> (4, 0)
+//  - warp=1 -> (32, 0)
+//    warp=2 -> (64, 0)
+// where out dims are: [row (size 128), col (size 8)]
+// which would not be possible to lower via the divideLeft approach as we
+// cannot divide by the tile given the `register=64 -> (16, 0)` basis.
+std::optional<LinearLayout> getReps(const LinearLayout &cvt,
+                                    const LinearLayout &tile);
+
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LAYOUTUTILS_H
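Reading the example: the tile accounts for the first two register bases and all lane and warp bases of cvt, so the leftover register bases (0, 8), (0, 16), (0, 32), (0, 64), (16, 0) describe how the 128x8 tile repeats across the 128x128 output. A simplified standalone model of the per-dimension division (an assumption about the shape of the check, not the real LinearLayout API):

  #include <optional>
  #include <utility>
  #include <vector>

  using Basis = std::pair<int, int>;   // e.g. register=2 -> (8, 0)
  using DimBases = std::vector<Basis>; // bases of one input dimension

  // If the tile's bases form a prefix of cvt's bases for this input
  // dimension, return the leftover bases, i.e. the repetitions of the
  // tile; otherwise the division fails.
  std::optional<DimBases> repsForDim(const DimBases &cvt,
                                     const DimBases &tile) {
    if (tile.size() > cvt.size())
      return std::nullopt;
    for (size_t i = 0; i < tile.size(); ++i)
      if (cvt[i] != tile[i])
        return std::nullopt;
    return DimBases(cvt.begin() + tile.size(), cvt.end());
  }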

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
     "TRITON_F32_DEFAULT",
     "TRITON_PREFER_TMEM_16x256_LAYOUT",
+    "TRITON_ENABLE_EXPERIMENTAL_CONSAN",
     "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
     "TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS",
     "TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32",
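Variables in CACHE_INVALIDATING_ENV_VARS feed into Triton's compilation cache key, so changing TRITON_ENABLE_EXPERIMENTAL_CONSAN now forces recompilation instead of reusing cached kernels. A minimal sketch of the idea (illustrative only; Triton's actual key derivation differs):

  #include <cstdlib>
  #include <functional>
  #include <set>
  #include <string>

  // Fold the current values of cache-invalidating env vars into a cache
  // key, so changing any of them yields a different key.
  std::size_t envCacheKey(const std::set<std::string> &vars) {
    std::string blob;
    for (const auto &name : vars) {
      const char *val = std::getenv(name.c_str());
      blob += name + "=" + (val ? val : "") + ";";
    }
    return std::hash<std::string>{}(blob);
  }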

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 104 deletions
@@ -706,110 +706,6 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
                maybeMaxVecElems, localLoadOp);
 }
 
-bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    Value laneId, Value warpId,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  MLIRContext *ctx = rewriter.getContext();
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-
-  StringAttr kBlock = str_attr("block");
-  StringAttr kRegister = str_attr("register");
-  StringAttr kLane = str_attr("lane");
-  StringAttr kWarp = str_attr("warp");
-  StringAttr kOffset = str_attr("offset");
-
-  auto shape = sharedTy.getShape();
-  auto paddedEnc =
-      dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
-  LinearLayout regToSharedLayout = LinearLayout::empty();
-  if (paddedEnc) {
-    const auto &sharedLL = paddedEnc.getLinearComponent();
-    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
-  } else {
-    auto sharedLL = triton::gpu::toLinearLayout(sharedTy);
-    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
-  }
-
-  // TODO(jlebar): We don't currently support loading from shared memory in a
-  // different CTA. We'd need to emit `mapa.shared::cluster` instructions.
-  if (regToSharedLayout.hasInDim(kBlock) &&
-      regToSharedLayout.hasOutDim(kBlock) &&
-      !regToSharedLayout.isTrivialOver({kBlock})) {
-    return false;
-  }
-
-  // Determine how many consecutive registers map to consecutive shmem elements
-  // in out-dimension offsetN. This is our load instruction's vector width.
-  //
-  // It's OK if the vector width we choose here is wider than the hardware
-  // supports; LLVM will legalize it.
-  int vecElems =
-      std::min({regToSharedLayout.getNumConsecutiveInOut(),
-                maxVecElems.value_or(std::numeric_limits<int>::max())});
-  if (paddedEnc) {
-    vecElems = std::min(vecElems, int(paddedEnc.getMinInterval()));
-  }
-
-  auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
-  Value blockId =
-      withCTAOffset ? target.getClusterCTAId(rewriter, loc) : b.i32_val(0);
-
-  int numElems = regToSharedLayout.getInDimSize(kRegister);
-  auto vecTy = vec_ty(elemLlvmTy, vecElems);
-  SmallVector<uint32_t> regIds;
-  for (int i = 0; i < numElems / vecElems; i++) {
-    regIds.push_back(i * vecElems);
-  }
-
-  auto smemBase = smemObj.getBase();
-
-  auto indicesVec = applyLinearLayoutVec(loc, rewriter, regToSharedLayout,
-                                         {{kRegister, b.i32_val(0)},
-                                          {kLane, laneId},
-                                          {kWarp, warpId},
-                                          {kBlock, blockId}},
-                                         regIds);
-
-  // Compute affine offset given by memdesc_subslice
-  auto offset = smemObj.getShmemOffset(loc, rewriter, sharedTy);
-  SmallVector<Value> vecAddrVec;
-  for (auto &indices : indicesVec) {
-    Value smemOffset = indices[0].second;
-    smemOffset = b.xor_(smemOffset, offset);
-    if (paddedEnc) {
-      // Apply the offset needed for padding.
-      auto bitwidth = elemLlvmTy.getIntOrFloatBitWidth();
-      Value padOffset = emitPadding(loc, rewriter, paddedEnc, bitwidth,
-                                    smemOffset, /*offsetInBytes=*/false);
-      smemOffset = b.add(smemOffset, padOffset);
-    }
-    auto vecAddr = b.gep(smemBase.getType(), elemLlvmTy, smemBase, smemOffset,
-                         LLVM::GEPNoWrapFlags::inbounds);
-    vecAddrVec.push_back(vecAddr);
-  }
-
-  for (Value &vecAddr : vecAddrVec) {
-    perVectorCallback(vecTy, vecAddr);
-  }
-  return true;
-}
-
-bool emitTransferBetweenRegistersAndShared(
-    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
-    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
-    const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  auto regLayout = triton::gpu::toLinearLayout(registerTy);
-  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
-  return emitTransferBetweenRegistersAndShared(
-      regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
-      target, laneId, warpId, perVectorCallback);
-}
-
 SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
                                     RewriterBase &rewriter) {
   assert(bool(llvmStruct) && "can not unpack null values");
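One detail of the deleted implementation worth noting: the vector width is the number of consecutive elements in the composed register-to-shared layout, clamped by the caller's maxVecElems and, for padded shared encodings, by the minimum padding interval. Isolated as a standalone function (mirrors the deleted lines; not code from this commit):

  #include <algorithm>
  #include <limits>
  #include <optional>

  // Vector width for register<->shared transfers: consecutive elements in
  // the register->shared mapping, clamped by maxVecElems and (for padded
  // layouts) by the minimum interval between padding insertions.
  int vectorWidth(int numConsecutiveInOut, std::optional<int> maxVecElems,
                  std::optional<int> minPadInterval) {
    int vec = std::min(numConsecutiveInOut,
                       maxVecElems.value_or(std::numeric_limits<int>::max()));
    if (minPadInterval)
      vec = std::min(vec, *minPadInterval);
    return vec;
  }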
