intel
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 1 addition & 1 deletion b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/triton/Tools/GenericSwizzling.h‎
Lines changed: 11 additions & 12 deletions b/‎include/triton/Tools/GenericSwizzling.h‎
Lines changed: 11 additions & 12 deletions
diff --git a/‎include/triton/Tools/LayoutUtils.h‎
Lines changed: 6 additions & 0 deletions b/‎include/triton/Tools/LayoutUtils.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp‎
Lines changed: 0 additions & 35 deletions b/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp‎
Lines changed: 0 additions & 35 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 2 additions & 2 deletions b/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp‎
Lines changed: 5 additions & 0 deletions b/‎lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎lib/Tools/GenericSwizzling.cpp‎
Lines changed: 27 additions & 27 deletions b/‎lib/Tools/GenericSwizzling.cpp‎
Lines changed: 27 additions & 27 deletions
diff --git a/‎lib/Tools/LayoutUtils.cpp‎
Lines changed: 34 additions & 0 deletions b/‎lib/Tools/LayoutUtils.cpp‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎python/src/gluon_ir.cc‎
Lines changed: 1 addition & 2 deletions b/‎python/src/gluon_ir.cc‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎python/test/gluon/test_frontend.py‎
Lines changed: 11 additions & 12 deletions b/‎python/test/gluon/test_frontend.py‎
Lines changed: 11 additions & 12 deletions
@@ -428,7 +428,7 @@ and `pN` to mean padding:
      x1, x3, p2, p3
      ...]
 
-2. 2D single interval-padding with rearanged rows.
+2. 2D single interval-padding with rearranged rows.
 
     #ttg.padded_shared<[16:+1] {offset = [[0, 1], [0, 2], /*gap, stride by 2 rows*/[2, 0], [4, 0], [1, 0]]], block = []}>
     [
 
@@ -40,18 +40,17 @@ optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
 LinearLayout optimalSwizzlingLdSt(const LinearLayout &src,
                                   const LinearLayout &dst, int32_t bitwidth);
 
-std::pair<int, int> logBankConflictsLdSt(const LinearLayout &src,
-                                         const LinearLayout &dst,
-                                         const LinearLayout &smem,
-                                         int32_t bitwidth);
-
-int logBankConflictsMemDesc(const LinearLayout &reg, const LinearLayout &smem,
-                            int32_t bitwidth);
-
-std::pair<int, int> logBankConflicts(llvm::ArrayRef<int32_t> tileSrc,
-                                     llvm::ArrayRef<int32_t> tileDst,
-                                     const LinearLayout &smem,
-                                     int32_t bitwidth);
+std::pair<int, int> bankConflictsLdSt(const LinearLayout &src,
+                                      const LinearLayout &dst,
+                                      const LinearLayout &smem,
+                                      int32_t bitwidth);
+
+int bankConflictsMemDesc(const LinearLayout &reg, const LinearLayout &smem,
+                         int32_t bitwidth);
+
+std::pair<int, int> bankConflicts(llvm::ArrayRef<int32_t> tileSrc,
+                                  llvm::ArrayRef<int32_t> tileDst,
+                                  const LinearLayout &smem);
 } // namespace mlir::triton::gpu
 
 #endif // TRITON_GENERIC_SWIZZLING_H
@@ -141,6 +141,12 @@ LinearLayout reshapeLayout(MLIRContext *ctx, LinearLayout layout,
 // order.
 LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order);
 
+// Given a layout that maps a distributed layout into shared memory, return
+// the largest vectorisation that can be used to lower it via ld/st.
+std::pair<int, ColumnAction>
+largestVectorisation(MLIRContext *ctx, const LinearLayout &cvt, int bitwidth,
+                     std::optional<int> maybeMaxVecElems = std::nullopt);
+
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LAYOUTUTILS_H
@@ -525,41 +525,6 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
   return padOffset;
 }
 
-namespace {
-std::pair<int, ColumnAction>
-largestVectorisation(MLIRContext *ctx, const LinearLayout &cvt, int bitwidth,
-                     std::optional<int> maybeMaxVecElems = std::nullopt) {
-  // Find the largest vectorisation we can use:
-  StringAttr kReg = str_attr("register");
-  StringAttr kOffset = str_attr("offset");
-  LinearLayout quot;
-  LinearLayout tile;
-  ColumnAction permutation;
-  // If there are restrictions on the vectorisation, we don't allow
-  // permutations.
-  auto allowPerm = !maybeMaxVecElems.has_value();
-  auto maxVecElems = maybeMaxVecElems.value_or(128 / bitwidth);
-  for (int v = maxVecElems; v >= 1; v /= 2) {
-    tile = LinearLayout::identity1D(v, kReg, kOffset);
-    auto maybePerm = regPermForDivide(cvt, tile, /*left=*/true);
-    if (!maybePerm) {
-      continue;
-    }
-    permutation = *maybePerm;
-    if (!allowPerm && !permutation.isIdentity()) {
-      continue;
-    }
-    auto newCvt = permutation.apply(cvt);
-    auto maybeQuot = divideLeft(newCvt, tile);
-    if (!maybeQuot) {
-      continue;
-    }
-    return {v, permutation};
-  }
-  llvm_unreachable("Vectorization < 1 is not valid");
-}
-} // namespace
-
 SmallVector<Value>
 lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
                 ArrayRef<Value> valsArray, // Input for store, output for load
 
@@ -1938,8 +1938,8 @@ LogicalResult PaddedSharedEncodingAttr::verify(
     }
     // Ensure all non zero elements are a power of 2. Combined with the
     // broadcast check above this prevents per element swizzling. The intent of
-    // the linear component is to rearange whole rows or cache-line sized chunks
-    // of rows.
+    // the linear component is to rearrange whole rows or cache-line sized
+    // chunks of rows.
     if (!llvm::all_of(dimBases, [&](const auto &basis) {
           return llvm::all_of(
               basis, [](auto v) { return v == 0 || llvm::isPowerOf2_32(v); });
 
@@ -181,6 +181,11 @@ class CombineTMEMStoreAndAlloc : public OpRewritePattern<ttng::TMEMStoreOp> {
       return failure();
     if (alloc->getBlock() != store->getBlock())
       return failure();
+    if (auto srcDef = store.getSrc().getDefiningOp()) {
+      if (alloc->getBlock() == srcDef->getBlock() &&
+          alloc->isBeforeInBlock(srcDef))
+        return failure();
+    }
     alloc.getSrcMutable().assign(store.getSrc());
     rewriter.replaceOp(store, alloc.getToken());
     return success();
 
@@ -231,37 +231,30 @@ SmallVector<int32_t> intersectionBasis(ArrayRef<int32_t> b1,
   }
 }
 
-std::pair<int, int> logBankConflicts(ArrayRef<int32_t> tileSrc,
-                                     ArrayRef<int32_t> tileDst,
-                                     const LinearLayout &smem,
-                                     int32_t bitwidth) {
+std::pair<int, int> bankConflicts(ArrayRef<int32_t> tileSrc,
+                                  ArrayRef<int32_t> tileDst,
+                                  const LinearLayout &smem) {
   auto *ctx = smem.getOutDimNames().begin()->getContext();
   auto smemFlat = smem.flattenOuts();
   auto inDim = *smem.getInDimNames().begin();
-  // Take all the bases in the first bank (32 bits)
-  auto smemBases =
-      flatten(smemFlat.flattenIns(), *smemFlat.getInDimNames().begin());
-  auto nBankZero = llvm::Log2_32(std::max<int32_t>(1, 32 / bitwidth));
-  if (smemBases.size() >= nBankZero) {
-    smemBases.resize(nBankZero);
-  }
-  // And segments
+  // Look at the intersection between the segment bases and the tile bases
+  // We don't need to intersect with the bases that cover the bank (as in
+  // the first 32 / bitwidth bases) because if we hit any of those broadcasting
+  // will avoid the bank conflict
   auto segment = StringAttr::get(ctx, "segment");
   auto segmentBases = flatten(smemFlat, segment);
-  auto bankZero =
-      llvm::to_vector(llvm::concat<int32_t>(smemBases, segmentBases));
 
   int32_t rank = smem.getTotalOutDimSizeLog2();
   // compute conflicts
-  int write = intersectionBasis(bankZero, tileSrc, rank).size();
-  int read = intersectionBasis(bankZero, tileDst, rank).size();
-  return {read, write};
+  int write = 1 << intersectionBasis(segmentBases, tileSrc, rank).size();
+  int read = 1 << intersectionBasis(segmentBases, tileDst, rank).size();
+  return {read - 1, write - 1};
 }
 
-std::pair<int, int> logBankConflictsLdSt(const LinearLayout &src,
-                                         const LinearLayout &dst,
-                                         const LinearLayout &smem,
-                                         int32_t bitwidth) {
+std::pair<int, int> bankConflictsLdSt(const LinearLayout &src,
+                                      const LinearLayout &dst,
+                                      const LinearLayout &smem,
+                                      int32_t bitwidth) {
   auto srcFlat = src.flattenOuts();
   auto dstFlat = dst.flattenOuts();
   auto *ctx = smem.getOutDimNames().begin()->getContext();
@@ -273,19 +266,24 @@ std::pair<int, int> logBankConflictsLdSt(const LinearLayout &src,
       llvm::Log2_32(std::max(smem.getInDimSize(kVec) * bitwidth / 32, 1));
   srcLane.resize(srcLane.size() - log2Vec);
   dstLane.resize(dstLane.size() - log2Vec);
-  return logBankConflicts(srcLane, dstLane, smem, bitwidth);
+  return bankConflicts(srcLane, dstLane, smem);
 }
 
-int logBankConflictsMemDesc(const LinearLayout &reg, const LinearLayout &smem,
-                            int32_t bitwidth) {
+int bankConflictsMemDesc(const LinearLayout &reg, const LinearLayout &smem,
+                         int32_t bitwidth) {
   auto *ctx = smem.getInDimNames().begin()->getContext();
   auto S = [ctx](StringRef str) { return StringAttr::get(ctx, str); };
 
   assert(smem.hasInDim(S("offset")) && "shared layout must have an offset dim");
   assert(reg.hasInDim(S("register")) &&
          "register layout must have a register dim");
+  auto regNoBroadcast = actionRemoveBroadcastedRegs(reg).apply(reg);
+  auto regToShared = regNoBroadcast.invertAndCompose(smem);
+  auto [elemsPerVec, permutation] =
+      largestVectorisation(ctx, regToShared, bitwidth);
+  regNoBroadcast = permutation.apply(regNoBroadcast);
 
-  int32_t vecSize = reg.invertAndCompose(smem).getNumConsecutiveInOut();
+  int32_t vecSize = elemsPerVec;
   int32_t bankSize =
       std::min(32 * 32 / (vecSize * bitwidth), smem.getTotalInDimSize());
   int32_t segmentSize = smem.getTotalInDimSize() / (bankSize * vecSize);
@@ -295,7 +293,9 @@ int logBankConflictsMemDesc(const LinearLayout &reg, const LinearLayout &smem,
       {S("segment"), segmentSize},
   };
   auto smemReshaped = smem.reshapeIns(newInDims);
-  return logBankConflictsLdSt(reg, reg, smemReshaped, bitwidth).first;
+  return bankConflictsLdSt(regNoBroadcast, regNoBroadcast, smemReshaped,
+                           bitwidth)
+      .first;
 }
 
 std::optional<SmallVector<int32_t>> optimalSwizzlingTile(
@@ -675,7 +675,7 @@ optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
     for (auto [instrs, vbasis, tileSrc, tileDst] : tiles) {
       auto smem = optimalSwizzling(srcFlat, dstFlat, bitwidth, vbasis, tileSrc,
                                    tileDst, src.getOutDims());
-      auto [read, write] = logBankConflicts(tileSrc, tileDst, smem, bitwidth);
+      auto [read, write] = bankConflicts(tileSrc, tileDst, smem);
       smems.push_back({read + write, smem, {instrs.first, instrs.second}});
     }
     // Current heuristic: Minimise total bank conflicts
 
@@ -443,4 +443,38 @@ LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order) {
                       to_vector(layout.getOutDimNames()));
 }
 
+std::pair<int, ColumnAction>
+largestVectorisation(MLIRContext *ctx, const LinearLayout &cvt, int bitwidth,
+                     std::optional<int> maybeMaxVecElems) {
+  // Find the largest vectorisation we can use:
+  auto S = [ctx](StringRef str) { return StringAttr::get(ctx, str); };
+  StringAttr kReg = S("register");
+  StringAttr kOffset = S("offset");
+  LinearLayout quot;
+  LinearLayout tile;
+  ColumnAction permutation;
+  // If there are restrictions on the vectorisation, we don't allow
+  // permutations.
+  auto allowPerm = !maybeMaxVecElems.has_value();
+  auto maxVecElems = maybeMaxVecElems.value_or(128 / bitwidth);
+  for (int v = maxVecElems; v >= 1; v /= 2) {
+    tile = LinearLayout::identity1D(v, kReg, kOffset);
+    auto maybePerm = regPermForDivide(cvt, tile, /*left=*/true);
+    if (!maybePerm) {
+      continue;
+    }
+    permutation = *maybePerm;
+    if (!allowPerm && !permutation.isIdentity()) {
+      continue;
+    }
+    auto newCvt = permutation.apply(cvt);
+    auto maybeQuot = divideLeft(newCvt, tile);
+    if (!maybeQuot) {
+      continue;
+    }
+    return {v, permutation};
+  }
+  llvm_unreachable("Vectorization < 1 is not valid");
+}
+
 } // namespace mlir::triton
@@ -554,8 +554,7 @@ void init_gluon_ir(py::module &&m) {
               int bitwidth) -> int {
              auto regLayout = ttg::toLinearLayout(shape, regLayoutAttr);
              auto smemLayout = ttg::toLinearLayout(shape, sharedLayoutAttr);
-             return 1 << ttg::logBankConflictsMemDesc(regLayout, smemLayout,
-                                                      bitwidth);
+             return ttg::bankConflictsMemDesc(regLayout, smemLayout, bitwidth);
            })
       .def("create_local_dealloc",
            [](GluonOpBuilder &self, Value memDesc) -> Operation * {
 
@@ -1336,37 +1336,36 @@ def test_static_assert():
 
 
 @pytest.mark.parametrize("reg_layout, shared_layout, shape, bitwidth, ref_conflicts", [
-    (ttgl.BlockedLayout([1], [32], [4], [0]), ttgl.SwizzledSharedLayout(1, 1, 1, order=[0]), [32], 32, 1),
-    # FIXME: This one should be zero conflicts due to broadcasting.
-    (ttgl.BlockedLayout([1], [32], [4], [0]), ttgl.SwizzledSharedLayout(1, 1, 1, order=[0]), [32], 16, 2),
+    (ttgl.BlockedLayout([1], [32], [4], [0]), ttgl.SwizzledSharedLayout(1, 1, 1, order=[0]), [32], 32, 0),
+    (ttgl.BlockedLayout([1], [32], [4], [0]), ttgl.SwizzledSharedLayout(1, 1, 1, order=[0]), [32], 16, 0),
     # MMAv3 accumulator tile lowered with the 128B swizzle (WGMMA default path).
     (ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1], instr_shape=[16, 32, 16]),
-     ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=16, rank=2), [128, 128], 16, 1),
+     ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=16, rank=2), [128, 128], 16, 0),
     # Small-M tiles disable swizzling entirely.
     # MMAv2 rhs operand emitted with the 64B swizzle.
     (ttgl.DotOperandLayout(
         operand_index=1, parent=ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[1, 4], instr_shape=[16, 8]),
-        k_width=2), ttgl.NVMMASharedLayout(swizzle_byte_width=64, element_bitwidth=16, rank=2), [64, 32], 16, 2),
+        k_width=2), ttgl.NVMMASharedLayout(swizzle_byte_width=64, element_bitwidth=16, rank=2), [64, 32], 16, 0),
     # MMAv2 lhs operand uses the transposed 64B swizzle flavour.
     (ttgl.DotOperandLayout(
         operand_index=0, parent=ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[1, 4], instr_shape=[16, 8]),
         k_width=2), ttgl.NVMMASharedLayout(swizzle_byte_width=64, element_bitwidth=16, rank=2,
-                                           transposed=True), [32, 64], 16, 2),
+                                           transposed=True), [32, 64], 16, 0),
     # int8 tensor-core tiles follow the 32B swizzle path.
     (ttgl.DotOperandLayout(
         operand_index=1, parent=ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[1, 4], instr_shape=[16, 8]),
-        k_width=1), ttgl.NVMMASharedLayout(swizzle_byte_width=32, element_bitwidth=8, rank=2), [8, 32], 8, 4),
+        k_width=1), ttgl.NVMMASharedLayout(swizzle_byte_width=32, element_bitwidth=8, rank=2), [8, 32], 8, 0),
     # Small-M tiles disable swizzling entirely.
     (ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[4, 1], instr_shape=[16, 8]),
-     ttgl.NVMMASharedLayout(swizzle_byte_width=64, element_bitwidth=16, rank=2, transposed=True), [64, 64], 16, 2),
+     ttgl.NVMMASharedLayout(swizzle_byte_width=64, element_bitwidth=16, rank=2, transposed=True), [64, 64], 16, 0),
     (ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[2, 2], instr_shape=[16, 32, 16]),
-     ttgl.NVMMASharedLayout(swizzle_byte_width=64, element_bitwidth=16, rank=2), [64, 32], 16, 1),
+     ttgl.NVMMASharedLayout(swizzle_byte_width=64, element_bitwidth=16, rank=2), [64, 32], 16, 0),
     (ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[4, 1], instr_shape=[16, 8]),
-     ttgl.NVMMASharedLayout(swizzle_byte_width=32, element_bitwidth=8, rank=2), [32, 32], 8, 2),
+     ttgl.NVMMASharedLayout(swizzle_byte_width=32, element_bitwidth=8, rank=2), [32, 32], 8, 0),
     (ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[2, 4], instr_shape=[16, 8]),
-     ttgl.NVMMASharedLayout(swizzle_byte_width=0, element_bitwidth=16, rank=2), [4, 64], 16, 4),
+     ttgl.NVMMASharedLayout(swizzle_byte_width=0, element_bitwidth=16, rank=2), [4, 64], 16, 3),
     (ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1], instr_shape=[16, 32, 16]),
-     ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=32, rank=2), [128, 64], 32, 2),
+     ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=32, rank=2), [128, 64], 32, 1),
 ])
 def test_bank_conflicts(reg_layout, shared_layout, shape, bitwidth, ref_conflicts):
     dtype = {8: ttgl.int8, 16: ttgl.float16, 32: ttgl.float32}[bitwidth]
Original file line number	Diff line number	Diff line change
@@ -428,7 +428,7 @@ and `pN` to mean padding:
`428`	`428`	`x1, x3, p2, p3`
`429`	`429`	`...]`
`430`	`430`
`431`		`-2. 2D single interval-padding with rearanged rows.`
	`431`	`+2. 2D single interval-padding with rearranged rows.`
`432`	`432`
`433`	`433`	`#ttg.padded_shared<[16:+1] {offset = [[0, 1], [0, 2], /*gap, stride by 2 rows*/[2, 0], [4, 0], [1, 0]]], block = []}>`
`434`	`434`	`[`