Commit 6e4647e

[BACKEND] Lower tcgen05.cp via the generic matrix descriptor lowering (#8338)
This also fixes a bug found by @masahi, but this uncovered a PTX bug, so we stay as we were on that front lol
1 parent: aafec41

5 files changed: +154 −290 lines

python/test/gluon/test_core.py

Lines changed: 0 additions & 2 deletions
@@ -692,8 +692,6 @@ def kernel(in_ptr, out_ptr, smem_h: ttgl.constexpr, smem_w: ttgl.constexpr, num_
     tmem_alias: ttgl.constexpr = TensorMemoryLayout((num_rows, num_cols), col_stride=1)
     tmem = tmem._reinterpret(ttgl.int8, (num_rows, num_cols), tmem_alias)
     value = tmem.load(blocked)
-    ttgl.static_print(ttgl.to_linear_layout(blocked, (smem_h, smem_w)))
-    ttgl.static_print(ttgl.to_linear_layout(blocked, (num_rows, num_cols)))
     ttgl.store(ttgl.set_auto_layout(out_ptrs, blocked), value)
 
     torch.manual_seed(0)

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAHelpers.h

Lines changed: 88 additions & 99 deletions
@@ -28,7 +28,6 @@ struct MMASMEMDescriptor {
   SMEMDescriptor descriptor;
   int32_t swizzlingByteWidth;
   int32_t bitwidth;
-  bool twoCTAs;
   bool transposed;
   bool fp4Padded;
 };
@@ -53,77 +52,67 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
 
   DotOpMmaSmemLoader(MMASMEMDescriptor desc, Value baseb128, LinearLayout llInv,
                      ArrayRef<unsigned> instrShape)
-      : desc(desc), baseb128(baseb128), llInv(std::move(llInv)),
+      : desc(desc), baseb128(baseb128), ll(std::move(llInv)),
        instrShape(instrShape) {}
 
   static DotOpMmaSmemLoader
-  build(Location loc, RewriterBase &rewriter, triton::gpu::MemDescType tensor,
+  build(Location loc, RewriterBase &rewriter, gpu::MemDescType memTy,
        Value smemBase, ArrayRef<unsigned> instrShape, int mmaVersion,
+        bool isFp4 = false,
        std::optional<RankedTensorType> mmaTy = std::nullopt,
        std::optional<unsigned> MNdim = std::nullopt) {
-    auto ctx = tensor.getContext();
+    auto ctx = rewriter.getContext();
+    auto kOffset = str_attr("offset");
+    // The handling of subviews is not as fine as it could be
+    // We could compose with the identity of the memTy.getShape()
+    // (at the moment llInv will be of allocShape), but then
+    // we would need to handle the getReps part more carefuly
+    // This way we could support more subviews that we don't
+    // We can implement this generalisation in the future if needed
+    auto llInv = toLinearLayout(memTy).pseudoinvert();
+    auto bitwidth = memTy.getElementType().getIntOrFloatBitWidth();
+    if (isFp4) {
+      // hacky but well
+      auto dims = to_vector(llInv.getInDimNames());
+      auto trans = llInv.getBasis(dims[0], 0, kOffset) == 1;
+      llInv = LinearLayout::identity1D(2, dims[trans ? 0 : 1], kOffset) * llInv;
+      bitwidth /= 2;
+      // The instr_shape comes in number of elements already
+    }
+    return build(loc, rewriter, llInv, bitwidth, smemBase, instrShape,
+                 mmaVersion, mmaTy, MNdim);
+  }
+
+  static DotOpMmaSmemLoader
+  build(Location loc, RewriterBase &rewriter, const LinearLayout &ll,
+        int bitwidth, Value smemBase, ArrayRef<unsigned> instrShapeArray,
+        int mmaVersion, std::optional<RankedTensorType> mmaTy = std::nullopt,
+        std::optional<unsigned> MNdim = std::nullopt) {
+    // ll is a map from two dimensions (dim0, dim1) or (row, col) into offsets
+    // and blocks
+    auto ctx = rewriter.getContext();
+    auto kOffset = str_attr("offset");
+    auto kBlock = str_attr("block");
+    assert(ll.getNumOutDims() == 2);
+    assert(ll.hasOutDim(kOffset) && ll.hasOutDim(kBlock));
+
     assert(mmaVersion == 3 || mmaVersion == 5);
     // Just needed for MMAv3
     assert(mmaTy.has_value() == (mmaVersion == 3));
     assert(MNdim.has_value() == (mmaVersion == 3));
     if (mmaVersion == 3) {
       assert(MNdim.value() < 2);
     }
+    auto instrShape = to_vector(instrShapeArray);
     assert(instrShape.size() == 2);
     auto b = TritonLLVMOpBuilder(loc, rewriter);
-    // TODO Assert that calling getShmemAffineBase is valid!
-
-    // Due to the alignment, we can transform ((base + offset) & 0x3FFFF) >> 4
-    // into ((base >> 4) & 0x3FFF + (offset >> 4) where offset is in the inner
-    // loop and ((base >> 4) & 0x3FFF) can be computed once.
-    assert(cast<triton::gpu::SharedEncodingTrait>(tensor.getEncoding())
-               .getAlignment() >= 16);
 
+    // Due to having a 16B alignment, we can compute the offsets in 128b
+    // elements
+    // TODO We should assert in the verifier that the alignment is at least 16B
     smemBase = b.ptrtoint(i32_ty, smemBase);
     Value baseSrcb128 = b.lshr(smemBase, b.i32_val(4));
-    int bitwidth = tensor.getElementType().getIntOrFloatBitWidth();
 
-    auto ll = toLinearLayout(tensor);
-    auto kOffset = str_attr("offset");
-    assert(ll.getNumOutDims() == 2);
-    auto dims = to_vector(ll.getOutDimNames());
-    // The linear layout for fp4 represents the matrix as i8s
-    // For it to play ball with instrShape, which is in terms of the original
-    // tensor, we need to represent it as i4s
-    // Interestingly enough, we support i8 x i8 matmul by the looks of it
-    auto isFp4 =
-        tensor.getElementType() == IntegerType::get(ctx, 8) && mmaVersion == 5;
-    auto shape = to_vector(tensor.getShape());
-    if (isFp4) {
-      // hacky but well
-      auto trans = ll.getBasis(kOffset, 0)[0] != 0;
-      ll = LinearLayout::identity1D(2, kOffset, dims[trans ? 0 : 1]) * ll;
-      shape[trans ? 0 : 1] *= 2;
-      bitwidth /= 2;
-      // The instr_shape comes in number of elements already
-    }
-
-    for (auto [dim, instrSize] : llvm::zip(ll.getOutDimNames(), instrShape)) {
-      assert(instrSize <= ll.getOutDimSize(dim) &&
-             "Instr shape is too large for the layout");
-    }
-
-    // TODO Add this to the verifier
-    // We represent fp4 padded tensors as i8s
-    auto desc = getDescriptor(ll, instrShape, bitwidth, mmaVersion);
-
-    // In case it was a subview, we resize it by composing it with the identity
-    // of shape getShape (rather than shape getAllocShape, as toLinearLayout
-    // returns)
-    // Also in the case of mutlicta where the different CTAs have broadcasting
-    // (so no 2-CTA MMA) we effectively need to pseudoinvert. This also achieves
-    // that
-    auto outDims = to_vector(ll.getOutDimNames());
-    auto identity = LinearLayout::identity1D(shape[0], outDims[0], outDims[0]) *
-                    LinearLayout::identity1D(shape[1], outDims[1], outDims[1]);
-    auto llInv = identity.invertAndCompose(ll);
-
-    auto blockInstrShape = to_vector(instrShape);
     if (mmaVersion == 3) {
       auto mndim = MNdim.value();
       auto mmaLl = gpu::toLinearLayout(mmaTy.value());
@@ -133,15 +122,15 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
       auto mmaWarps = mmaLl.sublayout({kWarp}, {outDims[mndim]}) *
                       LinearLayout::identity1D(1, kWarp, outDims[1 - mndim]);
       // Map from warps to offsets in bitwidth elements
-      auto warpToOffset = mmaWarps.compose(llInv);
+      auto warpToOffset = mmaWarps.compose(ll);
       // Map from warps to offsets in 128b elements
       auto maybeWarpToOffsetb128 =
          divideLeft(warpToOffset,
                     LinearLayout::zeros1D(1, kWarp, kOffset, 128 / bitwidth));
       assert(maybeWarpToOffsetb128.has_value());
       // zero out the first two warp bases to have a warpgroup to offset map
-      assert(maybeWarpToOffsetb128->getNumOutDims() == 2);
       auto bases = maybeWarpToOffsetb128->getBases();
+      assert(maybeWarpToOffsetb128->getNumOutDims() == 2);
       bases[kWarp][0] = {0, 0};
       bases[kWarp][1] = {0, 0};
       auto warpGroupToOffsetb128 = LinearLayout(
@@ -152,34 +141,40 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
                               {{kWarp, warpId}})[0]
                              .second;
       baseSrcb128 = b.add(baseSrcb128, warpStrideb128);
-      // Increase the instruction shape to describe the size at a warp level
-      // A bit hacky but well
+      // Increase the instruction shape to describe the size at a block level
+      // as the input just describes it at a warp level
       int logwgAlongMN = 0;
       for (int i = 0; i < warpGroupToOffsetb128.getInDimSizeLog2(kWarp); i++) {
        if (warpGroupToOffsetb128.getBasis(kWarp, i, kOffset) != 0) {
          logwgAlongMN++;
        }
      }
-      blockInstrShape[mndim] *= (1 << logwgAlongMN);
+      instrShape[mndim] *= (1 << logwgAlongMN);
     }
 
+    for (auto [dim, instrSize] : llvm::zip(ll.getInDimNames(), instrShape)) {
+      assert(instrSize <= ll.getInDimSize(dim) &&
+             "Instruction shape is too large for the layout");
+    }
+
+    auto desc = getDescriptor(ll, instrShape, bitwidth, mmaVersion);
+
     Value baseb128 = b.zext(i64_ty, b.and_(baseSrcb128, b.i32_val(0x3FFF)));
-    return DotOpMmaSmemLoader(desc, baseb128, llInv, blockInstrShape);
+    return {desc, baseb128, ll, instrShape};
   }
 
   Value smemLoad(int a, int b, ConversionPatternRewriter &rewriter,
                 Location loc) const {
     auto *ctx = loc.getContext();
     auto tb = TritonLLVMOpBuilder(loc, rewriter);
-    auto dims = to_vector(llInv.getInDimNames());
-    assert((a + 1) * instrShape[0] <= llInv.getInDimSize(dims[0]));
-    assert((b + 1) * instrShape[1] <= llInv.getInDimSize(dims[1]));
-    assert(to_vector(llInv.getOutDimNames()) ==
+    auto dims = to_vector(ll.getInDimNames());
+    assert((a + 1) * instrShape[0] <= ll.getInDimSize(dims[0]));
+    assert((b + 1) * instrShape[1] <= ll.getInDimSize(dims[1]));
+    assert(to_vector(ll.getOutDimNames()) ==
           llvm::to_vector(
              ArrayRef<StringAttr>{str_attr("offset"), str_attr("block")}));
-    int32_t totalOffElems = llInv
-                                .apply({{dims[0], a * instrShape[0]},
-                                        {dims[1], b * instrShape[1]}})[0]
+    int32_t totalOffElems = ll.apply({{dims[0], a * instrShape[0]},
                                      {dims[1], b * instrShape[1]}})[0]
                                .second;
     int32_t smemByteOffsetb8 = totalOffElems * desc.bitwidth / 8;
     auto currDesc = desc.descriptor;
@@ -198,37 +193,22 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
     return {smemLoad(a, b, rewriter, loc), std::nullopt};
   }
 
+  MMASMEMDescriptor &getDescriptor() { return desc; }
+
 private:
   MMASMEMDescriptor desc;
   Value baseb128;
-  LinearLayout llInv;
+  LinearLayout ll;
   SmallVector<unsigned> instrShape;
 
   static MMASMEMDescriptor getDescriptor(const LinearLayout &ll,
                                         ArrayRef<unsigned> instrShape,
                                         int bitwidth, int mmaVersion) {
     // ll is a map from allocShape into offsets and blocks
-    auto inv = ll.pseudoinvert();
-    auto dims = to_vector(inv.getInDimNames());
+    auto dims = to_vector(ll.getInDimNames());
     auto ctx = dims[0].getContext();
     auto kOffset = str_attr("offset");
 
-    // Detect tcgen05.mma.cta_group::2 as having two CTAs that are not
-    // broadcasting
-    auto kBlock = str_attr("block");
-    auto twoCTAs = ll.getInDimSize(kBlock) > 1 &&
-                   ll.getBasis(kBlock, 0) != ArrayRef<int32_t>({0, 0});
-    SmallVector<unsigned> instrShapePerCTA = to_vector(instrShape);
-    if (twoCTAs) {
-      // In 2CTA mode we split the tensor into two CTAs
-      assert(ll.getInDimSize(kBlock) == 2);
-      if (ll.getBasis(kBlock, 0, dims[0]) != 0) {
-        instrShapePerCTA[0] /= 2;
-      } else {
-        instrShapePerCTA[1] /= 2;
-      }
-    }
-
     // Any CTALayout, it's not really used within getCoreMatrixLinearLayout
     auto CTALayout = triton::gpu::CTALayoutAttr::getDefault(ctx, 2);
 
@@ -242,11 +222,19 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
                                           CTALayout);
     auto shmemTile =
        getCoreMatrixLinearLayout(shmemEnc, /*disableSwizzle=*/false);
-    // We unpack the bitwidth == 8 tile
+    // Rename out dims to match the original layout (in case the dims were
+    // (row, col))
+    auto outDims = to_vector(shmemTile.getOutDims());
+    outDims[0].first = dims[0];
+    outDims[1].first = dims[1];
+    shmemTile = LinearLayout(shmemTile.getBases(), outDims,
+                             /*requireSurjective=*/false);
+    // unpack the fp4 layout
     if (bitwidth == 4) {
       shmemTile =
          LinearLayout::identity1D(2, kOffset, dims[1]) * shmemTile;
     }
+
     // getCoreMatrixLinearLayout gives the k-contiguous tile
     // shmemTile is a layout onto a matrix with shape
     // If swizzling != 0: 8 x (8 * swizzling / bitwidth)
@@ -266,10 +254,12 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
     // The PTX docs are wrong in a number of ways:
     // 1) LBO can be specified for !transposed && swizzled != 0
     //    PTX says it's assumed to be 1, but we can in fact use it
-    // 2) LBO / SBO are swapped also for !transposed && swizzled != 0
+    // 2) LBO / SBO are swapped also for !transposed && swizzled == 0
     //    PTX just reports this for the transposed case
-    // Luckily enough the generic logic is much simpler than what's
-    // described in the docs
+    // EVEN MORE the computation we do here is conceptually correct
+    // and it agrees with the tensor descriptors for wgmma or
+    // tcgen05.mma but not for tcgen05.cp! tcgen05.cp follows the PTX
+    // docs!
     int lbo = 0, sbo = 0;
     int leadingDim = transposed ? 0 : 1;
     int stridedDim = transposed ? 1 : 0;
@@ -279,13 +269,13 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
      std::swap(leadingDim, stridedDim);
     }
     auto log2RowsTile = shmemTileInv.getInDimSizeLog2(dims[leadingDim]);
-    if (inv.getInDimSizeLog2(dims[leadingDim]) > log2RowsTile) {
-      lbo = inv.getBasis(dims[leadingDim], log2RowsTile, kOffset);
+    if (llvm::Log2_32(instrShape[leadingDim]) > log2RowsTile) {
+      lbo = ll.getBasis(dims[leadingDim], log2RowsTile, kOffset);
     }
 
     auto log2ColsTile = shmemTileInv.getInDimSizeLog2(dims[stridedDim]);
-    if (inv.getInDimSizeLog2(dims[stridedDim]) > log2ColsTile) {
-      sbo = inv.getBasis(dims[stridedDim], log2ColsTile, kOffset);
+    if (llvm::Log2_32(instrShape[stridedDim]) > log2ColsTile) {
+      sbo = ll.getBasis(dims[stridedDim], log2ColsTile, kOffset);
     }
 
     // Pad the tile up to the full instruction shape with the relevant
@@ -294,9 +284,9 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
     for (int d : {0, 1}) {
       // 'tile' with the atom tile according to the lbo/sbo rules
       for (int i = 1;
-           i < instrShapePerCTA[d] / shmemTileInv.getInDimSize(dims[d]);
+           i < instrShape[d] / shmemTileInv.getInDimSize(dims[d]);
           i *= 2) {
-        auto stride = inv.getBasis(
+        auto stride = ll.getBasis(
            dims[d], shmemTileInv.getInDimSizeLog2(dims[d]), kOffset);
        bases[dims[d]].push_back({stride * i});
      }
@@ -316,11 +306,11 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
     shmemTileInv *=
        LinearLayout::identity1D(1, dims[0], str_attr("block"));
 
-    auto quot = getReps(inv, shmemTileInv);
-    if (quot.has_value()) {
+    auto reps = getReps(ll, shmemTileInv);
+    if (reps.has_value()) {
      SMEMDescriptor desc;
      desc.descriptor = mmaVersion == 5 ? 1ULL << 46 : 0ULL;
-      // The lbo / sbo is defined wrt. the 128 tile
+      // The lbo / sbo is defined wrt. the 128b elements
      desc.leadDimensionBaseOffset = (lbo * bitwidth / 8) >> 4;
      desc.strideDimensionBaseOffset = (sbo * bitwidth / 8) >> 4;
      switch (swizzling) {
@@ -342,7 +332,6 @@ class DotOpMmaSmemLoader : public DotOpMmaMemLoader {
     return {.descriptor = desc,
            .swizzlingByteWidth = swizzling,
            .bitwidth = bitwidth,
-            .twoCTAs = twoCTAs,
            .transposed = transposed,
            .fp4Padded = fp4Padded};
   }
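
The removed comment above spells out the alignment trick the new code still relies on: with a 16-byte-aligned base and 16-byte-aligned tile offsets, the descriptor base ((base + offset) & 0x3FFFF) >> 4 can be split into a hoisted ((base >> 4) & 0x3FFF) plus a per-tile (offset >> 4). Below is a minimal standalone sketch, not part of the commit, that checks this identity; the loop bounds are arbitrary, and the split sum is reduced to the same 14-bit width the descriptor field stores so the comparison is exact.

#include <cassert>
#include <cstdint>

int main() {
  // base: 16B-aligned shared-memory address; offset: 16B-aligned tile offset.
  for (uint32_t base = 0; base < (1u << 20); base += 16) {
    for (uint32_t offset = 0; offset < (1u << 18); offset += 16 * 257) {
      // Single-shot form from the removed comment.
      uint32_t fused = ((base + offset) & 0x3FFFFu) >> 4;
      // Hoisted form: (base >> 4) & 0x3FFF is computed once per tensor,
      // offset >> 4 varies per tile; the sum wraps in the 14-bit field.
      uint32_t split = (((base >> 4) & 0x3FFFu) + (offset >> 4)) & 0x3FFFu;
      assert(fused == split);
    }
  }
  return 0;
}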

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv5.cpp

Lines changed: 17 additions & 7 deletions
@@ -453,14 +453,24 @@ void convertDotImpl(const LLVMTypeConverter &typeConverter,
    aLoader = std::make_unique<DotOpMmaV5TmemLoader>(a, baseA, aOperandShape,
                                                     interleaved, transA);
  } else {
-    auto allocShapeA = getAllocShape(aTensorTy, 1);
+    auto isFp4a = op.numBitsPerElementA == 4;
    aLoader = std::make_unique<DotOpMmaSmemLoader>(DotOpMmaSmemLoader::build(
-        loc, rewriter, aTensorTy, baseA, aOperandShape, 5));
+        loc, rewriter, aTensorTy, baseA, aOperandShape, 5, isFp4a));
  }
 
+  auto isFp4b = op.numBitsPerElementB == 4;
  auto allocShapeB = getAllocShape(bTensorTy, 0);
+  // [Instr shape twoCTAs]
+  // This division by 2 in 2CTA mode a bit subtle:
+  // The issue here is that in 2CTA you multiply in one instruction a tensor
+  // of shape MNK = 256, K, N, and you put it into TMEM of shape 128, K, N*2.
+  // So to compute the shapePerCTA, on the lhs we can look at the TMEM shape,
+  // but to compute the shapePerCTA on the rhs, we need to divide by 2.
+  // Something similar happens when we multiply by 2 the mmaSizeM when creating
+  // It's a massive code smell tho
  DotOpMmaSmemLoader bLoader = DotOpMmaSmemLoader::build(
-      loc, rewriter, bTensorTy, baseB, {mmaSizeK, mmaSizeN}, 5);
+      loc, rewriter, bTensorTy, baseB, {mmaSizeK, mmaSizeN / (twoCTAs ? 2 : 1)},
+      5, isFp4b);
 
  DotConversion::InstDesc desc{mmaSizeM, mmaSizeN, {numRepM, numRepN, numRepK},
                               transA, transB, interleaved,
@@ -522,6 +532,8 @@ void convertDot(const LLVMTypeConverter &typeConverter,
                  Value pred, Value useInitAcc,
                  const DotConversion::InstDesc &desc, int m, int n,
                  int k) {
+    // To understand this multiplication by 2, see the comment
+    // [Instr shape twoCTAs]
    Value instDescriptor = createInstDescriptor(
        rewriter, op, twoCTAs ? desc.mmaSizeM * 2 : desc.mmaSizeM,
        desc.mmaSizeN, desc.transA, desc.transB);
@@ -594,10 +606,8 @@ void convertScaledDot(const LLVMTypeConverter &typeConverter,
    dot.shapeB[0] *= 2;
  }
 
-  dot.numBitsPerElementA = opKindIsMXFP4 ? getFormatBitSize(op.getAType())
-                                         : aTensorTy.getElementTypeBitWidth();
-  dot.numBitsPerElementB = opKindIsMXFP4 ? getFormatBitSize(op.getBType())
-                                         : bTensorTy.getElementTypeBitWidth();
+  dot.numBitsPerElementA = getFormatBitSize(op.getAType());
+  dot.numBitsPerElementB = getFormatBitSize(op.getBType());
 
  TritonLLVMOpBuilder tb(loc, rewriter);
  Value baseD = tb.ptrtoint(i32_ty, adaptor.getD());
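
The [Instr shape twoCTAs] comment above describes the per-CTA bookkeeping this hunk implements: a cta_group::2 MMA computes an M = 256 tile while each CTA's TMEM holds only 128 rows, so the B operand's per-CTA instruction shape halves N, and the instruction descriptor is later created with 2 * mmaSizeM. The sketch below, not part of the commit and using a hypothetical struct and helper name, just restates that arithmetic.

#include <cassert>
#include <cstdio>

struct InstrShapes {
  unsigned bShape[2];   // per-CTA shape passed when building the B smem loader
  unsigned descriptorM; // M encoded into the tcgen05 instruction descriptor
};

InstrShapes computeShapes(unsigned mmaSizeM, unsigned mmaSizeN,
                          unsigned mmaSizeK, bool twoCTAs) {
  InstrShapes s;
  s.bShape[0] = mmaSizeK;
  // rhs shapePerCTA divides N by 2 in 2CTA mode, as in the bLoader build call.
  s.bShape[1] = twoCTAs ? mmaSizeN / 2 : mmaSizeN;
  // The descriptor advertises the full M of the instruction (2 * per-CTA M).
  s.descriptorM = twoCTAs ? mmaSizeM * 2 : mmaSizeM;
  return s;
}

int main() {
  auto s = computeShapes(/*M=*/128, /*N=*/256, /*K=*/32, /*twoCTAs=*/true);
  assert(s.bShape[1] == 128 && s.descriptorM == 256);
  std::printf("B instr shape = {%u, %u}, descriptor M = %u\n",
              s.bShape[0], s.bShape[1], s.descriptorM);
  return 0;
}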
