Enable conversion of the nested layout #slice->#dot->#mma to a linear layout for third-party extensions.

chengjunlu · chengjunlu · commit b1c3c7265194 · 2024-12-06T15:41:25.000Z
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -86,6 +86,15 @@ unsigned getTotalElemsPerThread(Type type) {
   if (type.isIntOrIndexOrFloat() || isa<triton::PointerType>(type))
     return 1;
   auto tensorType = cast<RankedTensorType>(type);
+
+  std::optional<LinearLayout> ll = triton::gpu::toLinearLayout(
+      tensorType.getShape(), tensorType.getEncoding());
+  if (ll.has_value()) {
+    MLIRContext *ctx = tensorType.getContext();
+    auto kRegister = StringAttr::get(ctx, "register");
+    return ll->getInDimSize(kRegister);
+  }
+  // fallback to legacy layout interface.
   return getTotalElemsPerThread(tensorType.getEncoding(), tensorType.getShape(),
                                 tensorType.getElementType());
 }
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -736,15 +736,15 @@ SliceEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   LinearLayout ret =
       LinearLayout(std::move(bases), llvm::to_vector(sliceLL.getOutDimNames()));
 
-  // Match a hack in the legacy code that ensures that the number of registers
-  // matches getTotalElemsPerThread.  Yup: We just removed all the zeros, now
-  // we're (maybe) adding some back.  :)
-  //
-  // TODO(jlebar): Once getTotalElemsPerThread uses LLs instead of the existing
-  // legacy code, I think we can remove this.
-  int expectedNumRegisters =
-      triton::gpu::getTotalElemsPerThread(RankedTensorType::get(
-          shape, IntegerType::get(ctx, 32) /*dummy type*/, *this));
+  // The semantics of the slice layout:
+  // The threads of the parent layout that are distributed along the
+  // sliced dim are squeezed so that they hold the same tensor value
+  // redundantly. Only the sizePerThreads[dim] values of the parent are reduced
+  // to one. We need to fix up the number of registers in case we just
+  // removed all the zero bases too aggressively.
+  auto sizePerThreads = triton::gpu::getSizePerThread(getParent());
+  unsigned expectedNumRegisters =
+      parentLL->getInDimSize(S("register")) / sizePerThreads[getDim()];
   if (ret.getInDimSize(S("register")) != expectedNumRegisters) {
     int extraZeros = expectedNumRegisters / ret.getInDimSize(S("register"));
     // Our use of "dim0" here is arbitrary; because we're adding zeros, any