[CommonCodeClean] Remove getElemsPerThreadForOperands (#2916)

chengjunlu · web-flow · commit f1a893a8517b · 2024-12-17T12:59:13.000+08:00
Remove `getElemsPerThreadForOperands`
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -793,11 +793,6 @@ def MmaEncodingTrait : AttrInterface<"MmaEncodingTrait"> {
                     "SmallVector<unsigned>",
                     "getRepOrderForOperand",
                     (ins "int":$opIdx)>,
-
-    InterfaceMethod<"Return element sizes per thread for dot operands.", "SmallVector<unsigned>",
-      "getElemsPerThreadForOperands", (ins "ArrayRef<int64_t>":$tensorShape,
-                                           "Type":$eltTy,
-                                           "unsigned":$opIdx)>,
   ];
 }
 
@@ -931,10 +926,6 @@ V [ 0,4,8...60   1,5...61     2,6...62     3,7...63    ]   [ 128,132...188  129,
       return contigPerThread;
     };
 
-    SmallVector<unsigned> getElemsPerThreadForOperands(ArrayRef<int64_t> shape, Type eltTy, unsigned opIdx) const {
-      llvm_unreachable("getElemsPerThreadForOperands is not supported.");
-    };
-
   }];
 
   let genVerifyDecl = 1;
@@ -1043,10 +1034,6 @@ Row |       warp 0                warp 2
       }
       return contigPerThread;
     };
-
-    SmallVector<unsigned> getElemsPerThreadForOperands(ArrayRef<int64_t> shape, Type eltTy, unsigned opIdx) const {
-      llvm_unreachable("getElemsPerThreadForOperands is not supported.");
-    };
   }];
 }
 
@@ -1171,10 +1158,6 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
       return contigPerThread;
     };
 
-    SmallVector<unsigned> getElemsPerThreadForOperands(ArrayRef<int64_t> shape, Type eltTy, unsigned opIdx) const {
-      llvm_unreachable("getElemsPerThreadForOperands is not supported.");
-    };
-
   }];
 
   let hasCustomAssemblyFormat = 1;
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -70,6 +70,15 @@ unsigned getTotalElemsPerThread(Type type) {
   if (type.isIntOrIndexOrFloat() || isa<triton::PointerType>(type))
     return 1;
   auto tensorType = cast<RankedTensorType>(type);
+
+  std::optional<LinearLayout> ll = triton::gpu::toLinearLayout(
+      tensorType.getShape(), tensorType.getEncoding());
+  if (ll.has_value()) {
+    MLIRContext *ctx = tensorType.getContext();
+    auto kRegister = StringAttr::get(ctx, "register");
+    return ll->getInDimSize(kRegister);
+  }
+  // Fall back to the legacy layout interface.
   return getTotalElemsPerThread(tensorType.getEncoding(), tensorType.getShape(),
                                 tensorType.getElementType());
 }
@@ -1065,9 +1074,6 @@ DotOperandEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape,
     return regs;
   }
 
-  if (auto mmaParent = mlir::dyn_cast<MmaEncodingTrait>(getParent())) {
-    return mmaParent.getElemsPerThreadForOperands(shape, eltTy, getOpIdx());
-  }
   llvm_unreachable("getElemsPerThread is not supported for dot operand");
   return SmallVector<unsigned>();
 }
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -736,15 +736,15 @@ SliceEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
   LinearLayout ret =
       LinearLayout(std::move(bases), llvm::to_vector(sliceLL.getOutDimNames()));
 
-  // Match a hack in the legacy code that ensures that the number of registers
-  // matches getTotalElemsPerThread.  Yup: We just removed all the zeros, now
-  // we're (maybe) adding some back.  :)
-  //
-  // TODO(jlebar): Once getTotalElemsPerThread uses LLs instead of the existing
-  // legacy code, I think we can remove this.
-  int expectedNumRegisters =
-      triton::gpu::getTotalElemsPerThread(RankedTensorType::get(
-          shape, IntegerType::get(ctx, 32) /*dummy type*/, *this));
+  // Triton generates a homogeneous kernel that runs on every thread.
+  // The threads of the parent layout that are distributed along the sliced
+  // dim are squeezed so that they redundantly hold the same tensor value, and
+  // the sizePerThreads[dim] values of the parent are reduced to a single one.
+  // We need to fix up the number of registers in case we just removed the
+  // zeros too aggressively.
+  auto sizePerThreads = triton::gpu::getSizePerThread(getParent());
+  unsigned expectedNumRegisters =
+      parentLL->getInDimSize(S("register")) / sizePerThreads[getDim()];
   if (ret.getInDimSize(S("register")) != expectedNumRegisters) {
     int extraZeros = expectedNumRegisters / ret.getInDimSize(S("register"));
     // Our use of "dim0" here is arbitrary; because we're adding zeros, any
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td
@@ -100,9 +100,6 @@ along the row (resp. col) dimension.
     SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, unsigned opIdx) const {
       return getSizePerThreadForOperand(kWidth, static_cast<OpIdx>(opIdx));
     }
-    SmallVector<unsigned> getElemsPerThreadForOperands(ArrayRef<int64_t> shape, Type eltTy, unsigned opIdx) const {
-      return getElemsPerThreadForOperands(shape, eltTy, static_cast<OpIdx>(opIdx));
-    }
     SmallVector<unsigned> getRepOrderForOperand(unsigned opIdx) const {
       return getRepOrderForOperand(static_cast<OpIdx>(opIdx));
     }
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -347,22 +347,6 @@ DpasEncodingAttr::getSizePerThreadForOperand(int kWidth, OpIdx opIdx) const {
   llvm_unreachable("unexpected opIdx");
 }
 
-SmallVector<unsigned>
-DpasEncodingAttr::getElemsPerThreadForOperands(ArrayRef<int64_t> shape,
-                                               Type eltTy, OpIdx opIdx) const {
-  SmallVector<unsigned> sizePerThread = getSizePerThreadForOperand(0, opIdx);
-  SmallVector<int64_t> repetitions = getDPASRepetitions(shape, opIdx);
-
-  size_t rank = shape.size();
-  SmallVector<unsigned> elemsPerThread(rank);
-  if (rank == 3)
-    elemsPerThread[0] = repetitions[0];
-  elemsPerThread[rank - 2] = sizePerThread[0] * repetitions[1];
-  elemsPerThread[rank - 1] = sizePerThread[1] * repetitions[2];
-
-  return elemsPerThread;
-};
-
 SmallVector<unsigned> DpasEncodingAttr::getContigPerThread() const {
   size_t rank = getWarpsPerCTA().size();
   assert(rank == 2 || rank == 3);

Original file line number	Diff line number	Diff line change
`@@ -100,9 +100,6 @@ along the row (resp. col) dimension.`
`100`	`100`	`SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, unsigned opIdx) const {`
`101`	`101`	`return getSizePerThreadForOperand(kWidth, static_cast<OpIdx>(opIdx));`
`102`	`102`	`}`
`103`		`- SmallVector<unsigned> getElemsPerThreadForOperands(ArrayRef<int64_t> shape, Type eltTy, unsigned opIdx) const {`
`104`		`- return getElemsPerThreadForOperands(shape, eltTy, static_cast<OpIdx>(opIdx));`
`105`		`- }`
`106`	`103`	`SmallVector<unsigned> getRepOrderForOperand(unsigned opIdx) const {`
`107`	`104`	`return getRepOrderForOperand(static_cast<OpIdx>(opIdx));`
`108`	`105`	`}`