
Commit 80e2abd

[BACKEND] Remove decomposition of splat -> shared conversion (#5450)
1 parent: a52c88a

4 files changed: 0 additions & 33 deletions

include/triton/Conversion/TritonGPUToLLVM/Patterns.h

Lines changed: 0 additions & 4 deletions
@@ -13,10 +13,6 @@ namespace triton::gpu {
 /// |module| op because the codegen doesn't handle `blocked -> dot_op` directly.
 void decomposeBlockedToDotLayoutConversion(ModuleOp module);

-/// Replaces `splat -> shared` with `splat -> blocked -> shared` in the given
-/// |module| op.
-void decomposeSplatOpToSharedLayoutConversion(ModuleOp module);
-
 /// Replaces `mma/mfma -> dot_op` with `mma/mfma -> blocked -> dot_op` in the
 /// given |module| op, but bypass the decomposition if |shortcutFn| returns
 /// true.

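For context on the removed API: the doc comment above states that the helper replaced `splat -> shared` with `splat -> blocked -> shared`. A hypothetical Triton GPU IR sketch of that rewrite (op syntax and encoding names are illustrative, not taken from this commit):

    // Before the decomposition: a splat materializing a shared-encoded tensor.
    %0 = tt.splat %x : f32 -> tensor<32x32xf32, #shared>

    // After the decomposition: splat into a blocked layout, then convert.
    %1 = tt.splat %x : f32 -> tensor<32x32xf32, #blocked>
    %2 = triton_gpu.convert_layout %1 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #shared>

This commit deletes the rewrite, so a `splat -> shared` conversion is no longer split apart before lowering.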
lib/Conversion/TritonGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 0 additions & 26 deletions
@@ -18,32 +18,6 @@ static void addAttrs(Operation *op, ArrayRef<mlir::NamedAttribute> attrs) {

 namespace mlir::triton::gpu {

-void decomposeSplatOpToSharedLayoutConversion(ModuleOp module) {
-  int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(module);
-  int numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(module);
-  int threadsPerWarp = triton::gpu::TritonGPUDialect::getThreadsPerWarp(module);
-  module.walk([&](triton::SplatOp splatOp) -> void {
-    auto dstType = cast<RankedTensorType>(splatOp.getType());
-    auto shared =
-        dyn_cast<triton::gpu::SharedEncodingAttr>(dstType.getEncoding());
-    if (shared) {
-      OpBuilder builder(splatOp);
-      SmallVector<unsigned, 4> sizePerThread(dstType.getRank(), 1);
-      auto newType = RankedTensorType::get(
-          dstType.getShape(), dstType.getElementType(),
-          triton::gpu::BlockedEncodingAttr::get(
-              module.getContext(), dstType.getShape(), sizePerThread,
-              getOrder(shared), numWarps, threadsPerWarp, numCTAs));
-      auto newSplat = builder.create<triton::SplatOp>(splatOp.getLoc(), newType,
-                                                      splatOp.getSrc());
-      auto newConvert = builder.create<triton::gpu::ConvertLayoutOp>(
-          splatOp.getLoc(), dstType, newSplat.getResult());
-      splatOp.replaceAllUsesWith(newConvert.getResult());
-      splatOp.erase();
-    }
-  });
-}
-
 void decomposeTensorCoreToDotLayoutConversion(ModuleOp module,
                                               ShortcutFn shortcutFn) {
   int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(module);

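As a worked example of the intermediate layout the removed walk built (attribute values are hypothetical; the actual distribution comes from `BlockedEncodingAttr::get`): per the code above, `sizePerThread` is 1 in every dimension, the order is copied from the shared encoding, and the warp/thread/CTA counts come from the module, so a 32x32 splat with 4 warps of 32 threads and 1 CTA might get encodings like:

    // Hypothetical attributes for a 32x32 tensor with order = [1, 0],
    // numWarps = 4, threadsPerWarp = 32, numCTAs = 1.
    #shared  = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
    #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32],
                                    warpsPerCTA = [4, 1], order = [1, 0]}>
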
third_party/amd/lib/TritonAMDGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 0 additions & 2 deletions
@@ -34,8 +34,6 @@ struct DecomposeUnsupportedAMDConversions
     int numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(mod);
     int threadsPerWarp = triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod);

-    triton::gpu::decomposeSplatOpToSharedLayoutConversion(mod);
-
     auto isShortcut =
         mlir::triton::gpu::ShortcutFn(std::not_fn(cvtNeedsSharedMemory));

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 0 additions & 1 deletion
@@ -76,7 +76,6 @@ struct DecomposeUnsupportedConversions
     auto nvidiaShortCutFn = [&](RankedTensorType srcTy,
                                 RankedTensorType dstTy) { return true; };
     ModuleOp mod = getOperation();
-    triton::gpu::decomposeSplatOpToSharedLayoutConversion(mod);
     triton::gpu::decomposeTensorCoreToDotLayoutConversion(mod,
                                                           nvidiaShortCutFn);
     triton::gpu::decomposeBlockedToDotLayoutConversion(mod);
