Do not add a barrier op for the subgroup 2D block -> DPAS layout conversion

alexbaden · alexbaden · commit e19e02fdca19 · 2025-06-13T16:09:21.000Z
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h
@@ -33,6 +33,12 @@ Attribute inferSrcEncoding(Operation *op, Attribute encoding);
 // Retuns true if the operation is an expensive load or store operation.
 bool isExpensiveLoadOrStore(Operation *op);
 
+// Returns true if the layout conversion from srcType to dstType should be a
+// no-op. Temporary: to be removed once layout conversion for BlockIO types is
+// lifted out of LoadStoreOpToLLVM.cpp.
+bool isBlockIONoOpConversion(RankedTensorType srcType,
+                             RankedTensorType dstType);
+
 // Returns true if the tensor type has a subgroup 2d block io encoding
 bool hasSubgroup2DBlockEncoding(RankedTensorType tensorType);
 
diff --git a/third_party/intel/lib/Analysis/Allocation.cpp b/third_party/intel/lib/Analysis/Allocation.cpp
@@ -1,5 +1,6 @@
 #include "intel/include/Analysis/Allocation.h"
 #include "intel/include/Analysis/Utility.h"
+#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h" // isBlockIONoOpConversion
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -11,6 +12,9 @@ constexpr unsigned invalidSize = -1;
 unsigned allocationAnalysisScratchSizeFn(gpu::ConvertLayoutOp convertLayout) {
   RankedTensorType srcTy = convertLayout.getSrc().getType();
   RankedTensorType dstTy = convertLayout.getResult().getType();
+
+  if (gpu::intel::isBlockIONoOpConversion(srcTy, dstTy))
+    return 0;
   if (gpu::intel::cvtIsSubGroupShuffle(srcTy, dstTy))
     return 0;
   if (gpu::intel::cvtIsSubGroupTranspose(srcTy, dstTy)) {
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -25,18 +25,15 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
                   ConversionPatternRewriter &rewriter) const override {
     MLIRContext *ctx = op.getContext();
 
-    auto srcTy = op.getSrc().getType();
+    RankedTensorType srcTy = op.getSrc().getType();
     auto dstTy = op.getType();
 
-    if (auto srcTensorTy = cast<RankedTensorType>(srcTy)) {
-      if (auto dstTensorTy = cast<RankedTensorType>(dstTy)) {
+    if (auto dstTensorTy = cast<RankedTensorType>(dstTy)) {
+      if (intel::isBlockIONoOpConversion(srcTy, dstTensorTy)) {
         // TODO: replace this with proper conversion once conversion is removed
         // from LoadStoreOpToLLVM.
-        if (intel::hasSubgroup2DBlockEncoding(srcTensorTy) &&
-            intel::hasDotDpasEncoding(dstTensorTy)) {
-          rewriter.replaceOp(op, op.getSrc());
-          return success();
-        }
+        rewriter.replaceOp(op, op.getSrc());
+        return success();
       }
     }
 
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp
@@ -153,6 +153,11 @@ bool isExpensiveLoadOrStore(Operation *op) {
   return false;
 }
 
+bool isBlockIONoOpConversion(RankedTensorType srcType,
+                             RankedTensorType dstType) {
+  return hasSubgroup2DBlockEncoding(srcType) && hasDotDpasEncoding(dstType);
+}
+
 bool hasSubgroup2DBlockEncoding(RankedTensorType tensorType) {
   if (!tensorType.getEncoding())
     return false;