Skip to content

Commit a6e7434

Browse files
authored
[SWP] Dedup the code that checks if LoadOp can be converted to cpasync (#8529)
During SWP, we check twice whether a given `LoadOp` should be lowered to `AsyncCopyGlobalToLocalOp` - first in `AssignLatencies`, and then again in `LowerLoops`. The two checks duplicate non-trivial conditions such as `copyVecBytes >= 4` and `op.getResultTypes()[0].getIntOrFloatBitWidth() >= 32`. I moved the `isPipeliningBeneficial` function from `AssignLatencies` into the pipelining utilities so that it can also be used by `LowerLoops`. This will also be used by WS to determine whether a `LoadOp` should be lowered to cp.async and assigned to the load partition.
1 parent d703656 commit a6e7434

File tree

4 files changed

+56
-78
lines changed

4 files changed

+56
-78
lines changed

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,13 @@ getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,
184184

185185
// Clean up attributes passing over schedules across stages in pipelining
186186
void removePipeliningAttributes(ModuleOp moduleOp);
187+
188+
// For LoadOp, DescriptorLoad, and DescriptorGather ops, determine if
189+
// they should be pipelined.
190+
bool isPipeliningBeneficial(Operation *op,
191+
triton::ModuleAxisInfoAnalysis &axisInfoAnalysis,
192+
bool filterSmall = true);
193+
187194
} // namespace triton
188195
} // namespace mlir
189196

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -88,64 +88,6 @@ class AssignLoadLatencies {
8888
scf::ForOp forOp;
8989
int numStages;
9090
DenseMap<Operation *, int> &opLatency;
91-
92-
public:
93-
// Returns false when the load's users carry mutually incompatible dot-operand
// encodings, i.e. no single shared encoding can serve all of them.
static bool canHaveSharedEncoding(tt::LoadOp op) {
  bool hasIncompatibleUsers = false;
  // We only need the incompatibility flag; the returned encoding is ignored.
  getSharedEncIfAllUsersAreDotEnc(op.getResult(), hasIncompatibleUsers);
  return !hasIncompatibleUsers;
}
99-
100-
static bool
101-
isPipeliningBeneficial(Operation *op, Operation *finalUser,
102-
tt::ModuleAxisInfoAnalysis &axisInfoAnalysis,
103-
bool filterSmall) {
104-
if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
105-
if (filterSmall && !canBeConvertedToAsyncLoad(loadOp, axisInfoAnalysis)) {
106-
LDBG("Load " << *loadOp << " is too small for pipelining");
107-
return false;
108-
}
109-
}
110-
if (isa<tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op))
111-
return true;
112-
if (!canHaveSharedEncoding(cast<tt::LoadOp>(op))) {
113-
LDBG("Load " << *op << " cannot have shared encoding");
114-
return false;
115-
}
116-
117-
ttg::SharedEncodingTrait localAllocEnc;
118-
if (llvm::any_of(op->getUsers(), [&](Operation *user) {
119-
return isa<ttg::LocalAllocOp>(user);
120-
})) {
121-
for (auto user : op->getUsers()) {
122-
auto localAlloc = dyn_cast<ttg::LocalAllocOp>(user);
123-
if (!localAlloc)
124-
continue;
125-
auto enc = mlir::cast<ttg::SharedEncodingTrait>(
126-
localAlloc.getType().getEncoding());
127-
if (!localAllocEnc) {
128-
localAllocEnc = enc;
129-
}
130-
if (enc != localAllocEnc) {
131-
// If the load is used by a LocalAllocOp, all the users need to have
132-
// the same encoding.
133-
return false;
134-
}
135-
}
136-
}
137-
138-
if (localAllocEnc) {
139-
auto registerTy = cast<RankedTensorType>(op->getResultTypes()[0]);
140-
auto vecBytes = getCopyVecBytes(registerTy, localAllocEnc);
141-
if (filterSmall && vecBytes < 4) {
142-
// At least 4 bytes need to be consecutive for cp.async
143-
return false;
144-
}
145-
}
146-
147-
return true;
148-
}
14991
};
15092

15193
class AssignMMALatencies {
@@ -280,8 +222,7 @@ loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
280222
if (!seen.insert(op).second || excluded.count(op))
281223
return;
282224
if (isa<tt::LoadOp, tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op)) {
283-
if (!AssignLoadLatencies::isPipeliningBeneficial(
284-
op, finalUser, axisInfoAnalysis, filterSmall))
225+
if (!isPipeliningBeneficial(op, axisInfoAnalysis, filterSmall))
285226
return;
286227
if (loadOpToIndLevel.count(op)) {
287228
int level = loadOpToIndLevel[op].first;

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -453,26 +453,17 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule,
453453
continue;
454454
}
455455
SharedEncodingTrait sharedEncoding;
456-
bool canUseAsyncCp = false;
457-
if (!isa<RankedTensorType>(op.getResultTypes()[0])) {
458-
canUseAsyncCp = op.getResultTypes()[0].getIntOrFloatBitWidth() >= 32;
459-
sharedEncoding = ttg::SwizzledSharedEncodingAttr::get(
460-
forOp.getContext(), 1, 1, 1, {0},
461-
ttg::CTALayoutAttr::get(forOp.getContext(), {1}, {1}, {0}));
462-
if (canUseAsyncCp) {
456+
bool canUseAsyncCp =
457+
triton::isPipeliningBeneficial(&op, axisInfoAnalysis);
458+
if (canUseAsyncCp) {
459+
if (!isa<RankedTensorType>(op.getResultTypes()[0])) {
460+
sharedEncoding = ttg::SwizzledSharedEncodingAttr::get(
461+
forOp.getContext(), 1, 1, 1, {0},
462+
ttg::CTALayoutAttr::get(forOp.getContext(), {1}, {1}, {0}));
463463
scalarLoads.push_back(&op);
464+
} else {
465+
sharedEncoding = getSharedEncoding(&op);
464466
}
465-
} else {
466-
sharedEncoding = getSharedEncoding(&op);
467-
// Do not create async loads for small loads (cp.async requires at least
468-
// 4 bytes)
469-
canUseAsyncCp =
470-
isa<tt::LoadOp>(op) &&
471-
canBeConvertedToAsyncLoad(cast<tt::LoadOp>(op), axisInfoAnalysis);
472-
int copyVecBytes = getCopyVecBytes(
473-
cast<RankedTensorType>(op.getResultTypes()[0]), sharedEncoding);
474-
475-
canUseAsyncCp &= copyVecBytes >= 4;
476467
}
477468
if (canUseAsyncCp || isTMALoad(&op)) {
478469
if (loadRequiresAdditionalBuffer(&op)) {

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,10 @@ ttg::SharedEncodingTrait mlir::triton::getSharedEncoding(RankedTensorType ty) {
603603
}
604604

605605
ttg::SharedEncodingTrait mlir::triton::getSharedEncoding(Operation *op) {
606+
if (!isa<RankedTensorType>(op->getResultTypes()[0])) {
607+
return nullptr;
608+
}
609+
606610
// Try to use local alloc encoding if possible.
607611
ttg::SharedEncodingTrait localAllocEnc;
608612
if (llvm::any_of(op->getUsers(), [&](Operation *user) {
@@ -933,3 +937,38 @@ void triton::removePipeliningAttributes(ModuleOp moduleOp) {
933937
op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);
934938
});
935939
}
940+
941+
// A load can be given a shared encoding unless users with dot-operand
// encodings disagree with each other.
static bool canHaveSharedEncoding(tt::LoadOp op) {
  bool incompatibleUse = false;
  // Only the flag matters here; discard the computed encoding.
  getSharedEncIfAllUsersAreDotEnc(op.getResult(), incompatibleUse);
  return !incompatibleUse;
}
947+
948+
// Shared SWP/WS helper: decide whether a tt.load / descriptor load / descriptor
// gather should be pipelined (and, for tt.load, lowered to cp.async). With
// `filterSmall` set, loads that cannot sustain the 4-byte cp.async minimum are
// rejected.
bool triton::isPipeliningBeneficial(
    Operation *op, tt::ModuleAxisInfoAnalysis &axisInfoAnalysis,
    bool filterSmall) {
  // A plain load must first qualify for conversion to an async copy.
  if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
    if (filterSmall && !canBeConvertedToAsyncLoad(loadOp, axisInfoAnalysis)) {
      LDBG("Load " << *loadOp << " is too small for pipelining");
      return false;
    }
  }
  // Descriptor loads and gathers are always worth pipelining.
  if (isa<tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op))
    return true;
  // Remaining case is tt.load: its dot users must admit one shared encoding.
  if (!canHaveSharedEncoding(cast<tt::LoadOp>(op))) {
    LDBG("Load " << *op << " cannot have shared encoding");
    return false;
  }

  // If a shared encoding can be derived (null for non-tensor results), make
  // sure the copy is wide enough for cp.async.
  ttg::SharedEncodingTrait sharedEnc = getSharedEncoding(op);
  if (sharedEnc) {
    auto regTy = cast<RankedTensorType>(op->getResultTypes()[0]);
    auto vecBytes = mlir::triton::getCopyVecBytes(regTy, sharedEnc);
    // At least 4 bytes need to be consecutive for cp.async
    if (filterSmall && vecBytes < 4)
      return false;
  }

  return true;
}

0 commit comments

Comments
 (0)