[MLIR][Affine] Fix fusion private memref creation for multiple producer stores

bondhugula · bondhugula · commit a38b8ab4b569 · 2025-03-08T04:34:47.000+05:30
Fix private memref creation in affine fusion for the multiple producer store case. This scenario was not supported but not properly checked. Fixes: #120227
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -328,14 +328,34 @@ static std::optional<double> getAdditionalComputeFraction(
 // Creates and returns a private (single-user) memref for fused loop rooted at
 // 'forOp', with (potentially reduced) memref size based on the memref region
 // written to by `storeOps` at depth 'dstLoopDepth'. 'sliceInsertionBlock'
-// specifies the block in which the slice was/will be inserted.
+// specifies the block in which the slice was/will be inserted. The method
+// expects that all stores ops to the memref have the same access function.
+// Returns nullptr if the creation failed.
 static Value createPrivateMemRef(AffineForOp forOp,
                                  ArrayRef<Operation *> storeOps,
                                  unsigned dstLoopDepth,
                                  std::optional<unsigned> fastMemorySpace,
                                  Block *sliceInsertionBlock,
                                  uint64_t localBufSizeThreshold) {
   assert(!storeOps.empty() && "no source stores supplied");
+
+  // Check if all stores have the same access function; we only support this
+  // case.
+  // TODO: Use union of memref write regions to compute private memref footprint
+  // for store ops with different access functions.
+  if (storeOps.size() > 1 &&
+      !std::equal(std::next(storeOps.begin()), storeOps.end(), storeOps.begin(),
+                  [](Operation *a, Operation *b) {
+                    MemRefAccess aM(cast<AffineWriteOpInterface>(a));
+                    MemRefAccess bM(cast<AffineWriteOpInterface>(b));
+                    return aM == bM;
+                  })) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "Private memref creation unsupported for multiple producer "
+                  "stores with different access functions.\n");
+    return nullptr;
+  }
+
   Operation *srcStoreOp = storeOps[0];
 
   // Create builder to insert alloc op just before 'forOp'.
@@ -432,6 +452,8 @@ static Value createPrivateMemRef(AffineForOp forOp,
   assert(succeeded(res) &&
          "replaceAllMemrefUsesWith should always succeed here");
   (void)res;
+  LLVM_DEBUG(llvm::dbgs() << "Created private memref of type: " << newMemRefType
+                          << '\n');
   return newMemRef;
 }
 
@@ -1123,13 +1145,12 @@ struct GreedyFusion {
           // loads and stores. Any reference to the original ones becomes
           // invalid after this point.
           for (auto &memrefToStoresPair : privateMemRefToStores) {
-            // TODO: Use union of memref write regions to compute
-            // private memref footprint.
-            SmallVector<Operation *, 4> &storesForMemref =
-                memrefToStoresPair.second;
+            ArrayRef<Operation *> storesForMemref = memrefToStoresPair.second;
             Value newMemRef = createPrivateMemRef(
                 dstAffineForOp, storesForMemref, bestDstLoopDepth,
                 fastMemorySpace, sliceInsertionBlock, localBufSizeThreshold);
+            if (!newMemRef)
+              continue;
             // Create new node in dependence graph for 'newMemRef' alloc op.
             unsigned newMemRefNodeId = mdg->addNode(newMemRef.getDefiningOp());
             // Add edge from 'newMemRef' node to dstNode.
diff --git a/mlir/test/Dialect/Affine/loop-fusion-2.mlir b/mlir/test/Dialect/Affine/loop-fusion-2.mlir
@@ -39,16 +39,20 @@ func.func @should_fuse_at_depth_above_loop_carried_dependence(%arg0: memref<64x4
 
   // We can fuse source loop nest '%i0' into dst loop nest '%i2', but the
   // depth at which we can insert the src loop nest slice into the dst loop
-  // lest must be decreased because of a loop carried dependence on loop '%i3'.
+  // nest must be decreased because of a loop carried dependence on loop '%i3'.
   // As a result, the source loop nest is inserted at dst loop nest depth 1,
   // just above the loop with the carried dependence. In addition, the source
   // loop nest iteration bounds on its loop '%i1' are reduced to 1, so the
-  // memref size can be reduced to 128x1xf32.
+  // memref size can be reduced to 64x1xf32.
 
-  // CHECK:       memref.alloc() : memref<64x1xf32>
+  // In this case, since we have multiple producer stores (for %out) with
+  // different access functions and we don't yet support private memref
+  // computation in such cases, a 64x1 private memref isn't created.
+
+  // CHECK:       memref.alloc() : memref<64x4xf32>
   // CHECK:       affine.for %{{.*}} = 0 to 4 {
   // CHECK-NEXT:    affine.for %{{.*}} = 0 to 64 {
-  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}, 0] : memref<64x1xf32>
+  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x4xf32>
   // CHECK-NEXT:    }
   // CHECK-NEXT:    affine.for %{{.*}} = 0 to 4 {
   // CHECK-NEXT:      affine.for %{{.*}} = 0 to 16 {
@@ -62,9 +66,9 @@ func.func @should_fuse_at_depth_above_loop_carried_dependence(%arg0: memref<64x4
   // CHECK-NEXT:        }
   // CHECK-NEXT:        affine.for %{{.*}} = 0 to 16 {
   // CHECK-NEXT:          %{{.*}} = "op2"() : () -> f32
-  // CHECK:               affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
+  // CHECK:               affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<64x4xf32>
   // CHECK-NEXT:          arith.addf %{{.*}}, %{{.*}} : f32
-  // CHECK:               affine.store %{{.*}}, %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
+  // CHECK:               affine.store %{{.*}}, %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<64x4xf32>
   // CHECK-NEXT:        }
   // CHECK-NEXT:      }
   // CHECK-NEXT:    }
diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
@@ -622,3 +622,47 @@ func.func @zero_tolerance(%arg0: memref<65536xcomplex<f64>>, %arg1: memref<30x13
 }
 func.func private @__external_levelwise_forward_ntt(memref<30x131072xi64>)
 func.func private @__external_reduce_barrett(i64, i64, i64, i64, i128) -> i64
+
+// An unrolled loop nest. Fusion here should correctly fuse while preserving
+// dependences between store-load pairs of the same memref. A private memref
+// of size 1x1x1 can't be created.
+
+// PRODUCER-CONSUMER-MAXIMAL-LABEL: func @unrolled
+func.func @unrolled(%arg0: memref<2x4xf32>, %arg1: memref<1x2x4xf32>) {
+  %alloc = memref.alloc() : memref<1x2x4xf32>
+  affine.for %i = 0 to 1 {
+    %0 = affine.load %arg0[0, 0] : memref<2x4xf32>
+    %1 = affine.load %arg0[0, 1] : memref<2x4xf32>
+    %2 = affine.load %arg0[0, 2] : memref<2x4xf32>
+    %3 = affine.load %arg0[0, 3] : memref<2x4xf32>
+    %4 = affine.load %arg0[1, 0] : memref<2x4xf32>
+    %5 = affine.load %arg0[1, 1] : memref<2x4xf32>
+    %6 = affine.load %arg0[1, 2] : memref<2x4xf32>
+    %7 = affine.load %arg0[1, 3] : memref<2x4xf32>
+
+    affine.store %0, %alloc[0, 0, 0] : memref<1x2x4xf32>
+    affine.store %1, %alloc[0, 0, 1] : memref<1x2x4xf32>
+    affine.store %2, %alloc[0, 0, 2] : memref<1x2x4xf32>
+    affine.store %3, %alloc[0, 0, 3] : memref<1x2x4xf32>
+    affine.store %4, %alloc[0, 1, 0] : memref<1x2x4xf32>
+    affine.store %5, %alloc[0, 1, 1] : memref<1x2x4xf32>
+    affine.store %6, %alloc[0, 1, 2] : memref<1x2x4xf32>
+    affine.store %7, %alloc[0, 1, 3] : memref<1x2x4xf32>
+  }
+
+  affine.for %i = 0 to 2 {
+    affine.for %j = 0 to 4 {
+      %8 = affine.load %alloc[0, %i, %j] : memref<1x2x4xf32>
+      %9 = arith.negf %8 : f32
+      affine.store %9, %arg1[0, %i, %j] : memref<1x2x4xf32>
+    }
+  }
+  // PRODUCER-CONSUMER-MAXIMAL:      affine.for %{{.*}} = 0 to 2 {
+  // PRODUCER-CONSUMER-MAXIMAL-NEXT:   affine.for %{{.*}} = 0 to 4 {
+  // PRODUCER-CONSUMER-MAXIMAL-NEXT:     affine.load %{{.*}}[0, 0]
+  // PRODUCER-CONSUMER-MAXIMAL:          affine.load %{{.*}}[1, 3]
+  // PRODUCER-CONSUMER-MAXIMAL:          affine.store %{{.*}}[0, 0, 0]
+  // PRODUCER-CONSUMER-MAXIMAL:          affine.store %{{.*}}[0, 1, 3]
+  // PRODUCER-CONSUMER-MAXIMAL:          affine.load %{{.*}}[0, %{{.*}}, %{{.*}}]
+  return
+}