From 01074c10b014146ef4d0a24647cc6a37641de0a3 Mon Sep 17 00:00:00 2001
From: Arnab Dutta
Date: Wed, 31 Jul 2024 11:59:34 +0530
Subject: [PATCH] Fix bug in `fold-memref-alias-ops` pass

Pass the dimensional and symbolic operands to the linearized access map
of the source memref of the memref.expand_shape op in the correct
order: the load op indices (dimension operands) first, followed by the
suffix products (symbol operands).
---
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  |  4 +-
 .../Dialect/MemRef/fold-memref-alias-ops.mlir | 37 +++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 8e927a60087fc..930c5d47839ff 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -126,11 +126,11 @@ resolveSourceIndicesExpandShape(Location loc, PatternRewriter &rewriter,
   for (int64_t i = 0; i < groupSize; i++)
     dynamicIndices[i] = indices[groups[i]];
 
-  // Supply suffix product results followed by load op indices as operands
+  // Supply load op indices as operands followed by suffix product results
   // to the map.
   SmallVector<Value> mapOperands;
-  llvm::append_range(mapOperands, suffixProduct);
   llvm::append_range(mapOperands, dynamicIndices);
+  llvm::append_range(mapOperands, suffixProduct);
 
   // Creating maximally folded and composed affine.apply composes better
   // with other transformations without interleaving canonicalization
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index 327cacf7d9a20..e52dd15b0fdbb 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -1031,3 +1031,40 @@ func.func @fold_vector_maskedstore_collapse_shape(
 // CHECK: %[[IDX:.*]] = affine.apply #[[$MAP]]()[%[[ARG1]]]
 // CHECK: %[[IDX1:.*]] = affine.apply #[[$MAP1]]()[%[[ARG1]]]
 // CHECK: vector.maskedstore %[[ARG0]][%[[IDX]], %[[IDX1]]], %[[ARG3]], %[[ARG4]]
+// -----
+
+// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0, d1)[s0] -> (d0 mod s0)>
+// CHECK-LABEL: fold_expand_shape_dynamic_dim
+func.func @fold_expand_shape_dynamic_dim(%arg0: i64, %arg1: memref<*xf16>) {
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  %cast = memref.cast %arg1 : memref<*xf16> to memref<1x8x?x128xf16>
+  // CHECK: %[[CAST:.*]] = memref.cast
+  %dim = memref.dim %cast, %c2 : memref<1x8x?x128xf16>
+  // CHECK: %[[DIM:.*]] = memref.dim %[[CAST]], %[[C2]]
+  %dim_0 = memref.dim %cast, %c2 : memref<1x8x?x128xf16>
+  %expand_shape = memref.expand_shape %cast [[0], [1], [2, 3], [4]] output_shape [1, 8, 1, %dim_0, 128] : memref<1x8x?x128xf16> into memref<1x8x1x?x128xf16>
+  // CHECK-NOT: memref.expand_shape
+  %0 = arith.index_cast %arg0 : i64 to index
+  // CHECK: %[[IDX:.*]] = arith.index_cast
+  %alloc = memref.alloc(%0) {alignment = 64 : i64} : memref<1x8x4x?x128xf16>
+  affine.for %arg2 = 0 to 8 {
+    // CHECK: affine.for %[[ARG2:.*]] = 0 to 8
+    affine.for %arg3 = 0 to 4 {
+      // CHECK-NEXT: affine.for %[[ARG3:.*]] = 0 to 4
+      affine.for %arg4 = 0 to %0 {
+        // CHECK-NEXT: affine.for %[[ARG4:.*]] = 0 to %[[IDX]]
+        affine.for %arg5 = 0 to 128 {
+          // CHECK-NEXT: affine.for %[[ARG5:.*]] = 0 to 128
+          // CHECK: %[[DIM_0:.*]] = memref.dim %[[CAST]], %[[C2]] : memref<1x8x?x128xf16>
+          // CHECK: %[[APPLY_RES:.*]] = affine.apply #[[$MAP]](%[[ARG4]]
+          %2 = affine.load %expand_shape[0, %arg2, 0, %arg4 mod symbol(%dim), %arg5] : memref<1x8x1x?x128xf16>
+          // CHECK: memref.load %[[CAST]][%[[C0]], %[[ARG2]], %[[APPLY_RES]], %[[ARG5]]] : memref<1x8x?x128xf16>
+          affine.store %2, %alloc[0, %arg2, %arg3, %arg4, %arg5] : memref<1x8x4x?x128xf16>
+        }
+      }
+    }
+  }
+  return
+}
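
For context: AffineMap operands (and hence the operands of the
affine.apply built by resolveSourceIndicesExpandShape) bind
positionally, dimension operands first and symbol operands second. The
linearization map uses the load op indices as dims and the suffix
products as symbols, so appending suffixProduct before dynamicIndices
bound each list to the wrong role. Below is a minimal standalone sketch
of that binding convention, assuming only the public AffineMap API; it
is illustrative and not part of the patch.

// A minimal sketch of AffineMap's positional operand binding; the
// values are illustrative.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"

int main() {
  mlir::MLIRContext ctx;
  mlir::Builder b(&ctx);

  // A linearization-style map like the one the pass composes:
  // (d0)[s0] -> (d0 mod s0).
  mlir::AffineMap map = mlir::AffineMap::get(
      /*dimCount=*/1, /*symbolCount=*/1,
      mlir::getAffineDimExpr(0, &ctx) % mlir::getAffineSymbolExpr(0, &ctx));

  // Operands bind dims first, then symbols: {loadIndex, suffixProduct}
  // binds d0 = 7 and s0 = 4, folding to 7 mod 4 = 3.
  llvm::SmallVector<mlir::Attribute> results;
  if (map.constantFold({b.getIndexAttr(7), b.getIndexAttr(4)}, results)
          .succeeded())
    llvm::outs() << results[0] << "\n";
  return 0;
}

With the pre-fix operand order, {suffixProduct, loadIndex}, the same
fold binds d0 = 4 and s0 = 7 and yields 4 mod 7 = 4, i.e. a wrong
linearized index; that is the mis-indexing the new
fold_expand_shape_dynamic_dim test guards against.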