
Commit 60d3443

Merge branch 'xurui/benchgc_tuner' of https://github.com/xurui1995/graph-compiler into xurui/benchgc_tuner

2 parents: 1f237f9 + faf3e76

10 files changed: +486 −62

lib/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmOnednn.cpp

Lines changed: 47 additions & 33 deletions
@@ -53,15 +53,24 @@ using read_lock_guard_t = std::shared_lock<std::shared_mutex>;
 using write_lock_guard_t = std::unique_lock<std::shared_mutex>;
 static std::shared_mutex g_brgemm_lock;
 
-static std::vector<brgemm_desc_t> g_brgemm_desc_list;
-static std::vector<brgemm_kernel_t *> g_brgemm_kernel_list;
-static std::vector<std::unique_ptr<char[]>> g_brgemm_palette;
+struct brgemm_cache_info_t {
+  brgemm_desc_t desc;
+  brgemm_kernel_t *kernel;
+  std::shared_ptr<char[]> palette;
+};
+
+static std::vector<brgemm_cache_info_t> g_cache;
 
 // TODO(haixin): use syscall to determine page size?
 static constexpr size_t SCRATCH_SIZE = 2 * 4096;
 // TODO(haixin): need to use custom thread management for scratch in the future?
 static thread_local char scratch[SCRATCH_SIZE] = {0};
 
+static std::unordered_map<int64_t, brgemm_cache_info_t> &get_tl_cache() {
+  thread_local std::unordered_map<int64_t, brgemm_cache_info_t> tl_cache;
+  return tl_cache;
+}
+
 extern "C" {
 
 int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
@@ -93,33 +102,33 @@ int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
   brgemm_desc_set_attr(&desc, dnnl_attrs);
 
   // TODO(haixin): Reuse identical palettes across kernels
-  char *palette_buffer = nullptr;
+  std::shared_ptr<char[]> palette_buffer;
   if (desc.is_tmm) {
-    palette_buffer = new char[PALETTE_SIZE];
-    dnnl::impl::status_t status = brgemm_init_tiles(desc, palette_buffer);
+    palette_buffer.reset(new char[PALETTE_SIZE]);
+    dnnl::impl::status_t status = brgemm_init_tiles(desc, palette_buffer.get());
     assert(status == dnnl::impl::status::success &&
            "Failed to initialize palette for BRGEMM");
   }
 
   write_lock_guard_t g(g_brgemm_lock);
-  g_brgemm_desc_list.push_back(desc);
-  g_brgemm_kernel_list.push_back(kernel);
-  g_brgemm_palette.emplace_back(palette_buffer);
-
-  return g_brgemm_desc_list.size() - 1;
+  g_cache.push_back(brgemm_cache_info_t{desc, kernel, palette_buffer});
+  return g_cache.size() - 1;
 }
 
 void dnnl_brgemm_tileconfig(int64_t kernel_idx) {
-  char *palette_buffer = nullptr;
-  {
+  assert(kernel_idx >= 0 && "Invalid kernel handler");
+  auto &tl_cache = get_tl_cache();
+  auto it = tl_cache.find(kernel_idx);
+  if (it == tl_cache.end()) {
     read_lock_guard_t g(g_brgemm_lock);
-    assert(kernel_idx >= 0 && kernel_idx < (int64_t)g_brgemm_desc_list.size() &&
-           "Invalid kernel handler");
-    brgemm_desc_t &desc = g_brgemm_desc_list[kernel_idx];
-    if (!desc.is_tmm) {
-      return;
-    }
-    palette_buffer = g_brgemm_palette[kernel_idx].get();
+    assert(kernel_idx < (int64_t)g_cache.size() && "Invalid kernel handler");
+    it = tl_cache.insert({kernel_idx, g_cache[kernel_idx]}).first;
+  }
+  brgemm_desc_t &desc = it->second.desc;
+  char *palette_buffer = it->second.palette.get();
+
+  if (!desc.is_tmm) {
+    return;
   }
 
   assert(palette_buffer != nullptr && "Invalid palette for BRGEMM kernel");
@@ -137,24 +146,29 @@ void dnnl_brgemm_tilerelease() {
 void dnnl_brgemm_execute(int64_t kernel_idx, void *A, uint64_t A_offset,
                          void *B, uint64_t B_offset, void *C, uint64_t C_offset,
                          int num) {
-  brgemm_kernel_t *kernel = nullptr;
-  size_t A_offset_in_bytes;
-  size_t B_offset_in_bytes;
-  size_t C_offset_in_bytes;
-  {
+  auto &tl_cache = get_tl_cache();
+  if (tl_cache.find(kernel_idx) == tl_cache.end()) {
    read_lock_guard_t g(g_brgemm_lock);
-    assert(kernel_idx >= 0 && kernel_idx < (int64_t)g_brgemm_desc_list.size() &&
+    assert(kernel_idx >= 0 && kernel_idx < (int64_t)g_cache.size() &&
           "Invalid kernel handler");
-
-    brgemm_desc_t &desc = g_brgemm_desc_list[kernel_idx];
-    kernel = g_brgemm_kernel_list[kernel_idx];
-
-    A_offset_in_bytes = dnnl::impl::types::data_type_size(desc.dt_a) * A_offset;
-    B_offset_in_bytes = dnnl::impl::types::data_type_size(desc.dt_b) * B_offset;
-    C_offset_in_bytes = dnnl::impl::types::data_type_size(desc.dt_c) * C_offset;
+    auto updated_cache =
+        tl_cache.insert(std::make_pair(kernel_idx, g_cache[kernel_idx]));
+    assert(updated_cache.second && "insert into thread local cache");
   }
+  auto it = tl_cache.find(kernel_idx);
+  brgemm_kernel_t *kernel = it->second.kernel;
+  brgemm_desc_t *desc_ptr = &it->second.desc;
 
   assert(kernel && "Invalid brgemm kernel pointer");
+  assert(desc_ptr && "Invalid brgemm descriptor pointer");
+
+  size_t A_offset_in_bytes =
+      dnnl::impl::types::data_type_size(desc_ptr->dt_a) * A_offset;
+  size_t B_offset_in_bytes =
+      dnnl::impl::types::data_type_size(desc_ptr->dt_b) * B_offset;
+  size_t C_offset_in_bytes =
+      dnnl::impl::types::data_type_size(desc_ptr->dt_c) * C_offset;
+
   char *A_arith = (char *)A;
   char *B_arith = (char *)B;
   char *C_arith = (char *)C;
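The net effect of this change is that `dnnl_brgemm_tileconfig` and `dnnl_brgemm_execute` no longer take the shared lock on every call: each thread copies the `brgemm_cache_info_t` entry into a thread-local map on first use and reads its private copy afterwards. Below is a minimal standalone sketch of the same pattern; `entry_t` and `lookup()` are hypothetical stand-ins, not the runtime's actual API.

// Sketch: thread-local read cache over a lock-protected global registry.
#include <cstdint>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

struct entry_t {
  int payload;
};

static std::shared_mutex g_lock;
static std::vector<entry_t> g_entries; // appended under a unique_lock

static entry_t &lookup(int64_t idx) {
  thread_local std::unordered_map<int64_t, entry_t> tl_cache;
  auto it = tl_cache.find(idx);
  if (it == tl_cache.end()) {
    // slow path: first access on this thread copies the shared entry
    std::shared_lock<std::shared_mutex> guard(g_lock);
    it = tl_cache.insert({idx, g_entries[idx]}).first;
  }
  return it->second; // fast path: no locking, no sharing
}

int main() {
  {
    std::unique_lock<std::shared_mutex> guard(g_lock);
    g_entries.push_back({42}); // registration, like dnnl_brgemm_dispatch
  }
  return lookup(0).payload == 42 ? 0 : 1;
}

Copying entries per thread is also why the palette moves from `std::unique_ptr<char[]>` to `std::shared_ptr<char[]>`: every thread-local copy shares ownership of the one palette buffer, so the copies stay cheap and the buffer outlives any of them.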

lib/gc/Transforms/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -13,8 +13,8 @@ gc_add_mlir_library(GcPasses
   MemRefToCPURuntime.cpp
   OneDNNGraphToLinalg.cpp
   Pipeline.cpp
+  TileUsingInterfaceX.cpp
   IterativeTilingAndFusion.cpp
-  TilingUsingInterfaceX.cpp
   VerifyTargetDescription.cpp
   DecomposeAggregatedOps.cpp
   DeepTileContractionOp.cpp

lib/gc/Transforms/IterativeTilingAndFusion.cpp

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
 #include <memory>
 #include <unordered_map>
 
-#include "TilingUsingInterfaceX.h"
+#include "TileUsingInterfaceX.h"
 
 namespace mlir {
 namespace gc {

lib/gc/Transforms/MergeAllocTickBased.cpp

Lines changed: 208 additions & 15 deletions
@@ -30,6 +30,65 @@ using namespace special_ticks;
 /// and default memory space.
 static bool isMemRefTypeOk(MemRefType type) { return type.hasStaticShape(); }
 
+static inline int64_t getSizeInBytes(MemRefType &memType) {
+  // treat bool (i1) as 1 byte. It may not be true for all targets, but we at
+  // least have a large enough size for i1
+  int64_t size = memType.getElementTypeBitWidth() / 8;
+  size = (size > 0) ? size : 1;
+  for (auto v : memType.getShape()) {
+    size *= v;
+  }
+  return size;
+}
+
+static bool needsHoistOutOfParallelLoop(Operation *op) {
+  Operation *parent =
+      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+  if (isa_and_nonnull<scf::ForallOp>(parent)) {
+    // check if the current allocation sits between the nested parallel loops
+    // and is used inside the inner parallel loop
+    SmallVector<Operation *, 4> parallelOpInCurBlock;
+    Block *curBlock = op->getBlock();
+    for (auto &curOp : curBlock->getOperations()) {
+      if (isa<scf::ForallOp>(curOp)) {
+        parallelOpInCurBlock.push_back(&curOp);
+      }
+    }
+
+    if (parallelOpInCurBlock.empty())
+      return false;
+
+    for (auto *use : op->getUsers()) {
+      for (auto *parallelOp : parallelOpInCurBlock) {
+        if (parallelOp->isAncestor(use)) {
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+static bool isForallLoopBoundStatic(Operation *op) {
+  auto forallOp = dyn_cast<scf::ForallOp>(op);
+  if (!forallOp)
+    return false;
+
+  auto lbs = forallOp.getMixedLowerBound();
+  auto ubs = forallOp.getMixedUpperBound();
+  auto steps = forallOp.getMixedStep();
+  auto allConstantValue = [](SmallVector<OpFoldResult> vals) -> bool {
+    return llvm::all_of(vals, [](OpFoldResult val) {
+      std::optional<int64_t> const_val = getConstantIntValue(val);
+      return const_val.has_value();
+    });
+  };
+
+  return allConstantValue(lbs) && allConstantValue(ubs) &&
+         allConstantValue(steps);
+}
+
 void Tick::update(int64_t tick) {
   if (tick == UNTRACEABLE_ACCESS) {
     firstAccess = UNTRACEABLE_ACCESS;
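`isForallLoopBoundStatic` is what later lets the pass fold loop trip counts into compile-time constants. A small sketch of that arithmetic follows; `tripCount` is a hypothetical stand-in that mirrors the math of MLIR's `constantTripCount`, i.e. ceildiv(ub − lb, step) when all three bounds are constant.

// Sketch of the trip-count math used once all forall bounds are static.
#include <cassert>
#include <cstdint>
#include <optional>

static std::optional<int64_t> tripCount(int64_t lb, int64_t ub, int64_t step) {
  if (step <= 0)
    return std::nullopt; // non-positive step: not a countable loop
  return (ub - lb + step - 1) / step; // ceil((ub - lb) / step)
}

int main() {
  assert(tripCount(0, 8, 1) == 8);  // iterations 0..7
  assert(tripCount(0, 7, 2) == 4);  // iterations 0, 2, 4, 6
  assert(tripCount(2, 10, 4) == 2); // iterations 2, 6
  return 0;
}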
@@ -180,28 +239,60 @@ bool TickCollecter::isMergeableAlloc(TickCollecterStates *s, Operation *op,
 // trait, and is not scf.for
 Operation *TickCollecter::getAllocScope(TickCollecterStates *s,
                                         Operation *op) const {
-  auto parent = op;
+  Operation *parent = op;
+  bool moveToUpperParellelLoop = needsHoistOutOfParallelLoop(op);
+
   for (;;) {
     parent = parent->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
     if (!parent) {
       return nullptr;
     }
-    if (!isa<scf::ForOp>(parent)) {
-      return parent;
-    }
+
+    if (isa<scf::ForOp>(parent))
+      continue;
+
+    if (isa<scf::ForallOp>(parent) &&
+        (moveToUpperParellelLoop && isForallLoopBoundStatic(parent)))
+      continue;
+
+    return parent;
   }
 }
 
 FailureOr<size_t> TickCollecter::getAllocSize(TickCollecterStates *s,
                                               Operation *op) const {
   auto refType = cast<MemRefType>(op->getResultTypes().front());
-  int64_t size = refType.getElementTypeBitWidth() / 8;
-  // treat bool (i1) as 1 byte. It may not be true for all targets, but we at
-  // least have a large enough size for i1
-  size = (size != 0) ? size : 1;
-  for (auto v : refType.getShape()) {
-    size *= v;
+
+  // Get the total number of threads from the outermost to the current level of
+  // the parallel loop that the allocation is located in.
+  int64_t numThreads = 1;
+  if (needsHoistOutOfParallelLoop(op)) {
+    Operation *parent =
+        op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+    while (auto forallOp = dyn_cast<scf::ForallOp>(parent)) {
+      if (!isForallLoopBoundStatic(forallOp))
+        break;
+
+      OpBuilder builder{forallOp->getContext()};
+      std::optional<int64_t> numIterations;
+      for (auto [lb, ub, step] : llvm::zip(forallOp.getLowerBound(builder),
+                                           forallOp.getUpperBound(builder),
+                                           forallOp.getStep(builder))) {
+        numIterations = constantTripCount(lb, ub, step);
+        if (numIterations.has_value()) {
+          numThreads *= numIterations.value();
+        } else {
+          return op->emitError("Expecting static loop range!");
+        }
+      }
+
+      parent = parent->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+    }
   }
+  assert(numThreads > 0);
+
+  int64_t size = getSizeInBytes(refType);
+  size *= numThreads;
   if (size > 0) {
     return static_cast<size_t>(size);
   }
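When an allocation is hoisted, the merged buffer must hold one copy per thread of the enclosing static forall nest, so `getAllocSize` scales the single-buffer size by the product of all trip counts. A worked example with hypothetical shape and bounds:

// Worked example: an 8x32xf32 buffer hoisted out of a nest
// scf.forall (0..4) { scf.forall (0..2) { ... } } (hypothetical bounds).
#include <cassert>
#include <cstdint>

int main() {
  int64_t elemBytes = 32 / 8;            // f32 element -> 4 bytes
  int64_t bufBytes = elemBytes * 8 * 32; // getSizeInBytes: 1024 per thread
  int64_t numThreads = 4 * 2;            // product of the static trip counts
  assert(bufBytes * numThreads == 8192); // bytes reserved in the merged slab
  return 0;
}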
@@ -391,11 +482,113 @@ Value MergeAllocDefaultMutator::buildView(OpBuilder &builder, Block *scope,
                                           Value mergedAlloc,
                                           int64_t byteOffset) const {
   builder.setInsertionPoint(origAllocOp);
-  auto byteShift =
-      builder.create<arith::ConstantIndexOp>(origAllocOp->getLoc(), byteOffset);
-  return builder.create<memref::ViewOp>(origAllocOp->getLoc(),
-                                        origAllocOp->getResultTypes().front(),
-                                        mergedAlloc, byteShift, ValueRange{});
+  auto loc = origAllocOp->getLoc();
+  auto byteShift = builder.create<arith::ConstantIndexOp>(loc, byteOffset);
+
+  bool moveToUpperParellelLoop = needsHoistOutOfParallelLoop(origAllocOp);
+  Operation *parent =
+      origAllocOp->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+  if (!moveToUpperParellelLoop || !parent || !isa<scf::ForallOp>(parent))
+    return builder.create<memref::ViewOp>(loc,
+                                          origAllocOp->getResultTypes().front(),
+                                          mergedAlloc, byteShift, ValueRange{});
+
+  // get the aggregated induction var
+  Value inductVar;
+  bool isOuterMostLoop = true;
+  int64_t innerLoopUpperBound = 1;
+  while (parent) {
+    if (auto forallOp = dyn_cast<scf::ForallOp>(parent)) {
+      if (isForallLoopBoundStatic(forallOp)) {
+        SmallVector<Value> ubs = forallOp.getUpperBound(builder);
+        SmallVector<Value> lbs = forallOp.getLowerBound(builder);
+        SmallVector<Value> steps = forallOp.getStep(builder);
+        SmallVector<Value> inductionVars = forallOp.getInductionVars();
+
+        auto getCurrentVar = [&loc, &builder](Value var, Value lb,
+                                              Value step) -> Value {
+          if (!isConstantIntValue(lb, 0))
+            var = builder.create<arith::SubIOp>(loc, var, lb);
+
+          if (!isConstantIntValue(step, 1))
+            var = builder.create<arith::DivSIOp>(loc, var, step);
+          return var;
+        };
+
+        auto getAggregatedVar =
+            [&loc, &builder, &getCurrentVar](
+                const SmallVector<Value> &_lbs, const SmallVector<Value> &_ubs,
+                const SmallVector<Value> &_steps,
+                const SmallVector<Value> &_inductVars) -> Value {
+          Value var;
+          if (_ubs.size() == 1) {
+            var = getCurrentVar(_inductVars[0], _lbs[0], _steps[0]);
+            return var;
+          } else {
+            bool isFirstLoop = true;
+            for (auto [lb, ub, step, inductVar] :
+                 llvm::zip(_lbs, _ubs, _steps, _inductVars)) {
+              if (isFirstLoop) {
+                var = getCurrentVar(inductVar, lb, step);
+                isFirstLoop = false;
+              } else {
+                Value cur_var = getCurrentVar(inductVar, lb, step);
+                std::optional<int64_t> bound = constantTripCount(lb, ub, step);
+                assert(bound.has_value());
+                Value boundVal =
+                    builder.create<arith::ConstantIndexOp>(loc, bound.value());
+                Value tmpVal =
+                    builder.create<arith::MulIOp>(loc, var, boundVal);
+                var = builder.create<arith::AddIOp>(loc, tmpVal, cur_var);
+              }
+            }
+            return var;
+          }
+        };
+
+        if (isOuterMostLoop) {
+          inductVar = getAggregatedVar(lbs, ubs, steps, inductionVars);
+          isOuterMostLoop = false;
+        } else {
+          Value currentVar = getAggregatedVar(lbs, ubs, steps, inductionVars);
+
+          Value innerLoopBoundVal =
+              builder.create<arith::ConstantIndexOp>(loc, innerLoopUpperBound);
+          Value intermediateVal =
+              builder.create<arith::MulIOp>(loc, currentVar, innerLoopBoundVal);
+          inductVar =
+              builder.create<arith::AddIOp>(loc, inductVar, intermediateVal);
+        }
+        // get aggregated loop bound
+        for (auto [lb, ub, step] : llvm::zip(lbs, ubs, steps)) {
+          std::optional<int64_t> cur_bound = constantTripCount(lb, ub, step);
+          assert(cur_bound.has_value());
+          innerLoopUpperBound *= cur_bound.value();
+        }
+      }
+    }
+
+    parent = parent->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+  }
+
+  if (!isOuterMostLoop) {
+    // get original shape size
+    auto memType = cast<MemRefType>(origAllocOp->getResultTypes().front());
+    int64_t size = getSizeInBytes(memType);
+    Value origSize = builder.create<arith::ConstantIndexOp>(loc, size);
+    Value offsetPerThread =
+        builder.create<arith::MulIOp>(loc, inductVar, origSize);
+    Value byteShiftPerThread =
+        builder.create<arith::AddIOp>(loc, byteShift, offsetPerThread);
+
+    return builder.create<memref::ViewOp>(
+        loc, origAllocOp->getResultTypes().front(), mergedAlloc,
+        byteShiftPerThread, ValueRange{});
+  } else {
+    return builder.create<memref::ViewOp>(loc,
+                                          origAllocOp->getResultTypes().front(),
+                                          mergedAlloc, byteShift, ValueRange{});
+  }
 }
 
 LogicalResult
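`buildView` then gives each thread a disjoint slice of that enlarged region: the induction variables of the nest are linearized into a single thread id, scaled by the original buffer size, and added to the buffer's static offset in the merged slab. A worked example of that offset arithmetic, reusing the same hypothetical 4x2 nest:

// Worked example of the per-thread view offset computed in buildView:
// outer forall i in [0, 4), inner forall j in [0, 2) (hypothetical bounds).
#include <cassert>
#include <cstdint>

int main() {
  int64_t bufBytes = 1024; // size of one per-thread copy (getSizeInBytes)
  int64_t byteShift = 512; // static offset of this alloc in the merged slab
  for (int64_t i = 0; i < 4; ++i) {
    for (int64_t j = 0; j < 2; ++j) {
      // aggregation visits the inner loop first, then adds the outer var
      // scaled by the inner nest's total trip count (innerLoopUpperBound == 2)
      int64_t inductVar = j + i * 2;
      int64_t offset = byteShift + inductVar * bufBytes;
      assert(offset == byteShift + (i * 2 + j) * 1024); // disjoint slices
    }
  }
  return 0;
}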
