diff --git a/.github/workflows/tpp-benchmark.yml b/.github/workflows/tpp-benchmark.yml
index 346e24434..3d6f81e4b 100644
--- a/.github/workflows/tpp-benchmark.yml
+++ b/.github/workflows/tpp-benchmark.yml
@@ -11,10 +11,6 @@ on:
         description: "Run on Zen5"
         type: boolean
         default: true
-      RUN_CLX_BENCH:
-        description: "Run on CLX"
-        type: boolean
-        default: true
       RUN_ARL_BENCH:
         description: "Run on ARL"
         type: boolean
@@ -100,36 +96,6 @@ jobs:
           ${{ github.workspace }}/scripts/github/benchmark.sh -o"
           ${{ env.SRUN }} --partition=zen5 --time=0:30:00 -- $CMD
 
-  TPP-MLIR-CLX-BASE:
-    runs-on: pcl-tiergarten
-    if: |
-      (github.event_name == 'push') ||
-      (github.event_name == 'workflow_dispatch' && inputs.RUN_CLX_BENCH) ||
-      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark-full'))
-    needs: Check_LLVM
-    steps:
-      - uses: actions/checkout@v4
-      - name: CLX Base
-        run: |-
-          CMD="KIND=Release COMPILER=clang LINKER=lld BENCHMARK_NUM_ITER=${{ env.NUM_ITER }} \
-          ${{ github.workspace }}/scripts/github/benchmark.sh -b -p"
-          ${{ env.SRUN }} --partition=clxap --time=0:30:00 -- $CMD
-
-  TPP-MLIR-CLX-OMP:
-    runs-on: pcl-tiergarten
-    if: |
-      (github.event_name == 'push') ||
-      (github.event_name == 'workflow_dispatch' && inputs.RUN_CLX_BENCH) ||
-      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark-full'))
-    needs: Check_LLVM
-    steps:
-      - uses: actions/checkout@v4
-      - name: CLX OpenMP
-        run: |-
-          CMD="KIND=Release COMPILER=clang LINKER=lld BENCHMARK_NUM_ITER=${{ env.NUM_ITER }} \
-          ${{ github.workspace }}/scripts/github/benchmark.sh -o"
-          ${{ env.SRUN }} --partition=clxap --time=0:30:00 -- $CMD
-
   TPP-MLIR-ARL-BASE:
     runs-on: pcl-tiergarten
     if: |
diff --git a/.github/workflows/tpp-llvm.yml b/.github/workflows/tpp-llvm.yml
index 533a37887..6233dbd47 100644
--- a/.github/workflows/tpp-llvm.yml
+++ b/.github/workflows/tpp-llvm.yml
@@ -27,6 +27,6 @@ jobs:
       - name: LLVM CUDA
         run: |-
           GPU=cuda scripts/github/check_llvm.sh || \
-          ${{ env.SRUN }} --partition=a100,v100 --time=0:30:00 -- \
+          ${{ env.SRUN }} --partition=a100 --time=0:30:00 -- \
           'KIND=RelWithDebInfo COMPILER=clang GPU=cuda \
           ${{ github.workspace }}/scripts/github/build_llvm.sh'
diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt
index 440231c3d..92ed0d99f 100644
--- a/build_tools/llvm_version.txt
+++ b/build_tools/llvm_version.txt
@@ -1 +1 @@
-eb6da944af31dd684be3ab2f93f453a3837a72c6
+8eba28bc8ce9447d09edda6fc79e2191a1669252
diff --git a/lib/TPP/CMakeLists.txt b/lib/TPP/CMakeLists.txt
index 31b6d8a0b..a40cc7c59 100644
--- a/lib/TPP/CMakeLists.txt
+++ b/lib/TPP/CMakeLists.txt
@@ -23,8 +23,8 @@ add_mlir_library(TPPPipeline
   LINK_LIBS PUBLIC
     MLIRIR
-    ${mlir_dialect_libs}
-    ${conversion_libs}
+    MLIRRegisterAllDialects
+    MLIRRegisterAllPasses
     TPPGPU
     TPPPassBundles
   )
diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp
index c1cb2c3a8..7cc145654 100644
--- a/lib/TPP/DefaultPipeline.cpp
+++ b/lib/TPP/DefaultPipeline.cpp
@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "TPP/PassBundles.h"
-
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/InitAllDialects.h"
 #include "mlir/InitAllPasses.h"
@@ -15,15 +13,25 @@
 #include "mlir/Pass/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Async/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Transforms/Passes.h"
+
 #include "TPP/Dialect/Check/BufferizableOpInterfaceImpl.h"
 #include "TPP/Dialect/Check/CheckDialect.h"
 #include "TPP/Dialect/Perf/BufferizableOpInterfaceImpl.h"
 #include "TPP/Dialect/Perf/PerfDialect.h"
 #include "TPP/Dialect/Perf/PerfOps.h"
 #include "TPP/Dialect/Xsmm/XsmmDialect.h"
+#include "TPP/PassBundles.h"
 #include "TPP/PassUtils.h"
 #include "TPP/Transforms/Utils/VNNIUtils.h"
-#include "mlir/Transforms/Passes.h"
 
 #include <string>
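Note on the two build changes above: the CMake lists stop globbing `${mlir_dialect_libs}`/`${conversion_libs}` and instead link the umbrella registration libraries that this LLVM version provides. A minimal sketch of what those libraries back, using only the `InitAllDialects.h`/`InitAllPasses.h` entry points that DefaultPipeline.cpp already includes:

```cpp
// Sketch: the registration calls enabled by linking
// MLIRRegisterAllDialects and MLIRRegisterAllPasses.
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/InitAllDialects.h"
#include "mlir/InitAllPasses.h"

int main() {
  mlir::DialectRegistry registry;
  mlir::registerAllDialects(registry); // from MLIRRegisterAllDialects
  mlir::registerAllPasses();           // from MLIRRegisterAllPasses
  mlir::MLIRContext context(registry);
  return 0;
}
```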
"mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Async/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/Passes.h" + #include "TPP/Dialect/Check/BufferizableOpInterfaceImpl.h" #include "TPP/Dialect/Check/CheckDialect.h" #include "TPP/Dialect/Perf/BufferizableOpInterfaceImpl.h" #include "TPP/Dialect/Perf/PerfDialect.h" #include "TPP/Dialect/Perf/PerfOps.h" #include "TPP/Dialect/Xsmm/XsmmDialect.h" +#include "TPP/PassBundles.h" #include "TPP/PassUtils.h" #include "TPP/Transforms/Utils/VNNIUtils.h" -#include "mlir/Transforms/Passes.h" #include diff --git a/lib/TPP/GPU/GpuPipeline.cpp b/lib/TPP/GPU/GpuPipeline.cpp index 4a0118fc3..5d7b92581 100644 --- a/lib/TPP/GPU/GpuPipeline.cpp +++ b/lib/TPP/GPU/GpuPipeline.cpp @@ -20,6 +20,7 @@ #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" diff --git a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp index a5fb6f68d..3722f73d8 100644 --- a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp +++ b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp @@ -459,14 +459,15 @@ namespace { static SmallVector getDefaultBlockingFactors(linalg::LinalgOp linalgOp) { assert(linalgOp && "expect a valid linalgOp"); - if (isa(linalgOp) || - isa(linalgOp)) { + auto *op = linalgOp.getOperation(); + if (isa(op) || + isa(op)) { return {32, 32}; } - assert(isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp)); + assert(isa(op) || + isa(op) || + isa(op) || + isa(op)); return {32, 32, 32}; } @@ -492,12 +493,13 @@ struct PackMatmul : public tpp::impl::PackMatmulBase { auto packControlFn = [&](linalg::LinalgOp linalgOp) -> std::optional { linalg::BlockPackMatmulOptions options; + auto *op = linalgOp.getOperation(); // Pack only these named matmul variants. - if (!(isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp))) { + if (!(isa(op) || + isa(op) || + isa(op) || + isa(op))) { return std::nullopt; } diff --git a/scripts/ci/setup_gpu_env.sh b/scripts/ci/setup_gpu_env.sh index 04252688f..521df7425 100755 --- a/scripts/ci/setup_gpu_env.sh +++ b/scripts/ci/setup_gpu_env.sh @@ -10,8 +10,7 @@ source ${SCRIPT_DIR}/ci/common.sh # Env CUDA setup if [[ ${GPU,,} =~ "cuda" ]]; then echo "Setting up CUDA environment" - echo "Hard-coding CUDA-compatible GCC version (12.3)" - source /swtools/gcc/gcc-12.3.0/gcc_vars.sh - source /swtools/cuda/latest/cuda_vars.sh + echo "Hard-coding MLIR-compatible CUDA version (12.9)" + source /swtools/cuda/12.9.0/cuda_vars.sh check_program nvcc fi diff --git a/scripts/github/build_tpp.sh b/scripts/github/build_tpp.sh index dd40750b7..b35ae0972 100755 --- a/scripts/github/build_tpp.sh +++ b/scripts/github/build_tpp.sh @@ -32,10 +32,6 @@ echo "--- ENVIRONMENT" if [ ! 
"${COMPILER}" ]; then COMPILER=clang fi -if [ "${COMPILER}" == "gcc" ]; then - echo "Hard-coding GCC to a known stable version (12.3)" - source /swtools/gcc/gcc-12.3.0/gcc_vars.sh -fi if [ "${SANITIZERS}" ]; then SANITIZERS="-S" fi diff --git a/test/Integration/vector-contract-to-outerproduct.mlir b/test/Integration/vector-contract-to-outerproduct.mlir index fc65e1233..1ca39219b 100644 --- a/test/Integration/vector-contract-to-outerproduct.mlir +++ b/test/Integration/vector-contract-to-outerproduct.mlir @@ -1,19 +1,18 @@ // RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e entry --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty // RUN: tpp-opt %s | tpp-run -e permA --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e permA --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty // RUN: tpp-opt %s | tpp-run -e permB --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e permB --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty // RUN: tpp-opt %s | tpp-run -e permAB --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e permAB --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMAB --allow-empty - +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMAB --allow-empty // DIFF-NOT: {{.}} #map = affine_map<(d0, d1, d2) -> (d0, d2)> diff --git a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir index d972b8ac7..a5a8b9c57 100644 --- a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir +++ b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir @@ -24,52 +24,6 @@ func.func @matmul(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, // ----- -func.func @matmul_transpose_a(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>) - -> tensor<2048x2048xbf16> { - %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) - outs(%arg2: tensor<2048x2048xbf16>) - -> tensor<2048x2048xbf16> - return %0 : tensor<2048x2048xbf16> -} - -// CHECK-LABEL: @matmul_transpose_a( -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>, -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>, -// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref<2048x2048xbf16> -// CHECK: memref.subview %[[ARG0]] -// CHECK: linalg.transpose -// CHECK: memref.subview %[[ARG1]] -// CHECK: call @xsmm_unary_invoke -// CHECK: memref.subview %[[ARG2]] -// CHECK: call @xsmm_intel_amx_tile_config_invoke -// CHECK: call @xsmm_brgemm_invoke -// CHECK: call @xsmm_intel_amx_tile_config_invoke - -// ----- - -func.func @matmul_transpose_b(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>) - -> 
diff --git a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir
index d972b8ac7..a5a8b9c57 100644
--- a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir
+++ b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir
@@ -24,52 +24,6 @@ func.func @matmul(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>,
 
 // -----
 
-func.func @matmul_transpose_a(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>)
-    -> tensor<2048x2048xbf16> {
-  %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<2048x2048xbf16>, tensor<2048x2048xbf16>)
-                                 outs(%arg2: tensor<2048x2048xbf16>)
-                                 -> tensor<2048x2048xbf16>
-  return %0 : tensor<2048x2048xbf16>
-}
-
-// CHECK-LABEL: @matmul_transpose_a(
-// CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG2:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>
-// CHECK: memref.subview %[[ARG0]]
-// CHECK: linalg.transpose
-// CHECK: memref.subview %[[ARG1]]
-// CHECK: call @xsmm_unary_invoke
-// CHECK: memref.subview %[[ARG2]]
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-// CHECK: call @xsmm_brgemm_invoke
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-
-// -----
-
-func.func @matmul_transpose_b(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>)
-    -> tensor<2048x2048xbf16> {
-  %0 = linalg.matmul_transpose_b ins(%arg0, %arg1: tensor<2048x2048xbf16>, tensor<2048x2048xbf16>)
-                                 outs(%arg2: tensor<2048x2048xbf16>)
-                                 -> tensor<2048x2048xbf16>
-  return %0 : tensor<2048x2048xbf16>
-}
-
-// CHECK-LABEL: @matmul_transpose_b(
-// CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG2:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>
-// CHECK: memref.subview %[[ARG0]]
-// CHECK: call @xsmm_unary_invoke
-// CHECK: memref.subview %[[ARG1]]
-// CHECK: linalg.transpose
-// CHECK: memref.subview %[[ARG2]]
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-// CHECK: call @xsmm_brgemm_invoke
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-
-// -----
-
 func.func @batch_matmul(%arg0: tensor<8x2048x2048xbf16>, %arg1: tensor<8x2048x2048xbf16>, %arg2: tensor<8x2048x2048xbf16>)
     -> tensor<8x2048x2048xbf16> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1: tensor<8x2048x2048xbf16>, tensor<8x2048x2048xbf16>)
diff --git a/test/Passes/fold-add-into-dest.mlir b/test/Passes/fold-add-into-dest.mlir
index 3b3df4dac..34c1073fc 100644
--- a/test/Passes/fold-add-into-dest.mlir
+++ b/test/Passes/fold-add-into-dest.mlir
@@ -36,32 +36,13 @@ func.func @expect_add_to_fold(%arg0: !type, %arg1: !type) -> !type {
 
 // -----
 
-!type = tensor<2048x2048xf32>
-func.func @expect_add_to_fold(%arg0: !type, %arg1: !type) -> !type {
-  %0 = arith.constant dense<1.111111e+00> : !type
-  %cst = arith.constant 0.000000e+00 : f32
-  %1 = tensor.empty() : !type
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : !type) -> !type
-  %3 = linalg.matmul_transpose_a ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type
-  %4 = linalg.matmul_transpose_b ins(%arg1, %0 : !type, !type) outs(%2 : !type) -> !type
-  %5 = linalg.add ins(%3, %4 : !type, !type) outs(%1 : !type) -> !type
-  return %5 : !type
-}
-
-// CHECK-LABEL: func.func @expect_add_to_fold
-// CHECK: %[[ACC:.+]] = linalg.matmul_transpose_a
-// CHECK-NEXT: %[[RES:.+]] = linalg.matmul_transpose_b ins(%[[X:.+]]) outs(%[[ACC]]
-// CHECK-NEXT: return %[[RES]]
-
-// -----
-
 !type = tensor<2048x2048xf32>
 func.func @expect_no_fold_as_operands_do_not_dominate_each_other(%arg0: !type, %arg1: !type) -> !type {
   %0 = arith.constant dense<1.111111e+00> : !type
   %cst = arith.constant 0.000000e+00 : f32
   %1 = tensor.empty() : !type
   %2 = linalg.fill ins(%cst : f32) outs(%1 : !type) -> !type
-  %3 = linalg.matmul_transpose_b ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type
+  %3 = linalg.matmul ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type
   %4 = linalg.add ins(%3, %3 : !type, !type) outs(%1 : !type) -> !type
   return %4 : !type
 }
@@ -69,7 +50,7 @@
 
 // CHECK-LABEL: func.func @expect_no_fold_as_operands_do_not_dominate_each_other
 // CHECK: linalg.fill
-// CHECK-NEXT: linalg.matmul_transpose_b
+// CHECK-NEXT: linalg.matmul
 // CHECK-NEXT: linalg.add
 // CHECK-NEXT: return
diff --git a/test/Passes/pass-convert-gemm-to-parallel-tile.mlir b/test/Passes/pass-convert-gemm-to-parallel-tile.mlir
index f9e132da4..6e8c8c4cc 100644
--- a/test/Passes/pass-convert-gemm-to-parallel-tile.mlir
+++ b/test/Passes/pass-convert-gemm-to-parallel-tile.mlir
@@ -29,7 +29,7 @@ module {
 // CHECK: %[[temp0:.*]] = call @xsmm_brgemm_dispatch(%[[c1_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c1024_i64]], %[[c1024_i64]], %[[c0_i64]])
 // CHECK: omp.parallel {
 // CHECK: omp.wsloop {
-// CHECK: omp.loop_nest (%[[ARG3:.*]], %[[ARG4:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c8]]) {
+// CHECK: omp.loop_nest (%[[ARG3:.*]], %[[ARG4:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c8]]) collapse(2) {
 // CHECK: memref.alloca_scope {
 // CHECK: scf.for %[[ARG5:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 // CHECK: %[[temp1:.*]] = arith.addi %[[ARG5]], %[[ARG3]] : index
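The new `collapse(2)` clause on `omp.loop_nest` reflects this LLVM version printing the collapse level explicitly: both induction variables form a single collapsed worksharing loop. A rough C++/OpenMP analogue of the checked IR, with bounds and steps mirroring the CHECK captures (`c8`/`c32`, `c2`/`c8`):

```cpp
// Rough OpenMP analogue of omp.loop_nest ... collapse(2); the two loops
// are fused into one parallel iteration space.
#include <cstdio>

void work(int i, int j) { std::printf("%d %d\n", i, j); }

int main() {
  #pragma omp parallel for collapse(2)
  for (int i = 0; i < 8; i += 2)
    for (int j = 0; j < 32; j += 8)
      work(i, j);
  return 0;
}
```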
diff --git a/test/Passes/pass-convert-mlp-to-parallel-tile.mlir b/test/Passes/pass-convert-mlp-to-parallel-tile.mlir
index 09567de45..265917ba9 100644
--- a/test/Passes/pass-convert-mlp-to-parallel-tile.mlir
+++ b/test/Passes/pass-convert-mlp-to-parallel-tile.mlir
@@ -80,7 +80,7 @@ module {
 //CHECK: %[[temp0:.*]] = call @xsmm_fused_brgemm_dispatch(%[[c1_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c1024_i64]], %[[c1024_i64]], %[[c0_i64]], %[[c0_i64]], %[[c5_i64]], %[[c4_i64]], %[[c1_i64]])
 //CHECK: omp.parallel {
 //CHECK: omp.wsloop {
-//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) {
+//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) collapse(2) {
 //CHECK: memref.alloca_scope {
 //CHECK: scf.for %[[ARG12:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 //CHECK: %[[temp1:.*]] = arith.addi %[[ARG12]], %[[ARG10]] : index
@@ -88,7 +88,7 @@ module {
 //CHECK: %[[temp2:.*]] = arith.addi %[[ARG13]], %[[ARG11]] : index
 //CHECK: omp.parallel {
 //CHECK: omp.wsloop {
-//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) {
+//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) collapse(2) {
 //CHECK: memref.alloca_scope {
 //CHECK: scf.for %[[ARG12:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 //CHECK: %[[temp1:.*]] = arith.addi %[[ARG12]], %[[ARG10]] : index
@@ -96,7 +96,7 @@ module {
 //CHECK: %[[temp2:.*]] = arith.addi %[[ARG13]], %[[ARG11]] : index
 //CHECK: omp.parallel {
 //CHECK: omp.wsloop {
-//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) {
+//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) collapse(2) {
 //CHECK: memref.alloca_scope {
 //CHECK: scf.for %[[ARG12:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 //CHECK: %[[temp1:.*]] = arith.addi %[[ARG12]], %[[ARG10]] : index
diff --git a/test/Passes/pass-matmul-blocking-default.mlir b/test/Passes/pass-matmul-blocking-default.mlir
index 425bc7d0f..16cccf2f7 100644
--- a/test/Passes/pass-matmul-blocking-default.mlir
+++ b/test/Passes/pass-matmul-blocking-default.mlir
@@ -29,64 +29,6 @@ func.func @block_linalg_matmul(
 
 // -----
 
-func.func @block_linalg_matmul_transpose_a(
-    %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32> {
-  %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
-                                 outs(%arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32>
-  return %0 : tensor<128x128xf32>
-}
-
-// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
-// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
-
-// CHECK-LABEL: func @block_linalg_matmul_transpose_a(
-// CHECK-SAME:    %[[ARG0:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG1:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG2:[0-9a-z]+]]: tensor<128x128xf32>) -> tensor<128x128xf32> {
-// CHECK: %[[BUF0:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK0:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 32] into %[[BUF0]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF1:.*]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK1:.+]] = linalg.pack %[[ARG1]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF1]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF2:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK2:.+]] = linalg.pack %[[ARG2]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF2]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[VAL:.+]] = linalg.generic {indexing_maps = [#[[MAP3]], #[[MAP4]], #[[MAP5]]], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%[[PACK0]], %[[PACK1]] : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%[[PACK2]] : tensor<4x4x32x32xf32>)
-// CHECK: %[[OUT:.+]] = linalg.unpack %[[VAL]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[ARG2]] : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
-// CHECK: return %[[OUT]] : tensor<128x128xf32>
-
-// -----
-
-func.func @block_linalg_matmul_transpose_b(
-    %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32> {
-  %0 = linalg.matmul_transpose_b ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
-                                 outs(%arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32>
-  return %0 : tensor<128x128xf32>
-}
-
-// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
-// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
-
-// CHECK-LABEL: func @block_linalg_matmul_transpose_b(
-// CHECK-SAME:    %[[ARG0:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG1:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG2:[0-9a-z]+]]: tensor<128x128xf32>) -> tensor<128x128xf32> {
-// CHECK: %[[BUF0:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK0:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF0]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF1:.*]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK1:.+]] = linalg.pack %[[ARG1]] outer_dims_perm = [0, 1] inner_dims_pos = [1, 0] inner_tiles = [32, 32] into %[[BUF1]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF2:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK2:.+]] = linalg.pack %[[ARG2]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF2]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[VAL:.+]] = linalg.generic {indexing_maps = [#[[MAP3]], #[[MAP4]], #[[MAP5]]], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%[[PACK0]], %[[PACK1]] : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%[[PACK2]] : tensor<4x4x32x32xf32>)
-// CHECK: %[[OUT:.+]] = linalg.unpack %[[VAL]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[ARG2]] : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
-// CHECK: return %[[OUT]] : tensor<128x128xf32>
-
-// -----
-
 func.func @block_linalg_matmul_dynamic(
     %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
     -> tensor<?x?xf32> {
diff --git a/test/Passes/split-reduction-dim.mlir b/test/Passes/split-reduction-dim.mlir
index 78eba15fb..baf849cad 100644
--- a/test/Passes/split-reduction-dim.mlir
+++ b/test/Passes/split-reduction-dim.mlir
@@ -178,52 +178,6 @@ func.func @tile_batch_reduce_matmul(%A: memref<2x32x64xf32>, %B: memref<2x64x16x
 
 // -----
 
-func.func @tile_matmul_transpose_a(%A: memref<64x32xf32>, %B: memref<64x16xf32>,
-                                   %C: memref<32x16xf32>) {
-  linalg.matmul_transpose_a ins(%A, %B: memref<64x32xf32>, memref<64x16xf32>)
-                            outs(%C: memref<32x16xf32>)
-  return
-}
-
-// CHECK-LABEL: @tile_matmul_transpose_a(
-// CHECK-SAME:  %[[A:[0-9a-z]+]]: memref<64x32xf32>
-// CHECK-SAME:  %[[B:[0-9a-z]+]]: memref<64x16xf32>
-// CHECK-SAME:  %[[C:[0-9a-z]+]]: memref<32x16xf32>
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[UB:.+]] = arith.constant 64 : index
-// CHECK-DAG: %[[K_TILE:.+]] = arith.constant 8 : index
-// CHECK: scf.for %[[IV:.+]] = %[[C0]] to %[[UB]] step %[[K_TILE]] {
-// CHECK: %[[SUBVIEW_A:.+]] = memref.subview %[[A]][%[[IV]], 0] [8, 32] [1, 1]
-// CHECK: %[[SUBVIEW_B:.+]] = memref.subview %[[B]][%[[IV]], 0] [8, 16] [1, 1]
-// CHECK: linalg.matmul_transpose_a
-// CHECK-SAME:  ins(%[[SUBVIEW_A]], %[[SUBVIEW_B]]
-// CHECK-SAME:  outs(%[[C]]
-
-// -----
-
-func.func @tile_matmul_transpose_b(%A: memref<32x64xf32>, %B: memref<16x64xf32>,
-                                   %C: memref<32x16xf32>) {
-  linalg.matmul_transpose_b ins(%A, %B: memref<32x64xf32>, memref<16x64xf32>)
-                            outs(%C: memref<32x16xf32>)
-  return
-}
-
-// CHECK-LABEL: @tile_matmul_transpose_b(
-// CHECK-SAME:  %[[A:[0-9a-z]+]]: memref<32x64xf32>
-// CHECK-SAME:  %[[B:[0-9a-z]+]]: memref<16x64xf32>
-// CHECK-SAME:  %[[C:[0-9a-z]+]]: memref<32x16xf32>
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[UB:.+]] = arith.constant 64 : index
-// CHECK-DAG: %[[K_TILE:.+]] = arith.constant 8 : index
-// CHECK: scf.for %[[IV:.+]] = %[[C0]] to %[[UB]] step %[[K_TILE]] {
-// CHECK: %[[SUBVIEW_A:.+]] = memref.subview %[[A]][0, %[[IV]]] [32, 8] [1, 1]
-// CHECK: %[[SUBVIEW_B:.+]] = memref.subview %[[B]][0, %[[IV]]] [16, 8] [1, 1]
-// CHECK: linalg.matmul_transpose_b
-// CHECK-SAME:  ins(%[[SUBVIEW_A]], %[[SUBVIEW_B]]
-// CHECK-SAME:  outs(%[[C]]
-
-// -----
-
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
 func.func @tile_generic_1D(%A: memref<32xf32>, %B: memref<32xf32>, %C: memref<f32>) {
diff --git a/tools/tpp-opt/CMakeLists.txt b/tools/tpp-opt/CMakeLists.txt
index 2c6128096..99b69e5a0 100644
--- a/tools/tpp-opt/CMakeLists.txt
+++ b/tools/tpp-opt/CMakeLists.txt
@@ -8,10 +8,17 @@ if(USE_OneDNN)
   set(ONEDNN_LIBS "tpp_dnnl_runner_utils")
 endif()
 
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  set_source_files_properties(
+    tpp-opt.cpp
+    PROPERTIES COMPILE_FLAGS "-Wno-error=subobject-linkage")
+endif()
+
+
 set(LIBS
-  ${dialect_libs}
-  ${conversion_libs}
-  ${extension_libs}
+  MLIRRegisterAllExtensions
+  MLIRRegisterAllDialects
+  MLIRRegisterAllPasses
   MLIRToLLVMIRTranslationRegistration
   MLIROptLib
   TPPPipeline
diff --git a/tools/tpp-opt/tpp-opt.cpp b/tools/tpp-opt/tpp-opt.cpp
index 4b3fd8b8b..43826add3 100644
--- a/tools/tpp-opt/tpp-opt.cpp
+++ b/tools/tpp-opt/tpp-opt.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h"
+#include "mlir/Dialect/Transform/TuneExtension/TuneExtension.h"
 
 #include "TPP/Dialect/Check/BufferizableOpInterfaceImpl.h"
 #include "TPP/Dialect/Check/CheckDialect.h"
"TPP/Dialect/Check/BufferizableOpInterfaceImpl.h" #include "TPP/Dialect/Check/CheckDialect.h" diff --git a/tools/tpp-run/CMakeLists.txt b/tools/tpp-run/CMakeLists.txt index 791954d67..00e097670 100644 --- a/tools/tpp-run/CMakeLists.txt +++ b/tools/tpp-run/CMakeLists.txt @@ -10,9 +10,9 @@ if(USE_OneDNN) endif() set(LIBS - ${dialect_libs} - ${conversion_libs} - ${extension_libs} + MLIRRegisterAllExtensions + MLIRRegisterAllDialects + MLIRRegisterAllPasses MLIRAnalysis MLIRExecutionEngine MLIRIR diff --git a/tools/tpp-run/tpp-run.cpp b/tools/tpp-run/tpp-run.cpp index a9257c938..4ab5d4606 100644 --- a/tools/tpp-run/tpp-run.cpp +++ b/tools/tpp-run/tpp-run.cpp @@ -52,6 +52,7 @@ #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Target/LLVMIR/ModuleTranslation.h" #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h" +#include "mlir/Dialect/Transform/TuneExtension/TuneExtension.h" #include "TPP/Dialect/Check/CheckDialect.h" #include "TPP/Dialect/Perf/PerfDialect.h"