diff --git a/.github/workflows/tpp-benchmark.yml b/.github/workflows/tpp-benchmark.yml
index 346e24434..3d6f81e4b 100644
--- a/.github/workflows/tpp-benchmark.yml
+++ b/.github/workflows/tpp-benchmark.yml
@@ -11,10 +11,6 @@ on:
         description: "Run on Zen5"
         type: boolean
         default: true
-      RUN_CLX_BENCH:
-        description: "Run on CLX"
-        type: boolean
-        default: true
       RUN_ARL_BENCH:
         description: "Run on ARL"
         type: boolean
@@ -100,36 +96,6 @@ jobs:
           ${{ github.workspace }}/scripts/github/benchmark.sh -o"
           ${{ env.SRUN }} --partition=zen5 --time=0:30:00 -- $CMD
 
-  TPP-MLIR-CLX-BASE:
-    runs-on: pcl-tiergarten
-    if: |
-      (github.event_name == 'push') ||
-      (github.event_name == 'workflow_dispatch' && inputs.RUN_CLX_BENCH) ||
-      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark-full'))
-    needs: Check_LLVM
-    steps:
-      - uses: actions/checkout@v4
-      - name: CLX Base
-        run: |-
-          CMD="KIND=Release COMPILER=clang LINKER=lld BENCHMARK_NUM_ITER=${{ env.NUM_ITER }} \
-          ${{ github.workspace }}/scripts/github/benchmark.sh -b -p"
-          ${{ env.SRUN }} --partition=clxap --time=0:30:00 -- $CMD
-
-  TPP-MLIR-CLX-OMP:
-    runs-on: pcl-tiergarten
-    if: |
-      (github.event_name == 'push') ||
-      (github.event_name == 'workflow_dispatch' && inputs.RUN_CLX_BENCH) ||
-      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark-full'))
-    needs: Check_LLVM
-    steps:
-      - uses: actions/checkout@v4
-      - name: CLX OpenMP
-        run: |-
-          CMD="KIND=Release COMPILER=clang LINKER=lld BENCHMARK_NUM_ITER=${{ env.NUM_ITER }} \
-          ${{ github.workspace }}/scripts/github/benchmark.sh -o"
-          ${{ env.SRUN }} --partition=clxap --time=0:30:00 -- $CMD
-
   TPP-MLIR-ARL-BASE:
     runs-on: pcl-tiergarten
     if: |
diff --git a/.github/workflows/tpp-llvm.yml b/.github/workflows/tpp-llvm.yml
index 533a37887..6233dbd47 100644
--- a/.github/workflows/tpp-llvm.yml
+++ b/.github/workflows/tpp-llvm.yml
@@ -27,6 +27,6 @@ jobs:
       - name: LLVM CUDA
         run: |-
           GPU=cuda scripts/github/check_llvm.sh || \
-          ${{ env.SRUN }} --partition=a100,v100 --time=0:30:00 -- \
+          ${{ env.SRUN }} --partition=a100 --time=0:30:00 -- \
           'KIND=RelWithDebInfo COMPILER=clang GPU=cuda \
           ${{ github.workspace }}/scripts/github/build_llvm.sh'
diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt
index 440231c3d..92ed0d99f 100644
--- a/build_tools/llvm_version.txt
+++ b/build_tools/llvm_version.txt
@@ -1 +1 @@
-eb6da944af31dd684be3ab2f93f453a3837a72c6
+8eba28bc8ce9447d09edda6fc79e2191a1669252
diff --git a/lib/TPP/CMakeLists.txt b/lib/TPP/CMakeLists.txt
index 31b6d8a0b..a40cc7c59 100644
--- a/lib/TPP/CMakeLists.txt
+++ b/lib/TPP/CMakeLists.txt
@@ -23,8 +23,8 @@ add_mlir_library(TPPPipeline
   LINK_LIBS PUBLIC
     MLIRIR
-    ${mlir_dialect_libs}
-    ${conversion_libs}
+    MLIRRegisterAllDialects
+    MLIRRegisterAllPasses
     TPPGPU
     TPPPassBundles
   )
diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp
index c1cb2c3a8..7cc145654 100644
--- a/lib/TPP/DefaultPipeline.cpp
+++ b/lib/TPP/DefaultPipeline.cpp
@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "TPP/PassBundles.h"
-
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/InitAllDialects.h"
 #include "mlir/InitAllPasses.h"
@@ -15,15 +13,25 @@
 #include "mlir/Pass/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Async/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Transforms/Passes.h"
+
 #include "TPP/Dialect/Check/BufferizableOpInterfaceImpl.h"
 #include "TPP/Dialect/Check/CheckDialect.h"
 #include "TPP/Dialect/Perf/BufferizableOpInterfaceImpl.h"
 #include "TPP/Dialect/Perf/PerfDialect.h"
 #include "TPP/Dialect/Perf/PerfOps.h"
 #include "TPP/Dialect/Xsmm/XsmmDialect.h"
+#include "TPP/PassBundles.h"
 #include "TPP/PassUtils.h"
 #include "TPP/Transforms/Utils/VNNIUtils.h"
-#include "mlir/Transforms/Passes.h"
 
 #include <string>
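Note on the two build changes above: the CMake lists stop globbing `${mlir_dialect_libs}`/`${conversion_libs}` and instead link the umbrella registration libraries that this LLVM version provides. A minimal sketch of what those libraries back, using only the `InitAllDialects.h`/`InitAllPasses.h` entry points that DefaultPipeline.cpp already includes:

```cpp
// Sketch: the registration calls enabled by linking
// MLIRRegisterAllDialects and MLIRRegisterAllPasses.
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/InitAllDialects.h"
#include "mlir/InitAllPasses.h"

int main() {
  mlir::DialectRegistry registry;
  mlir::registerAllDialects(registry); // from MLIRRegisterAllDialects
  mlir::registerAllPasses();           // from MLIRRegisterAllPasses
  mlir::MLIRContext context(registry);
  return 0;
}
```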
"mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Async/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/Passes.h" + #include "TPP/Dialect/Check/BufferizableOpInterfaceImpl.h" #include "TPP/Dialect/Check/CheckDialect.h" #include "TPP/Dialect/Perf/BufferizableOpInterfaceImpl.h" #include "TPP/Dialect/Perf/PerfDialect.h" #include "TPP/Dialect/Perf/PerfOps.h" #include "TPP/Dialect/Xsmm/XsmmDialect.h" +#include "TPP/PassBundles.h" #include "TPP/PassUtils.h" #include "TPP/Transforms/Utils/VNNIUtils.h" -#include "mlir/Transforms/Passes.h" #include diff --git a/lib/TPP/GPU/GpuPipeline.cpp b/lib/TPP/GPU/GpuPipeline.cpp index 4a0118fc3..5d7b92581 100644 --- a/lib/TPP/GPU/GpuPipeline.cpp +++ b/lib/TPP/GPU/GpuPipeline.cpp @@ -20,6 +20,7 @@ #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" diff --git a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp index a5fb6f68d..3722f73d8 100644 --- a/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp +++ b/lib/TPP/Transforms/ToBlockLayoutAndBack.cpp @@ -459,14 +459,15 @@ namespace { static SmallVector getDefaultBlockingFactors(linalg::LinalgOp linalgOp) { assert(linalgOp && "expect a valid linalgOp"); - if (isa(linalgOp) || - isa(linalgOp)) { + auto *op = linalgOp.getOperation(); + if (isa(op) || + isa(op)) { return {32, 32}; } - assert(isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp)); + assert(isa(op) || + isa(op) || + isa(op) || + isa(op)); return {32, 32, 32}; } @@ -492,12 +493,13 @@ struct PackMatmul : public tpp::impl::PackMatmulBase { auto packControlFn = [&](linalg::LinalgOp linalgOp) -> std::optional { linalg::BlockPackMatmulOptions options; + auto *op = linalgOp.getOperation(); // Pack only these named matmul variants. - if (!(isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp) || - isa(linalgOp))) { + if (!(isa(op) || + isa(op) || + isa(op) || + isa(op))) { return std::nullopt; } diff --git a/scripts/ci/setup_gpu_env.sh b/scripts/ci/setup_gpu_env.sh index 04252688f..521df7425 100755 --- a/scripts/ci/setup_gpu_env.sh +++ b/scripts/ci/setup_gpu_env.sh @@ -10,8 +10,7 @@ source ${SCRIPT_DIR}/ci/common.sh # Env CUDA setup if [[ ${GPU,,} =~ "cuda" ]]; then echo "Setting up CUDA environment" - echo "Hard-coding CUDA-compatible GCC version (12.3)" - source /swtools/gcc/gcc-12.3.0/gcc_vars.sh - source /swtools/cuda/latest/cuda_vars.sh + echo "Hard-coding MLIR-compatible CUDA version (12.9)" + source /swtools/cuda/12.9.0/cuda_vars.sh check_program nvcc fi diff --git a/scripts/github/build_tpp.sh b/scripts/github/build_tpp.sh index dd40750b7..b35ae0972 100755 --- a/scripts/github/build_tpp.sh +++ b/scripts/github/build_tpp.sh @@ -32,10 +32,6 @@ echo "--- ENVIRONMENT" if [ ! 
"${COMPILER}" ]; then COMPILER=clang fi -if [ "${COMPILER}" == "gcc" ]; then - echo "Hard-coding GCC to a known stable version (12.3)" - source /swtools/gcc/gcc-12.3.0/gcc_vars.sh -fi if [ "${SANITIZERS}" ]; then SANITIZERS="-S" fi diff --git a/test/Integration/vector-contract-to-outerproduct.mlir b/test/Integration/vector-contract-to-outerproduct.mlir index fc65e1233..1ca39219b 100644 --- a/test/Integration/vector-contract-to-outerproduct.mlir +++ b/test/Integration/vector-contract-to-outerproduct.mlir @@ -1,19 +1,18 @@ // RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e entry --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty // RUN: tpp-opt %s | tpp-run -e permA --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e permA --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty // RUN: tpp-opt %s | tpp-run -e permB --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e permB --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMA --allow-empty // RUN: tpp-opt %s | tpp-run -e permAB --entry-point-result=void -seed 123 -print > %t.1 // RUN: tpp-opt %s --vector-contract-to-outerproduct | tpp-run -e permAB --entry-point-result=void -seed 123 -print > %t.2 -// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMAB --allow-empty - +// RUN: fpcmp -r 0.0001 %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-PERMAB --allow-empty // DIFF-NOT: {{.}} #map = affine_map<(d0, d1, d2) -> (d0, d2)> diff --git a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir index d972b8ac7..a5a8b9c57 100644 --- a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir +++ b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir @@ -24,52 +24,6 @@ func.func @matmul(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, // ----- -func.func @matmul_transpose_a(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>) - -> tensor<2048x2048xbf16> { - %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) - outs(%arg2: tensor<2048x2048xbf16>) - -> tensor<2048x2048xbf16> - return %0 : tensor<2048x2048xbf16> -} - -// CHECK-LABEL: @matmul_transpose_a( -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>, -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>, -// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref<2048x2048xbf16> -// CHECK: memref.subview %[[ARG0]] -// CHECK: linalg.transpose -// CHECK: memref.subview %[[ARG1]] -// CHECK: call @xsmm_unary_invoke -// CHECK: memref.subview %[[ARG2]] -// CHECK: call @xsmm_intel_amx_tile_config_invoke -// CHECK: call @xsmm_brgemm_invoke -// CHECK: call @xsmm_intel_amx_tile_config_invoke - -// ----- - -func.func @matmul_transpose_b(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>) - -> 
diff --git a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir
index d972b8ac7..a5a8b9c57 100644
--- a/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir
+++ b/test/Passes/DefaultPipeline/linalg-matmul-variants.mlir
@@ -24,52 +24,6 @@ func.func @matmul(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>,
 
 // -----
 
-func.func @matmul_transpose_a(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>)
-    -> tensor<2048x2048xbf16> {
-  %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<2048x2048xbf16>, tensor<2048x2048xbf16>)
-                                 outs(%arg2: tensor<2048x2048xbf16>)
-                                 -> tensor<2048x2048xbf16>
-  return %0 : tensor<2048x2048xbf16>
-}
-
-// CHECK-LABEL: @matmul_transpose_a(
-// CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG2:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>
-// CHECK: memref.subview %[[ARG0]]
-// CHECK: linalg.transpose
-// CHECK: memref.subview %[[ARG1]]
-// CHECK: call @xsmm_unary_invoke
-// CHECK: memref.subview %[[ARG2]]
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-// CHECK: call @xsmm_brgemm_invoke
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-
-// -----
-
-func.func @matmul_transpose_b(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x2048xbf16>)
-    -> tensor<2048x2048xbf16> {
-  %0 = linalg.matmul_transpose_b ins(%arg0, %arg1: tensor<2048x2048xbf16>, tensor<2048x2048xbf16>)
-                                 outs(%arg2: tensor<2048x2048xbf16>)
-                                 -> tensor<2048x2048xbf16>
-  return %0 : tensor<2048x2048xbf16>
-}
-
-// CHECK-LABEL: @matmul_transpose_b(
-// CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>,
-// CHECK-SAME:  %[[ARG2:[a-zA-Z0-9]+]]: memref<2048x2048xbf16>
-// CHECK: memref.subview %[[ARG0]]
-// CHECK: call @xsmm_unary_invoke
-// CHECK: memref.subview %[[ARG1]]
-// CHECK: linalg.transpose
-// CHECK: memref.subview %[[ARG2]]
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-// CHECK: call @xsmm_brgemm_invoke
-// CHECK: call @xsmm_intel_amx_tile_config_invoke
-
-// -----
-
 func.func @batch_matmul(%arg0: tensor<8x2048x2048xbf16>, %arg1: tensor<8x2048x2048xbf16>, %arg2: tensor<8x2048x2048xbf16>)
     -> tensor<8x2048x2048xbf16> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1: tensor<8x2048x2048xbf16>, tensor<8x2048x2048xbf16>)
diff --git a/test/Passes/fold-add-into-dest.mlir b/test/Passes/fold-add-into-dest.mlir
index 3b3df4dac..34c1073fc 100644
--- a/test/Passes/fold-add-into-dest.mlir
+++ b/test/Passes/fold-add-into-dest.mlir
@@ -36,32 +36,13 @@ func.func @expect_add_to_fold(%arg0: !type, %arg1: !type) -> !type {
 
 // -----
 
-!type = tensor<2048x2048xf32>
-func.func @expect_add_to_fold(%arg0: !type, %arg1: !type) -> !type {
-  %0 = arith.constant dense<1.111111e+00> : !type
-  %cst = arith.constant 0.000000e+00 : f32
-  %1 = tensor.empty() : !type
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : !type) -> !type
-  %3 = linalg.matmul_transpose_a ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type
-  %4 = linalg.matmul_transpose_b ins(%arg1, %0 : !type, !type) outs(%2 : !type) -> !type
-  %5 = linalg.add ins(%3, %4 : !type, !type) outs(%1 : !type) -> !type
-  return %5 : !type
-}
-
-// CHECK-LABEL: func.func @expect_add_to_fold
-// CHECK: %[[ACC:.+]] = linalg.matmul_transpose_a
-// CHECK-NEXT: %[[RES:.+]] = linalg.matmul_transpose_b ins(%[[X:.+]]) outs(%[[ACC]]
-// CHECK-NEXT: return %[[RES]]
-
-// -----
-
 !type = tensor<2048x2048xf32>
 func.func @expect_no_fold_as_operands_do_not_dominate_each_other(%arg0: !type, %arg1: !type) -> !type {
   %0 = arith.constant dense<1.111111e+00> : !type
   %cst = arith.constant 0.000000e+00 : f32
   %1 = tensor.empty() : !type
   %2 = linalg.fill ins(%cst : f32) outs(%1 : !type) -> !type
-  %3 = linalg.matmul_transpose_b ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type
+  %3 = linalg.matmul ins(%arg0, %0 : !type, !type) outs(%2 : !type) -> !type
   %4 = linalg.add ins(%3, %3 : !type, !type) outs(%1 : !type) -> !type
   return %4 : !type
 }
@@ -69,7 +50,7 @@
 
 // CHECK-LABEL: func.func @expect_no_fold_as_operands_do_not_dominate_each_other
 // CHECK: linalg.fill
-// CHECK-NEXT: linalg.matmul_transpose_b
+// CHECK-NEXT: linalg.matmul
 // CHECK-NEXT: linalg.add
 // CHECK-NEXT: return
diff --git a/test/Passes/pass-convert-gemm-to-parallel-tile.mlir b/test/Passes/pass-convert-gemm-to-parallel-tile.mlir
index f9e132da4..6e8c8c4cc 100644
--- a/test/Passes/pass-convert-gemm-to-parallel-tile.mlir
+++ b/test/Passes/pass-convert-gemm-to-parallel-tile.mlir
@@ -29,7 +29,7 @@ module {
 // CHECK: %[[temp0:.*]] = call @xsmm_brgemm_dispatch(%[[c1_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c1024_i64]], %[[c1024_i64]], %[[c0_i64]])
 // CHECK: omp.parallel {
 // CHECK: omp.wsloop {
-// CHECK: omp.loop_nest (%[[ARG3:.*]], %[[ARG4:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c8]]) {
+// CHECK: omp.loop_nest (%[[ARG3:.*]], %[[ARG4:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c8]]) collapse(2) {
 // CHECK: memref.alloca_scope {
 // CHECK: scf.for %[[ARG5:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 // CHECK: %[[temp1:.*]] = arith.addi %[[ARG5]], %[[ARG3]] : index
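The new `collapse(2)` clause on `omp.loop_nest` reflects this LLVM version printing the collapse level explicitly: both induction variables form a single collapsed worksharing loop. A rough C++/OpenMP analogue of the checked IR, with bounds and steps mirroring the CHECK captures (`c8`/`c32`, `c2`/`c8`):

```cpp
// Rough OpenMP analogue of omp.loop_nest ... collapse(2); the two loops
// are fused into one parallel iteration space.
#include <cstdio>

void work(int i, int j) { std::printf("%d %d\n", i, j); }

int main() {
  #pragma omp parallel for collapse(2)
  for (int i = 0; i < 8; i += 2)
    for (int j = 0; j < 32; j += 8)
      work(i, j);
  return 0;
}
```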
diff --git a/test/Passes/pass-convert-mlp-to-parallel-tile.mlir b/test/Passes/pass-convert-mlp-to-parallel-tile.mlir
index 09567de45..265917ba9 100644
--- a/test/Passes/pass-convert-mlp-to-parallel-tile.mlir
+++ b/test/Passes/pass-convert-mlp-to-parallel-tile.mlir
@@ -80,7 +80,7 @@ module {
 //CHECK: %[[temp0:.*]] = call @xsmm_fused_brgemm_dispatch(%[[c1_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c32_i64]], %[[c1024_i64]], %[[c1024_i64]], %[[c0_i64]], %[[c0_i64]], %[[c5_i64]], %[[c4_i64]], %[[c1_i64]])
 //CHECK: omp.parallel {
 //CHECK: omp.wsloop {
-//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) {
+//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) collapse(2) {
 //CHECK: memref.alloca_scope {
 //CHECK: scf.for %[[ARG12:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 //CHECK: %[[temp1:.*]] = arith.addi %[[ARG12]], %[[ARG10]] : index
@@ -88,7 +88,7 @@ module {
 //CHECK: %[[temp2:.*]] = arith.addi %[[ARG13]], %[[ARG11]] : index
 //CHECK: omp.parallel {
 //CHECK: omp.wsloop {
-//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) {
+//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) collapse(2) {
 //CHECK: memref.alloca_scope {
 //CHECK: scf.for %[[ARG12:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 //CHECK: %[[temp1:.*]] = arith.addi %[[ARG12]], %[[ARG10]] : index
@@ -96,7 +96,7 @@ module {
 //CHECK: %[[temp2:.*]] = arith.addi %[[ARG13]], %[[ARG11]] : index
 //CHECK: omp.parallel {
 //CHECK: omp.wsloop {
-//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) {
+//CHECK: omp.loop_nest (%[[ARG10:.*]], %[[ARG11:.*]]) : index = (%[[c0]], %[[c0]]) to (%[[c8]], %[[c32]]) step (%[[c2]], %[[c16]]) collapse(2) {
 //CHECK: memref.alloca_scope {
 //CHECK: scf.for %[[ARG12:.*]] = %[[c0]] to %[[c2]] step %[[c1]] {
 //CHECK: %[[temp1:.*]] = arith.addi %[[ARG12]], %[[ARG10]] : index
diff --git a/test/Passes/pass-matmul-blocking-default.mlir b/test/Passes/pass-matmul-blocking-default.mlir
index 425bc7d0f..16cccf2f7 100644
--- a/test/Passes/pass-matmul-blocking-default.mlir
+++ b/test/Passes/pass-matmul-blocking-default.mlir
@@ -29,64 +29,6 @@ func.func @block_linalg_matmul(
 
 // -----
 
-func.func @block_linalg_matmul_transpose_a(
-    %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32> {
-  %0 = linalg.matmul_transpose_a ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
-                                 outs(%arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32>
-  return %0 : tensor<128x128xf32>
-}
-
-// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
-// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
-
-// CHECK-LABEL: func @block_linalg_matmul_transpose_a(
-// CHECK-SAME:    %[[ARG0:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG1:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG2:[0-9a-z]+]]: tensor<128x128xf32>) -> tensor<128x128xf32> {
-// CHECK: %[[BUF0:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK0:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 32] into %[[BUF0]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF1:.*]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK1:.+]] = linalg.pack %[[ARG1]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF1]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF2:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK2:.+]] = linalg.pack %[[ARG2]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF2]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[VAL:.+]] = linalg.generic {indexing_maps = [#[[MAP3]], #[[MAP4]], #[[MAP5]]], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%[[PACK0]], %[[PACK1]] : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%[[PACK2]] : tensor<4x4x32x32xf32>)
-// CHECK: %[[OUT:.+]] = linalg.unpack %[[VAL]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[ARG2]] : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
-// CHECK: return %[[OUT]] : tensor<128x128xf32>
-
-// -----
-
-func.func @block_linalg_matmul_transpose_b(
-    %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32> {
-  %0 = linalg.matmul_transpose_b ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
-                                 outs(%arg2: tensor<128x128xf32>)
-    -> tensor<128x128xf32>
-  return %0 : tensor<128x128xf32>
-}
-
-// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
-// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
-
-// CHECK-LABEL: func @block_linalg_matmul_transpose_b(
-// CHECK-SAME:    %[[ARG0:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG1:[0-9a-z]+]]: tensor<128x128xf32>
-// CHECK-SAME:    %[[ARG2:[0-9a-z]+]]: tensor<128x128xf32>) -> tensor<128x128xf32> {
-// CHECK: %[[BUF0:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK0:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF0]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF1:.*]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK1:.+]] = linalg.pack %[[ARG1]] outer_dims_perm = [0, 1] inner_dims_pos = [1, 0] inner_tiles = [32, 32] into %[[BUF1]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[BUF2:.+]] = tensor.empty() : tensor<4x4x32x32xf32>
-// CHECK: %[[PACK2:.+]] = linalg.pack %[[ARG2]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[BUF2]] : tensor<128x128xf32> -> tensor<4x4x32x32xf32>
-// CHECK: %[[VAL:.+]] = linalg.generic {indexing_maps = [#[[MAP3]], #[[MAP4]], #[[MAP5]]], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%[[PACK0]], %[[PACK1]] : tensor<4x4x32x32xf32>, tensor<4x4x32x32xf32>) outs(%[[PACK2]] : tensor<4x4x32x32xf32>)
-// CHECK: %[[OUT:.+]] = linalg.unpack %[[VAL]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[ARG2]] : tensor<4x4x32x32xf32> -> tensor<128x128xf32>
-// CHECK: return %[[OUT]] : tensor<128x128xf32>
-
-// -----
-
 func.func @block_linalg_matmul_dynamic(
     %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
     -> tensor<?x?xf32> {
diff --git a/test/Passes/split-reduction-dim.mlir b/test/Passes/split-reduction-dim.mlir
index 78eba15fb..baf849cad 100644
--- a/test/Passes/split-reduction-dim.mlir
+++ b/test/Passes/split-reduction-dim.mlir
@@ -178,52 +178,6 @@ func.func @tile_batch_reduce_matmul(%A: memref<2x32x64xf32>, %B: memref<2x64x16x
 
 // -----
 
-func.func @tile_matmul_transpose_a(%A: memref<64x32xf32>, %B: memref<64x16xf32>,
-                                   %C: memref<32x16xf32>) {
-  linalg.matmul_transpose_a ins(%A, %B: memref<64x32xf32>, memref<64x16xf32>)
-                            outs(%C: memref<32x16xf32>)
-  return
-}
-
-// CHECK-LABEL: @tile_matmul_transpose_a(
-// CHECK-SAME:  %[[A:[0-9a-z]+]]: memref<64x32xf32>
-// CHECK-SAME:  %[[B:[0-9a-z]+]]: memref<64x16xf32>
-// CHECK-SAME:  %[[C:[0-9a-z]+]]: memref<32x16xf32>
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[UB:.+]] = arith.constant 64 : index
-// CHECK-DAG: %[[K_TILE:.+]] = arith.constant 8 : index
-// CHECK: scf.for %[[IV:.+]] = %[[C0]] to %[[UB]] step %[[K_TILE]] {
-// CHECK: %[[SUBVIEW_A:.+]] = memref.subview %[[A]][%[[IV]], 0] [8, 32] [1, 1]
-// CHECK: %[[SUBVIEW_B:.+]] = memref.subview %[[B]][%[[IV]], 0] [8, 16] [1, 1]
-// CHECK: linalg.matmul_transpose_a
-// CHECK-SAME:  ins(%[[SUBVIEW_A]], %[[SUBVIEW_B]]
-// CHECK-SAME:  outs(%[[C]]
-
-// -----
-
-func.func @tile_matmul_transpose_b(%A: memref<32x64xf32>, %B: memref<16x64xf32>,
-                                   %C: memref<32x16xf32>) {
-  linalg.matmul_transpose_b ins(%A, %B: memref<32x64xf32>, memref<16x64xf32>)
-                            outs(%C: memref<32x16xf32>)
-  return
-}
-
-// CHECK-LABEL: @tile_matmul_transpose_b(
-// CHECK-SAME:  %[[A:[0-9a-z]+]]: memref<32x64xf32>
-// CHECK-SAME:  %[[B:[0-9a-z]+]]: memref<16x64xf32>
-// CHECK-SAME:  %[[C:[0-9a-z]+]]: memref<32x16xf32>
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[UB:.+]] = arith.constant 64 : index
-// CHECK-DAG: %[[K_TILE:.+]] = arith.constant 8 : index
-// CHECK: scf.for %[[IV:.+]] = %[[C0]] to %[[UB]] step %[[K_TILE]] {
-// CHECK: %[[SUBVIEW_A:.+]] = memref.subview %[[A]][0, %[[IV]]] [32, 8] [1, 1]
-// CHECK: %[[SUBVIEW_B:.+]] = memref.subview %[[B]][0, %[[IV]]] [16, 8] [1, 1]
-// CHECK: linalg.matmul_transpose_b
-// CHECK-SAME:  ins(%[[SUBVIEW_A]], %[[SUBVIEW_B]]
-// CHECK-SAME:  outs(%[[C]]
-
-// -----
-
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
 func.func @tile_generic_1D(%A: memref<32xf32>, %B: memref<32xf32>, %C: memref<f32>) {
diff --git a/tools/tpp-opt/CMakeLists.txt b/tools/tpp-opt/CMakeLists.txt
index 2c6128096..99b69e5a0 100644
--- a/tools/tpp-opt/CMakeLists.txt
+++ b/tools/tpp-opt/CMakeLists.txt
@@ -8,10 +8,17 @@ if(USE_OneDNN)
   set(ONEDNN_LIBS "tpp_dnnl_runner_utils")
 endif()
 
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  set_source_files_properties(
+    tpp-opt.cpp
+    PROPERTIES COMPILE_FLAGS "-Wno-error=subobject-linkage")
+endif()
+
+
 set(LIBS
-  ${dialect_libs}
-  ${conversion_libs}
-  ${extension_libs}
+  MLIRRegisterAllExtensions
+  MLIRRegisterAllDialects
+  MLIRRegisterAllPasses
   MLIRToLLVMIRTranslationRegistration
   MLIROptLib
   TPPPipeline
diff --git a/tools/tpp-opt/tpp-opt.cpp b/tools/tpp-opt/tpp-opt.cpp
index 4b3fd8b8b..43826add3 100644
--- a/tools/tpp-opt/tpp-opt.cpp
+++ b/tools/tpp-opt/tpp-opt.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h"
+#include "mlir/Dialect/Transform/TuneExtension/TuneExtension.h"
 
 #include "TPP/Dialect/Check/BufferizableOpInterfaceImpl.h"
 #include "TPP/Dialect/Check/CheckDialect.h"
"TPP/Dialect/Check/BufferizableOpInterfaceImpl.h" #include "TPP/Dialect/Check/CheckDialect.h" diff --git a/tools/tpp-run/CMakeLists.txt b/tools/tpp-run/CMakeLists.txt index 791954d67..00e097670 100644 --- a/tools/tpp-run/CMakeLists.txt +++ b/tools/tpp-run/CMakeLists.txt @@ -10,9 +10,9 @@ if(USE_OneDNN) endif() set(LIBS - ${dialect_libs} - ${conversion_libs} - ${extension_libs} + MLIRRegisterAllExtensions + MLIRRegisterAllDialects + MLIRRegisterAllPasses MLIRAnalysis MLIRExecutionEngine MLIRIR diff --git a/tools/tpp-run/tpp-run.cpp b/tools/tpp-run/tpp-run.cpp index a9257c938..4ab5d4606 100644 --- a/tools/tpp-run/tpp-run.cpp +++ b/tools/tpp-run/tpp-run.cpp @@ -52,6 +52,7 @@ #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Target/LLVMIR/ModuleTranslation.h" #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h" +#include "mlir/Dialect/Transform/TuneExtension/TuneExtension.h" #include "TPP/Dialect/Check/CheckDialect.h" #include "TPP/Dialect/Perf/PerfDialect.h"