Merge branch 'main' into xiaohui/vectorization

Xu, Xiaohui1 · Xu, Xiaohui1 · commit 9a62f0b5475e · 2024-09-20T17:28:27.000+08:00
diff --git a/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h b/include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h
@@ -11,6 +11,8 @@
 
 #include <cmath>
 
+#include "gc/Utils.h"
+
 extern "C" {
 // Runtime interfaces
 
@@ -26,24 +28,25 @@ extern "C" {
  * 	                given in dnnl type value.
  * Output: A handle of dispatched kernel.
  */
-int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
-                             int64_t LDB, int64_t LDC, int64_t stride_a,
-                             int64_t stride_b, float beta, int64_t dtypeA,
-                             int64_t dtypeB);
+GC_DLL_EXPORT int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K,
+                                           int64_t LDA, int64_t LDB,
+                                           int64_t LDC, int64_t stride_a,
+                                           int64_t stride_b, float beta,
+                                           int64_t dtypeA, int64_t dtypeB);
 
 /**
  * Config the AMX tile context for given kernel.
  * Inputs: A handle of dispatched kernel.
  * Output: None.
  */
-void dnnl_brgemm_tileconfig(int64_t kernel);
+GC_DLL_EXPORT void dnnl_brgemm_tileconfig(int64_t kernel);
 
 /**
  * Release the current AMX tile context.
  * Inputs: None.
  * Output: None.
  */
-void dnnl_brgemm_tilerelease();
+GC_DLL_EXPORT void dnnl_brgemm_tilerelease();
 
 /**
  * Execute the given kernel with given parameters.
@@ -54,9 +57,10 @@ void dnnl_brgemm_tilerelease();
  * 	num: Batch size of Brgemm.
  * Output: None.
  */
-void dnnl_brgemm_execute(int64_t kernel, void *A, uint64_t A_offset, void *B,
-                         uint64_t B_offset, void *C, uint64_t C_offset,
-                         int num);
+GC_DLL_EXPORT void dnnl_brgemm_execute(int64_t kernel, void *A,
+                                       uint64_t A_offset, void *B,
+                                       uint64_t B_offset, void *C,
+                                       uint64_t C_offset, int num);
 }
 
 struct bf16_t {
diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h
@@ -115,6 +115,10 @@ std::unique_ptr<Pass> createMergeAllocPass();
 void populateFrontendPasses(mlir::OpPassManager &);
 void populateCPUPipeline(mlir::OpPassManager &);
 
+#ifdef GC_USE_IMEX
+void populateGPUPipeline(mlir::OpPassManager &);
+#endif
+
 #define GEN_PASS_DECL
 #include "gc/Transforms/Passes.h.inc"
 
diff --git a/include/gc/Utils.h b/include/gc/Utils.h
@@ -0,0 +1,18 @@
+//===-- Utils.h - Common utility functions and macros -----------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_UTILS_H
+#define GC_UTILS_H
+
+#if defined _WIN32 || defined __CYGWIN__
+#define GC_DLL_EXPORT __declspec(dllexport)
+#else
+#define GC_DLL_EXPORT __attribute__((visibility("default")))
+#endif
+
+#endif
diff --git a/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp b/lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp
@@ -21,6 +21,8 @@
 #include <unistd.h>
 #endif
 
+#include "gc/Utils.h"
+
 #ifdef _MSC_VER
 #define __builtin_expect(EXP_, C) (EXP_)
 #endif
@@ -239,16 +241,18 @@ static thread_local FILOMemoryPool mainMemoryPool_{mainChunkSize};
 // if the current thread is a worker thread, use this pool
 static thread_local FILOMemoryPool threadMemoryPool_{threadlocalChunkSize};
 
-extern "C" void *gcAlignedMalloc(size_t sz) noexcept {
+extern "C" GC_DLL_EXPORT void *gcAlignedMalloc(size_t sz) noexcept {
   return mainMemoryPool_.alloc(sz);
 }
 
-extern "C" void gcAlignedFree(void *p) noexcept { mainMemoryPool_.dealloc(p); }
+extern "C" GC_DLL_EXPORT void gcAlignedFree(void *p) noexcept {
+  mainMemoryPool_.dealloc(p);
+}
 
-extern "C" void *gcThreadAlignedMalloc(size_t sz) noexcept {
+extern "C" GC_DLL_EXPORT void *gcThreadAlignedMalloc(size_t sz) noexcept {
   return threadMemoryPool_.alloc(sz);
 }
 
-extern "C" void gcThreadAlignedFree(void *p) noexcept {
+extern "C" GC_DLL_EXPORT void gcThreadAlignedFree(void *p) noexcept {
   threadMemoryPool_.dealloc(p);
 }
diff --git a/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt b/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt
@@ -4,6 +4,9 @@ gc_add_mlir_library(GcOpenclRuntime
     SHARED
     OpenCLRuntimeWrappers.cpp
 
+    LINK_LIBS PUBLIC
+    GcInterface
+
     EXCLUDE_FROM_LIBMLIR
   )
 
diff --git a/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp b/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp
@@ -18,11 +18,9 @@
 #include <stdexcept>
 #include <vector>
 
-#ifdef _WIN32
-#define OCL_RUNTIME_EXPORT __declspec(dllexport)
-#else
-#define OCL_RUNTIME_EXPORT __attribute__((visibility("default")))
-#endif // _WIN32
+#include "gc/Utils.h"
+
+#define OCL_RUNTIME_EXPORT GC_DLL_EXPORT
 
 namespace {
 
diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt
@@ -13,8 +13,8 @@ gc_add_mlir_library(GcPasses
   MemRefToCPURuntime.cpp
   OneDNNGraphToLinalg.cpp
   Pipeline.cpp
+  TileUsingInterfaceX.cpp
   IterativeTilingAndFusion.cpp
-  TilingUsingInterfaceX.cpp
   VerifyTargetDescription.cpp
   DecomposeAggregatedOps.cpp
   DeepTileContractionOp.cpp
diff --git a/lib/gc/Transforms/GPU/CMakeLists.txt b/lib/gc/Transforms/GPU/CMakeLists.txt
@@ -1,5 +1,6 @@
 gc_add_mlir_library(GcGpuPasses
   LinalgToXeGPU.cpp
+  Pipeline.cpp
 
   DEPENDS
     GraphCompilerPassIncGen
@@ -18,3 +19,7 @@ gc_add_mlir_library(GcGpuPasses
     GcUtilsIR
 )
 
+include(imex)
+get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
+target_include_directories(GcGpuPasses PRIVATE ${IMEX_INCLUDES})
+
diff --git a/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp b/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
@@ -1405,6 +1405,17 @@ LogicalResult createMemoryFillKernel(linalg::LinalgOp linalgOp,
   auto outputType = cast<ShapedType>(output.getType());
   auto outputShape = outputType.getShape();
 
+  if (outputShape.size() != 2) {
+    return rewriter.notifyMatchFailure(
+        linalgOp, "Memory fill operation expects 2D output");
+  }
+
+  // Otherwise 'xegpu-to-vc' pass will fail to convert it to VC
+  if (outputShape[0] * outputShape[1] < 16) {
+    return rewriter.notifyMatchFailure(
+        linalgOp, "Memory fill operation is to small to be converted to xegpu");
+  }
+
   // Extract SIMD sized sub-tiles
   int maxSizeSIMD = 256;
   int64_t subTileCols = outputShape[1];
diff --git a/lib/gc/Transforms/GPU/Pipeline.cpp b/lib/gc/Transforms/GPU/Pipeline.cpp
@@ -0,0 +1,120 @@
+//===- Pipeline.cpp - Graph Compiler GPU pipeline ---------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Math/Transforms/Passes.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/Passes.h"
+#include <iostream>
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
+
+#include <imex/Conversion/Passes.h>
+#include <imex/Transforms/Passes.h>
+
+#include <string>
+
+#include "gc/Transforms/Passes.h"
+
+namespace mlir::gc {
+
+void populateGPUPipeline(mlir::OpPassManager &pm) {
+  pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());
+
+  pm.addPass(bufferization::createEmptyTensorEliminationPass());
+  pm.addPass(bufferization::createEmptyTensorToAllocTensorPass());
+
+  bufferization::OneShotBufferizationOptions options;
+  options.bufferizeFunctionBoundaries = true;
+  options.setFunctionBoundaryTypeConversion(
+      bufferization::LayoutMapOption::IdentityLayoutMap);
+  pm.addPass(bufferization::createOneShotBufferizePass(options));
+
+  pm.addPass(bufferization::createDropEquivalentBufferResultsPass());
+  pm.addNestedPass<func::FuncOp>(
+      bufferization::createFinalizingBufferizePass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+  pm.addPass(bufferization::createDropEquivalentBufferResultsPass());
+  pm.addPass(memref::createExpandReallocPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(bufferization::createOwnershipBasedBufferDeallocationPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(bufferization::createBufferDeallocationSimplificationPass());
+  pm.addPass(bufferization::createLowerDeallocationsPass());
+  pm.addPass(createCSEPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createBufferizationToMemRefPass());
+
+  pm.addNestedPass<func::FuncOp>(createForallToParallelLoopPass());
+  pm.addNestedPass<func::FuncOp>(createLinalgToXeGPU(
+      {/*kTile=*/16, /*stages=*/1, /*dpasTiles=*/{8, 16, 16}}));
+
+  pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
+  pm.addPass(xegpu::createXeGPUFoldAliasOps());
+  pm.addPass(memref::createFoldMemRefAliasOpsPass());
+  pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
+  pm.addNestedPass<func::FuncOp>(createParallelLoopToGpuPass());
+
+  pm.addNestedPass<func::FuncOp>(imex::createInsertGPUAllocsPass("opencl"));
+  pm.addPass(createGpuKernelOutliningPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(imex::createSetSPIRVCapabilitiesPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(
+      imex::createSetSPIRVAbiAttributePass("opencl"));
+  pm.addPass(createLowerAffinePass());
+  pm.addPass(imex::createVectorLinearizePass());
+  pm.addNestedPass<gpu::GPUModuleOp>(imex::createConvertXeGPUToVCPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
+  pm.addPass(imex::createBF16ToGPUPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertFuncToSPIRVPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSPIRVPass());
+  pm.addPass(imex::createConvertGPUXToSPIRVPass());
+  pm.addNestedPass<spirv::ModuleOp>(spirv::createSPIRVLowerABIAttributesPass());
+  pm.addNestedPass<spirv::ModuleOp>(spirv::createSPIRVUpdateVCEPass());
+  pm.addNestedPass<func::FuncOp>(LLVM::createRequestCWrappersPass());
+  pm.addPass(imex::createSerializeSPIRVPass());
+  pm.addPass(createConvertVectorToSCFPass());
+  pm.addPass(imex::createConvertGPUToGPUXPass());
+  pm.addPass(createConvertSCFToCFPass());
+  pm.addPass(createConvertControlFlowToLLVMPass());
+  pm.addPass(createConvertVectorToLLVMPass());
+  pm.addPass(createConvertIndexToLLVMPass());
+  pm.addPass(createArithToLLVMConversionPass());
+  pm.addPass(createConvertFuncToLLVMPass());
+  pm.addPass(createConvertMathToLLVMPass());
+  pm.addPass(imex::createConvertGPUXToLLVMPass());
+  pm.addPass(createConvertIndexToLLVMPass());
+  pm.addPass(memref::createExpandStridedMetadataPass());
+  pm.addPass(createLowerAffinePass());
+  pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
+}
+
+void registerGPUPipeline() {
+  PassPipelineRegistration<>("gc-gpu-pipeline",
+                             "The GPU pipeline for Graph Compiler with IMEX",
+                             populateGPUPipeline);
+}
+
+} // namespace mlir::gc
diff --git a/lib/gc/Transforms/IterativeTilingAndFusion.cpp b/lib/gc/Transforms/IterativeTilingAndFusion.cpp
@@ -33,7 +33,7 @@
 #include <memory>
 #include <unordered_map>
 
-#include "TilingUsingInterfaceX.h"
+#include "TileUsingInterfaceX.h"
 
 namespace mlir {
 namespace gc {
diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
@@ -56,9 +56,9 @@ void populateTensorPasses(mlir::OpPassManager &pm) {
   // linalg.matmul lowering to (scf.loop + linalg.brgemm) pass
   pm.addNestedPass<func::FuncOp>(createDeepTileContractionOp());
 
-  // Fine-grain fusion pass
+  // fine-grain fusion pass
   pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());
-  // todo: fine-grain fusion pass
+
   pm.addNestedPass<func::FuncOp>(
       mlir::microkernel::createConvertLinalgToMicrokernel());
   // todo: lower linalg to arith/math on virtual vector pass
diff --git a/lib/gc/Transforms/TileUsingInterfaceX.cpp b/lib/gc/Transforms/TileUsingInterfaceX.cpp
@@ -1,4 +1,4 @@
-//===-- TilingUsingInterfaceX.cpp -  upstream eXtension ---------*- C++ -*-===//
+//===-- TileUsingInterfaceX.cpp - upstream eXtension ------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -26,7 +26,7 @@
 #include "llvm/Support/Debug.h"
 #include <optional>
 
-#include "TilingUsingInterfaceX.h"
+#include "TileUsingInterfaceX.h"
 
 #define DEBUG_TYPE "tile-using-interface-x"
 
diff --git a/lib/gc/Transforms/TileUsingInterfaceX.h b/lib/gc/Transforms/TileUsingInterfaceX.h
@@ -1,13 +1,13 @@
-//===-- TilingUsingInterfaceX.h -  upstream eXtension -----------*- C++ -*-===//
+//===-- TileUsingInterfaceX.h - upstream eXtension --------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TEMPORARY_TILEUSINGINTERFACE_X_H
-#define TEMPORARY_TILEUSINGINTERFACE_X_H
+#ifndef TILE_USING_INTERFACE_X_H
+#define TILE_USING_INTERFACE_X_H
 
 #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
 
diff --git a/lib/gc/Transforms/TilingUtil.hpp b/lib/gc/Transforms/TilingUtil.hpp
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TEMPORARY_TILEUSINGINTERFACE_X_H
-#define TEMPORARY_TILEUSINGINTERFACE_X_H
+#ifndef TILING_UTIL_H
+#define TILING_UTIL_H
 
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -16,7 +16,7 @@
 namespace mlir {
 namespace linalgX {
 
-// An enahncement for the upstream pass to support tiling reduction for MKmk
+// An enhancement for the upstream pass to support tiling reduction for MKmk
 // like cases(with multiple reduction iterators).
 FailureOr<linalg::ForallReductionTilingResult> tileReductionUsingForall(
     RewriterBase &b, PartialReductionOpInterface op,
diff --git a/src/dnnl/dnnl_graph_compiler.cpp b/src/dnnl/dnnl_graph_compiler.cpp
@@ -22,6 +22,7 @@
 
 #include "JsonParser.h"
 #include "gc/ExecutionEngine/Driver/Driver.h"
+#include "gc/Utils.h"
 #include "gc_version.h"
 
 #include "mlir/ExecutionEngine/MemRefUtils.h"
@@ -33,12 +34,6 @@
 
 #include "graph/backend/elyzor/include/dnnl_graph_compiler.h"
 
-#if defined _WIN32 || defined __CYGWIN__
-#define GC_DLL_EXPORT __declspec(dllexport)
-#else
-#define GC_DLL_EXPORT __attribute__((visibility("default")))
-#endif
-
 // dnnl_graph_compiler.h interface implementation.
 
 using namespace mlir;
diff --git a/src/gc-opt/gc-opt.cpp b/src/gc-opt/gc-opt.cpp
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_32x32.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_32x32.mlir
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_64x64.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_matmul_64x64.mlir
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir b/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir
diff --git a/test/mlir/test/gc/gpu-runner/mlp.mlir b/test/mlir/test/gc/gpu-runner/mlp.mlir

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,9 @@ gc_add_mlir_library(GcOpenclRuntime`
`4`	`4`	`SHARED`
`5`	`5`	`OpenCLRuntimeWrappers.cpp`
`6`	`6`
	`7`	`+ LINK_LIBS PUBLIC`
	`8`	`+ GcInterface`
	`9`	`+`
`7`	`10`	`EXCLUDE_FROM_LIBMLIR`
`8`	`11`	`)`
`9`	`12`