Skip to content

Commit 9a62f0b

Browse files
author
Xu, Xiaohui1
committed
Merge branch 'main' into xiaohui/vectorization
2 parents aac20a0 + ce6d1d3 commit 9a62f0b

File tree

21 files changed

+333
-135
lines changed

21 files changed

+333
-135
lines changed

include/gc/ExecutionEngine/CPURuntime/Microkernel/BrgemmInterface.h

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
#include <cmath>
1313

14+
#include "gc/Utils.h"
15+
1416
extern "C" {
1517
// Runtime interfaces
1618

@@ -26,24 +28,25 @@ extern "C" {
2628
* given in dnnl type value.
2729
* Output: A handle of dispatched kernel.
2830
*/
29-
int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K, int64_t LDA,
30-
int64_t LDB, int64_t LDC, int64_t stride_a,
31-
int64_t stride_b, float beta, int64_t dtypeA,
32-
int64_t dtypeB);
31+
GC_DLL_EXPORT int64_t dnnl_brgemm_dispatch(int64_t M, int64_t N, int64_t K,
32+
int64_t LDA, int64_t LDB,
33+
int64_t LDC, int64_t stride_a,
34+
int64_t stride_b, float beta,
35+
int64_t dtypeA, int64_t dtypeB);
3336

3437
/**
3538
* Config the AMX tile context for given kernel.
3639
* Inputs: A handle of dispatched kernel.
3740
* Output: None.
3841
*/
39-
void dnnl_brgemm_tileconfig(int64_t kernel);
42+
GC_DLL_EXPORT void dnnl_brgemm_tileconfig(int64_t kernel);
4043

4144
/**
4245
* Release the current AMX tile context.
4346
* Inputs: None.
4447
* Output: None.
4548
*/
46-
void dnnl_brgemm_tilerelease();
49+
GC_DLL_EXPORT void dnnl_brgemm_tilerelease();
4750

4851
/**
4952
* Execute the given kernel with given parameters.
@@ -54,9 +57,10 @@ void dnnl_brgemm_tilerelease();
5457
* num: Batch size of Brgemm.
5558
* Output: None.
5659
*/
57-
void dnnl_brgemm_execute(int64_t kernel, void *A, uint64_t A_offset, void *B,
58-
uint64_t B_offset, void *C, uint64_t C_offset,
59-
int num);
60+
GC_DLL_EXPORT void dnnl_brgemm_execute(int64_t kernel, void *A,
61+
uint64_t A_offset, void *B,
62+
uint64_t B_offset, void *C,
63+
uint64_t C_offset, int num);
6064
}
6165

6266
struct bf16_t {

include/gc/Transforms/Passes.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,10 @@ std::unique_ptr<Pass> createMergeAllocPass();
115115
void populateFrontendPasses(mlir::OpPassManager &);
116116
void populateCPUPipeline(mlir::OpPassManager &);
117117

118+
#ifdef GC_USE_IMEX
119+
void populateGPUPipeline(mlir::OpPassManager &);
120+
#endif
121+
118122
#define GEN_PASS_DECL
119123
#include "gc/Transforms/Passes.h.inc"
120124

include/gc/Utils.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
//===-- Utils.h - Common utility functions and macros -----------*- C++ -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// Include guard for gc/Utils.h.
#ifndef GC_UTILS_H
10+
#define GC_UTILS_H
11+
12+
// GC_DLL_EXPORT: attribute placed on declarations/definitions that must be
// visible outside the shared library they are built into.
//  - When _WIN32 or __CYGWIN__ is defined it expands to __declspec(dllexport).
//  - Otherwise it expands to __attribute__((visibility("default"))).
// NOTE(review): the #else branch assumes a compiler that understands the
// GNU-style __attribute__ syntax (GCC/Clang) - confirm for other toolchains.
#if defined _WIN32 || defined __CYGWIN__
13+
#define GC_DLL_EXPORT __declspec(dllexport)
14+
#else
15+
#define GC_DLL_EXPORT __attribute__((visibility("default")))
16+
#endif
17+
18+
// End of include guard GC_UTILS_H.
#endif

lib/gc/ExecutionEngine/CPURuntime/MemoryPool.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
#include <unistd.h>
2222
#endif
2323

24+
#include "gc/Utils.h"
25+
2426
#ifdef _MSC_VER
2527
#define __builtin_expect(EXP_, C) (EXP_)
2628
#endif
@@ -239,16 +241,18 @@ static thread_local FILOMemoryPool mainMemoryPool_{mainChunkSize};
239241
// if the current thread is a worker thread, use this pool
240242
static thread_local FILOMemoryPool threadMemoryPool_{threadlocalChunkSize};
241243

242-
extern "C" void *gcAlignedMalloc(size_t sz) noexcept {
244+
// Exported C-linkage allocation entry point.
// Forwards the requested size `sz` to mainMemoryPool_.alloc() and returns the
// pointer it produces. mainMemoryPool_ is a static thread_local FILOMemoryPool,
// so each calling thread allocates from its own pool instance.
// NOTE(review): the pool type name suggests first-in/last-out release order is
// expected by callers - confirm against FILOMemoryPool.
extern "C" GC_DLL_EXPORT void *gcAlignedMalloc(size_t sz) noexcept {
243245
return mainMemoryPool_.alloc(sz);
244246
}
245247

246-
extern "C" void gcAlignedFree(void *p) noexcept { mainMemoryPool_.dealloc(p); }
248+
// Exported C-linkage release entry point paired with gcAlignedMalloc.
// Hands the pointer `p` back to the calling thread's mainMemoryPool_ via
// dealloc(); nothing is returned.
extern "C" GC_DLL_EXPORT void gcAlignedFree(void *p) noexcept {
249+
mainMemoryPool_.dealloc(p);
250+
}
247251

248-
extern "C" void *gcThreadAlignedMalloc(size_t sz) noexcept {
252+
// Exported C-linkage allocation entry point backed by threadMemoryPool_
// (the static thread_local pool the file describes as the one to use when the
// current thread is a worker thread).
// Forwards `sz` to threadMemoryPool_.alloc() and returns the resulting pointer.
extern "C" GC_DLL_EXPORT void *gcThreadAlignedMalloc(size_t sz) noexcept {
249253
return threadMemoryPool_.alloc(sz);
250254
}
251255

252-
extern "C" void gcThreadAlignedFree(void *p) noexcept {
256+
// Exported C-linkage release entry point paired with gcThreadAlignedMalloc.
// Hands the pointer `p` back to the calling thread's threadMemoryPool_ via
// dealloc(); nothing is returned.
extern "C" GC_DLL_EXPORT void gcThreadAlignedFree(void *p) noexcept {
253257
threadMemoryPool_.dealloc(p);
254258
}

lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ gc_add_mlir_library(GcOpenclRuntime
44
SHARED
55
OpenCLRuntimeWrappers.cpp
66

7+
LINK_LIBS PUBLIC
8+
GcInterface
9+
710
EXCLUDE_FROM_LIBMLIR
811
)
912

lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,9 @@
1818
#include <stdexcept>
1919
#include <vector>
2020

21-
#ifdef _WIN32
22-
#define OCL_RUNTIME_EXPORT __declspec(dllexport)
23-
#else
24-
#define OCL_RUNTIME_EXPORT __attribute__((visibility("default")))
25-
#endif // _WIN32
21+
#include "gc/Utils.h"
22+
23+
#define OCL_RUNTIME_EXPORT GC_DLL_EXPORT
2624

2725
namespace {
2826

lib/gc/Transforms/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ gc_add_mlir_library(GcPasses
1313
MemRefToCPURuntime.cpp
1414
OneDNNGraphToLinalg.cpp
1515
Pipeline.cpp
16+
TileUsingInterfaceX.cpp
1617
IterativeTilingAndFusion.cpp
17-
TilingUsingInterfaceX.cpp
1818
VerifyTargetDescription.cpp
1919
DecomposeAggregatedOps.cpp
2020
DeepTileContractionOp.cpp

lib/gc/Transforms/GPU/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
gc_add_mlir_library(GcGpuPasses
22
LinalgToXeGPU.cpp
3+
Pipeline.cpp
34

45
DEPENDS
56
GraphCompilerPassIncGen
@@ -18,3 +19,7 @@ gc_add_mlir_library(GcGpuPasses
1819
GcUtilsIR
1920
)
2021

22+
include(imex)
23+
get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
24+
target_include_directories(GcGpuPasses PRIVATE ${IMEX_INCLUDES})
25+

lib/gc/Transforms/GPU/LinalgToXeGPU.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1405,6 +1405,17 @@ LogicalResult createMemoryFillKernel(linalg::LinalgOp linalgOp,
14051405
auto outputType = cast<ShapedType>(output.getType());
14061406
auto outputShape = outputType.getShape();
14071407

1408+
if (outputShape.size() != 2) {
1409+
return rewriter.notifyMatchFailure(
1410+
linalgOp, "Memory fill operation expects 2D output");
1411+
}
1412+
1413+
// Otherwise 'xegpu-to-vc' pass will fail to convert it to VC
1414+
if (outputShape[0] * outputShape[1] < 16) {
1415+
return rewriter.notifyMatchFailure(
1416+
linalgOp, "Memory fill operation is too small to be converted to xegpu");
1417+
}
1418+
14081419
// Extract SIMD sized sub-tiles
14091420
int maxSizeSIMD = 256;
14101421
int64_t subTileCols = outputShape[1];

lib/gc/Transforms/GPU/Pipeline.cpp

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
//===- Pipeline.cpp - Graph Compiler GPU pipeline ---------------*- C++ -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "mlir/Conversion/Passes.h"
10+
#include "mlir/Dialect/Arith/Transforms/Passes.h"
11+
#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
12+
#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
13+
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
14+
#include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
15+
#include "mlir/Dialect/Linalg/Passes.h"
16+
#include "mlir/Dialect/Math/Transforms/Passes.h"
17+
#include "mlir/Dialect/MemRef/IR/MemRef.h"
18+
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
19+
#include "mlir/Dialect/SCF/IR/SCF.h"
20+
#include "mlir/Dialect/Tensor/IR/Tensor.h"
21+
#include "mlir/IR/DialectRegistry.h"
22+
#include "mlir/InitAllPasses.h"
23+
#include "mlir/Pass/PassManager.h"
24+
#include "mlir/Support/LogicalResult.h"
25+
#include "mlir/Transforms/Passes.h"
26+
#include <iostream>
27+
28+
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
29+
#include "mlir/Dialect/GPU/Transforms/Passes.h"
30+
#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
31+
32+
#include <imex/Conversion/Passes.h>
33+
#include <imex/Transforms/Passes.h>
34+
35+
#include <string>
36+
37+
#include "gc/Transforms/Passes.h"
38+
39+
namespace mlir::gc {
40+
41+
// Appends the Graph Compiler GPU lowering pipeline to `pm`.
//
// The passes are added in the exact order listed below; the pass manager runs
// them in that order, so the sequence itself is the behavior of this function.
// Passes added with addNestedPass<T> are anchored on ops of type T
// (func::FuncOp, gpu::GPUModuleOp or spirv::ModuleOp); the rest are added
// directly on `pm`.
//
// Parameter:
//   pm - pass manager to which the pipeline is appended (modified in place).
void populateGPUPipeline(mlir::OpPassManager &pm) {
42+
// Tiling and fusion inside each function.
pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());
43+
44+
// Handle empty tensors before bufferization.
pm.addPass(bufferization::createEmptyTensorEliminationPass());
45+
pm.addPass(bufferization::createEmptyTensorToAllocTensorPass());
46+
47+
// One-shot bufferization. Function boundaries are bufferized as well, and
// identity layout maps are requested for function boundary types.
bufferization::OneShotBufferizationOptions options;
48+
options.bufferizeFunctionBoundaries = true;
49+
options.setFunctionBoundaryTypeConversion(
50+
bufferization::LayoutMapOption::IdentityLayoutMap);
51+
pm.addPass(bufferization::createOneShotBufferizePass(options));
52+
53+
// Post-bufferization cleanup followed by the buffer deallocation passes,
// with canonicalization / CSE interleaved.
pm.addPass(bufferization::createDropEquivalentBufferResultsPass());
54+
pm.addNestedPass<func::FuncOp>(
55+
bufferization::createFinalizingBufferizePass());
56+
pm.addPass(createCanonicalizerPass());
57+
pm.addPass(createCSEPass());
58+
pm.addPass(bufferization::createDropEquivalentBufferResultsPass());
59+
pm.addPass(memref::createExpandReallocPass());
60+
pm.addPass(createCanonicalizerPass());
61+
pm.addPass(bufferization::createOwnershipBasedBufferDeallocationPass());
62+
pm.addPass(createCanonicalizerPass());
63+
pm.addPass(bufferization::createBufferDeallocationSimplificationPass());
64+
pm.addPass(bufferization::createLowerDeallocationsPass());
65+
pm.addPass(createCSEPass());
66+
pm.addPass(createCanonicalizerPass());
67+
pm.addPass(createBufferizationToMemRefPass());
68+
69+
// Convert forall loops to parallel loops, then lower linalg ops to XeGPU
// using the fixed options spelled out in the initializer below
// (kTile = 16, stages = 1, dpasTiles = {8, 16, 16}).
pm.addNestedPass<func::FuncOp>(createForallToParallelLoopPass());
70+
pm.addNestedPass<func::FuncOp>(createLinalgToXeGPU(
71+
{/*kTile=*/16, /*stages=*/1, /*dpasTiles=*/{8, 16, 16}}));
72+
73+
// Lower linalg ops to loops (presumably those left over after the XeGPU
// lowering above - confirm), fold alias ops, then map parallel loops onto
// the GPU.
pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
74+
pm.addPass(xegpu::createXeGPUFoldAliasOps());
75+
pm.addPass(memref::createFoldMemRefAliasOpsPass());
76+
pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
77+
pm.addNestedPass<func::FuncOp>(createParallelLoopToGpuPass());
78+
79+
// GPU allocation insertion and kernel outlining, followed by the IMEX and
// SPIR-V conversion passes. Both IMEX passes that take a client/runtime
// string are given "opencl".
pm.addNestedPass<func::FuncOp>(imex::createInsertGPUAllocsPass("opencl"));
80+
pm.addPass(createGpuKernelOutliningPass());
81+
pm.addPass(createCanonicalizerPass());
82+
pm.addPass(imex::createSetSPIRVCapabilitiesPass());
83+
pm.addNestedPass<gpu::GPUModuleOp>(
84+
imex::createSetSPIRVAbiAttributePass("opencl"));
85+
pm.addPass(createLowerAffinePass());
86+
pm.addPass(imex::createVectorLinearizePass());
87+
pm.addNestedPass<gpu::GPUModuleOp>(imex::createConvertXeGPUToVCPass());
88+
pm.addPass(createReconcileUnrealizedCastsPass());
89+
pm.addPass(imex::createBF16ToGPUPass());
90+
pm.addNestedPass<gpu::GPUModuleOp>(createConvertFuncToSPIRVPass());
91+
pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSPIRVPass());
92+
pm.addPass(imex::createConvertGPUXToSPIRVPass());
93+
pm.addNestedPass<spirv::ModuleOp>(spirv::createSPIRVLowerABIAttributesPass());
94+
pm.addNestedPass<spirv::ModuleOp>(spirv::createSPIRVUpdateVCEPass());
95+
pm.addNestedPass<func::FuncOp>(LLVM::createRequestCWrappersPass());
96+
pm.addPass(imex::createSerializeSPIRVPass());
97+
// Remaining conversions down to the LLVM dialect (vector, SCF, control flow,
// index, arith, func, math, GPUX, memref).
pm.addPass(createConvertVectorToSCFPass());
98+
pm.addPass(imex::createConvertGPUToGPUXPass());
99+
pm.addPass(createConvertSCFToCFPass());
100+
pm.addPass(createConvertControlFlowToLLVMPass());
101+
pm.addPass(createConvertVectorToLLVMPass());
102+
pm.addPass(createConvertIndexToLLVMPass());
103+
pm.addPass(createArithToLLVMConversionPass());
104+
pm.addPass(createConvertFuncToLLVMPass());
105+
pm.addPass(createConvertMathToLLVMPass());
106+
pm.addPass(imex::createConvertGPUXToLLVMPass());
107+
// NOTE(review): the index-to-LLVM conversion is added a second time here
// (it is also added a few passes above) - presumably to pick up index ops
// produced by the intervening conversions; confirm it is intentional.
pm.addPass(createConvertIndexToLLVMPass());
108+
pm.addPass(memref::createExpandStridedMetadataPass());
109+
// NOTE(review): lower-affine is likewise added for a second time here -
// presumably for affine ops produced by expand-strided-metadata; confirm.
pm.addPass(createLowerAffinePass());
110+
pm.addPass(createFinalizeMemRefToLLVMConversionPass());
111+
pm.addPass(createReconcileUnrealizedCastsPass());
112+
}
113+
114+
// Registers populateGPUPipeline as a named pass pipeline:
//   name:        "gc-gpu-pipeline"
//   description: "The GPU pipeline for Graph Compiler with IMEX"
// The PassPipelineRegistration<> object is a temporary; presumably the
// registration is performed by its constructor, so nothing needs to be kept -
// confirm against the MLIR pass infrastructure documentation.
void registerGPUPipeline() {
115+
PassPipelineRegistration<>("gc-gpu-pipeline",
116+
"The GPU pipeline for Graph Compiler with IMEX",
117+
populateGPUPipeline);
118+
}
119+
120+
} // namespace mlir::gc

0 commit comments

Comments
 (0)