Skip to content

Commit 028d4f4

Browse files
author
Xu, Xiaohui1
committed
Merge branch 'main' into xiaohui/vectorization
2 parents e8d7612 + 27a7da6 commit 028d4f4

File tree

9 files changed

+220
-143
lines changed

9 files changed

+220
-143
lines changed

cmake/imex.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ if (NOT DEFINED IMEX_INCLUDES)
1414

1515
# TODO: Change to main https://github.com/intel/mlir-extensions when all the
1616
# required functionality is merged.
17-
gc_fetch_content(imex "${IMEX_HASH}" https://github.com/intel/mlir-extensions
17+
set(IMEX_URL https://github.com/intel/mlir-extensions)
18+
gc_fetch_content(imex "${IMEX_HASH}" "${IMEX_URL}"
1819
SET IMEX_CHECK_LLVM_VERSION=ON IMEX_ENABLE_L0_RUNTIME=0
1920
)
2021

cmake/onednn_lite_config.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ if(${DNNL_CPU_RUNTIME} STREQUAL "OMP")
2828
endif()
2929

3030
if(${DNNL_CPU_RUNTIME} STREQUAL "TBB")
31-
include("${DNNL_PATH}/cmake/TBB.cmake")
31+
find_package(TBB REQUIRED)
3232
endif()
3333

3434
########## copied from main cmake file of DNNL
File renamed without changes.

include/gc/Analysis/MatmulConfigAnalysis.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ struct MatmulConfig {
3333
uint32_t innerMostMBlock, innerMostNBlock, innerMostKBlock;
3434
};
3535

36+
bool validateConfig(const MatmulConfig &cfg, ArrayRef<uint32_t> shape,
37+
bool allowIndivisibleInnerblock, bool isVNNIMM2D);
38+
3639
enum DimType { Batch, M, N, K };
3740

3841
// Extract the index of the given DimType in the DimType list

include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -149,33 +149,27 @@ static constexpr auto ZERO_PTR = const_cast<int64_t *>(&ZERO);
149149
struct OclContext {
150150
const OclRuntime &runtime;
151151
const cl_command_queue queue;
152-
// Preserve the execution order. This is required in case of out-of-order
153-
// execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE). When the execution
154-
// is completed, the 'lastEvent' field contains the event of the last enqueued
155-
// command. If this field is false, 'waitList' is ignored.
156-
const bool preserveOrder;
152+
// Create a 'cl_event' object for each enqueued command, which can be used to
153+
// query or wait for the command to complete. This is required in case of
154+
// out-of-order execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE), but can
155+
// also be used to get the last event. When the execution is completed, the
156+
// 'lastEvent' field contains the event of the last enqueued command. If
157+
// 'createEvents' is false, 'waitList' is ignored.
158+
const bool createEvents;
157159
cl_uint waitListLen;
158160
cl_event *waitList;
159161
cl_event lastEvent;
160162

161-
explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
162-
cl_uint waitListLen = 0, cl_event *waitList = nullptr)
163-
: OclContext(runtime, queue, OclRuntime::isOutOfOrder(queue), waitListLen,
164-
waitList) {}
163+
explicit OclContext(const OclRuntime &runtime, cl_command_queue queue)
164+
: OclContext(runtime, queue, OclRuntime::isOutOfOrder(queue)) {}
165165

166166
explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
167-
bool preserveOrder, cl_uint waitListLen,
168-
cl_event *waitList)
169-
: runtime(runtime), queue(queue), preserveOrder(preserveOrder),
170-
waitListLen(preserveOrder ? waitListLen : 0),
171-
waitList(preserveOrder ? waitList : nullptr), lastEvent(nullptr),
172-
clPtrs(nullptr) {
173-
assert(!OclRuntime::isOutOfOrder(queue) || preserveOrder);
174-
assert(preserveOrder || (waitListLen == 0 && waitList == nullptr));
175-
}
167+
bool createEvents, cl_uint waitListLen = 0,
168+
cl_event *waitList = nullptr);
176169

177170
OclContext(const OclContext &) = delete;
178171
OclContext &operator=(const OclContext &) = delete;
172+
~OclContext();
179173

180174
[[nodiscard]] llvm::Expected<bool> finish();
181175

@@ -186,16 +180,7 @@ struct OclContext {
186180
template <unsigned N> friend struct StaticExecutor;
187181
std::unordered_set<void *> *clPtrs;
188182

189-
void setLastEvent(cl_event event) {
190-
lastEvent = event;
191-
if (event) {
192-
waitListLen = 1;
193-
waitList = &lastEvent;
194-
} else {
195-
waitListLen = 0;
196-
waitList = nullptr;
197-
}
198-
}
183+
void setLastEvent(cl_event event);
199184
};
200185

201186
struct OclModule {

lib/gc/Analysis/MatmulConfigAnalysis.cpp

Lines changed: 58 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,29 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &ss,
3737
return ss;
3838
}
3939

40-
bool validateConfig(const MatmulConfig &cfg) {
40+
bool validateConfig(const MatmulConfig &cfg, ArrayRef<uint32_t> shape,
41+
bool allowIndivisibleInnerblock, bool isVNNIMM2D) {
4142
if (cfg.MThreads <= 0 || cfg.NThreads <= 0 || cfg.KThreads <= 0 ||
4243
cfg.MBlock <= 0 || cfg.NBlock <= 0 || cfg.KBlock <= 0 ||
4344
cfg.innerMostMBlock <= 0 || cfg.innerMostNBlock <= 0 ||
4445
cfg.innerMostKBlock <= 0)
4546
return false;
4647
if (cfg.MBlock % cfg.innerMostMBlock != 0 ||
47-
cfg.NBlock % cfg.innerMostNBlock != 0 ||
48-
cfg.KBlock % cfg.innerMostKBlock != 0)
48+
(shape[0] % cfg.innerMostMBlock != 0 && !allowIndivisibleInnerblock))
49+
return false;
50+
if (cfg.NBlock % cfg.innerMostNBlock != 0 ||
51+
((shape[1] % cfg.innerMostNBlock != 0) && !allowIndivisibleInnerblock) ||
52+
(shape[1] % cfg.NThreads != 0 && isVNNIMM2D &&
53+
cfg.NBlock != cfg.innerMostNBlock))
54+
return false;
55+
// Require K % KBlock == 0 because brgemm with a dynamic batch size is not yet supported
56+
if (cfg.KBlock % cfg.innerMostKBlock != 0 ||
57+
((shape[2] / cfg.KThreads % cfg.KBlock != 0 ||
58+
shape[2] / cfg.KThreads % cfg.innerMostKBlock != 0) &&
59+
!allowIndivisibleInnerblock))
60+
return false;
61+
// KThreads will not shrink automatically
62+
if (llvm::divideCeil(shape[2], cfg.KBlock) < cfg.KThreads)
4963
return false;
5064
return true;
5165
}
@@ -179,21 +193,22 @@ double dynamicBufferizationCost(linalg::LinalgOp &linalgOp,
179193
ArrayRef<uint32_t> shape,
180194
const MatmulConfig &config,
181195
CPUTargetDescriptionAnalysis &sysDesc) {
182-
assert(validateConfig(config) && "config is invalid");
183196
assert(shape.size() >= 3 && "shape.size() should >= 3");
184197
uint32_t M = shape[0], N = shape[1];
185198
double cost = 0;
186199
uint32_t MNumBlockPerThread =
187200
llvm::divideCeil(M / config.innerMostMBlock, config.MThreads);
188201
uint32_t MNumInnerBlockPerBlock =
189202
llvm::divideCeil(config.MBlock, config.innerMostMBlock);
203+
assert(MNumInnerBlockPerBlock > 0 && "Invalid MNumInnerBlockPerBlock.");
190204
uint32_t MCost = MNumBlockPerThread % MNumInnerBlockPerBlock != 0 ||
191205
(M / config.innerMostNBlock % config.MThreads != 0 &&
192206
config.MBlock != config.innerMostMBlock);
193207
uint32_t NNumBlockPerThread =
194208
llvm::divideCeil(N / config.innerMostNBlock, config.NThreads);
195209
uint32_t NNumInnerBlockPerBlock =
196210
llvm::divideCeil(config.NBlock, config.innerMostNBlock);
211+
assert(NNumInnerBlockPerBlock > 0 && "Invalid NNumInnerBlockPerBlock.");
197212
uint32_t NCost = NNumBlockPerThread % NNumInnerBlockPerBlock != 0 ||
198213
(N / config.innerMostNBlock % config.NThreads != 0 &&
199214
config.NBlock != config.innerMostNBlock);
@@ -312,39 +327,28 @@ prepareConfigCandidates(Operation *root, CPUTargetDescriptionAnalysis &sysDesc,
312327
KBlockCandidates = innerMostKBlockCandidates;
313328
}
314329

315-
// TODO: improve via multi threading or add more constraints to restrict the
316-
// candidate size
330+
bool isVNNIMM2D =
331+
linalgx::isGenericPackedMatmulOp(root, linalgx::PackingType::VNNI_MM2D);
332+
// TODO: improve via multi threading or add more constraints to restrict
333+
// the candidate size
317334
for (uint32_t MThreads : MThreadsCandidates) {
318335
for (uint32_t NThreads : NThreadsCandidates) {
319336
for (uint32_t KThreads : KThreadsCandidates) {
320337
if (!validateThreads({MThreads, NThreads, KThreads}, sysDesc))
321338
continue;
322339
for (uint32_t MBlock : MBlockCandidates) {
323340
for (uint32_t innerMostMBlock : innerMostMBlockCandidates) {
324-
if (MBlock % innerMostMBlock != 0 ||
325-
(shape[0] % innerMostMBlock != 0 &&
326-
!allowIndivisibleInnerblock))
327-
continue;
328341
for (uint32_t NBlock : NBlockCandidates) {
329342
for (uint32_t innerMostNBlock : innerMostNBlockCandidates) {
330-
if (NBlock % innerMostNBlock != 0 ||
331-
(shape[1] % innerMostNBlock != 0 &&
332-
!allowIndivisibleInnerblock))
333-
continue;
334343
for (uint32_t KBlock : KBlockCandidates) {
335344
for (uint32_t innerMostKBlock : innerMostKBlockCandidates) {
336-
// Require K % KBlock == 0 as dynamic bs is not supported
337-
// now
338-
if (KBlock % innerMostKBlock != 0 ||
339-
((shape[2] / KThreads % KBlock != 0 ||
340-
shape[2] / KThreads % innerMostKBlock != 0) &&
341-
!allowIndivisibleInnerblock))
342-
continue;
343345
MatmulConfig config{
344346
MThreads, NThreads, KThreads,
345347
MBlock, NBlock, KBlock,
346348
innerMostMBlock, innerMostNBlock, innerMostKBlock};
347-
configs.push_back(config);
349+
if (validateConfig(config, shape,
350+
allowIndivisibleInnerblock, isVNNIMM2D))
351+
configs.push_back(config);
348352
}
349353
}
350354
}
@@ -393,12 +397,28 @@ bool readConfigFromAttrs(MatmulConfig &config, ArrayRef<NamedAttribute> attrs) {
393397
cfgItemCnt++;
394398
}
395399
}
396-
if (validateConfig(config)) {
397-
return cfgItemCnt == 9;
398-
} else {
399-
LLVM_DEBUG(llvm::dbgs() << "The predefined config is invalid\n");
400+
return cfgItemCnt == 9;
401+
}
402+
403+
bool readAndValidateConfig(MatmulConfig &config,
404+
const linalg::LinalgOp &linalgOp,
405+
ArrayRef<uint32_t> shape,
406+
bool allowIndivisibleInnerBlock) {
407+
SmallVector<NamedAttribute> attrs(linalgOp->getAttrs());
408+
bool fullConfig = readConfigFromAttrs(config, attrs);
409+
if (!fullConfig) {
410+
LLVM_DEBUG(llvm::dbgs() << "Missing fields in predefined config.\n");
400411
return false;
401412
}
413+
bool validConfig =
414+
validateConfig(config, shape, allowIndivisibleInnerBlock,
415+
linalgx::isGenericPackedMatmulOp(
416+
linalgOp, linalgx::PackingType::VNNI_MM2D));
417+
if (!validConfig) {
418+
LLVM_DEBUG(llvm::dbgs() << "Invalid predefined config.\n");
419+
return false;
420+
}
421+
return true;
402422
}
403423

404424
// Analyze the workload and system description to generate the default config
@@ -482,12 +502,15 @@ MatmulConfig MatmulConfigAnalysis::getConfig() {
482502
<< "M: " << M << ", N: " << N << ", K: " << K << "\n");
483503

484504
// try to read the config from the attributes
485-
SmallVector<NamedAttribute> attrs(linalgOp->getAttrs());
486-
bool hasPredefinedConfig = readConfigFromAttrs(config, attrs);
505+
bool hasValidPredefinedConfig = readAndValidateConfig(
506+
config, linalgOp, SmallVector<uint32_t>{M, N, K},
507+
allowIndivisibleInnerBlock);
487508

488509
// if there is a given config, skip the cost model
489-
if (!hasPredefinedConfig) {
490-
LLVM_DEBUG(llvm::dbgs() << "No predefined config\n");
510+
if (!hasValidPredefinedConfig) {
511+
LLVM_DEBUG(
512+
llvm::dbgs()
513+
<< "No valid predefined config. Setting with default config.\n");
491514
// TODO: Could add a weight or priority for cost model
492515
SmallVector<std::tuple<CostModelFn, std::string, double>>
493516
costModelList = {
@@ -511,6 +534,11 @@ MatmulConfig MatmulConfigAnalysis::getConfig() {
511534
}
512535
if (!configCandidates.empty())
513536
config = configCandidates[0];
537+
538+
assert(validateConfig(config, shape, allowIndivisibleInnerBlock,
539+
linalgx::isGenericPackedMatmulOp(
540+
root, linalgx::PackingType::VNNI_MM2D)) &&
541+
"config is invalid");
514542
}
515543

516544
LLVM_DEBUG(llvm::dbgs()
@@ -520,7 +548,6 @@ MatmulConfig MatmulConfigAnalysis::getConfig() {
520548
hasConfig = true;
521549
}
522550

523-
assert(validateConfig(config) && "config is invalid");
524551
return config;
525552
}
526553
} // namespace gc

0 commit comments

Comments
 (0)