Skip to content

Commit d0c9fb1

Browse files
[mlir][Linalg] Improve codegen strategy
This revision improves the usage of the codegen strategy by adding a few flags that make it easier to control for the CLI. Usage of ModuleOp is replaced by FuncOp as this created issues in multi-threaded mode. A simple benchmarking capability is added for linalg.matmul as well as linalg.matmul_column_major. This latter op is also added to linalg. Now obsolete linalg integration tests that also take too long are deleted. Correctness checks are still missing at this point. Differential revision: https://reviews.llvm.org/D95531
1 parent 279e7ea commit d0c9fb1

16 files changed

+377
-404
lines changed

mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@ def matmul(A: f32(M, K), B: f32(K, N)) -> (C: f32(M, N)) {
33
C(m, n) = std_addf<k>(std_mulf(A(m, k), B(k, n)));
44
}
55

6+
// Column-major matmul: C(n, m) = sum_k A(k, m) * B(n, k). All three operands
// store their logical matrix transposed relative to the row-major `matmul`
// above, so the same (m, n, k) iteration space indexes them as (k, m),
// (n, k) and (n, m).
ods_def<MatmulColumnMajorOp>:
7+
def matmul_column_major(A: f32(K, M), B: f32(N, K)) -> (C: f32(N, M)) {
8+
C(n, m) = std_addf<k>(std_mulf(A(k, m), B(n, k)));
9+
}
10+
611
ods_def<MatvecOp>:
712
def matvec(A: f32(M, N), y: f32(N)) -> (x: f32(M)) {
813
x(m) = std_addf<n>(std_mulf(A(m, n), y(n)));

mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,10 @@ def CopyOp : LinalgStructured_Op<"copy", [CopyOpInterface]> {
143143
}];
144144
let verifier = [{ return ::verify(*this); }];
145145

146+
let assemblyFormat = [{
147+
`(` operands `)` attr-dict `:` type(operands)
148+
}];
149+
146150
let hasFolder = 1;
147151
let hasCanonicalizer = 1;
148152
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// Benchmark + smoke test for row-major linalg.matmul driven through the
// test codegen strategy (register tiling + vectorization), lowered to LLVM
// and executed with mlir-cpu-runner. Sizes are substituted via sed so the
// same file can be re-run at different problem sizes.
// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \

// RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm | \
// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
// Activate to dump assembly
// R_UN:   -dump-object-file -object-filename=/tmp/a.o \
// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext | \
// Use tee to both print to stderr and FileCheck
// RUN: tee -a /dev/stderr | FileCheck %s

!row_major_A = type memref<${M}x${K}xf32>
!row_major_B = type memref<${K}x${N}xf32>
!row_major_C = type memref<${M}x${N}xf32>

// C += A * B, written in place into %c.
func @matmul(%a: !row_major_A, %b: !row_major_B, %c: !row_major_C)
// TODO: activate manually for now.
// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
{
  linalg.matmul ins(%a, %b : !row_major_A, !row_major_B)
    outs(%c: !row_major_C)
  return
}

// Prints achieved flops/s: 2*M*N*K flops per iteration, times %iters,
// divided by the measured wall time %total_time (seconds).
func @print_perf(%iters: index, %total_time: f64) {
  %c2 = constant 2 : index
  %cM = constant ${M} : index
  %cN = constant ${N} : index
  %cK = constant ${K} : index

  %mn = muli %cM, %cN : index
  %mnk = muli %mn, %cK : index

  // 2*M*N*K.
  %flops_per_iter = muli %c2, %mnk : index
  %flops = muli %iters, %flops_per_iter : index
  %flops_i64 = index_cast %flops : index to i64
  %flops_f = sitofp %flops_i64 : i64 to f64
  %flops_per_s = divf %flops_f, %total_time : f64
  vector.print %flops_per_s : f64

  return
}

func @main() {
  %f0 = constant 0.0 : f32
  %f1 = constant 1.0 : f32

  %A = alloc() : !row_major_A
  %B = alloc() : !row_major_B
  %C = alloc() : !row_major_C

  linalg.fill(%A, %f1) : !row_major_A, f32
  linalg.fill(%B, %f1) : !row_major_B, f32
  linalg.fill(%C, %f0) : !row_major_C, f32

  %c0 = constant 0: index
  %c1 = constant 1: index
  %iters = constant ${ITERS}: index

  /// Run and dump performance for matmul.
  /// Preheating run:
  scf.for %arg0 = %c0 to %iters step %c1 {
    linalg.fill(%C, %f0) : !row_major_C, f32
    call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
  }
  %t_start_matmul = call @rtclock() : () -> f64
  scf.for %arg0 = %c0 to %iters step %c1 {
    // linalg.matmul writes %C in place, need to reset it to zero every time.
    // This accounts for about a 10-15% perf hit on small sizes.
    // Once linalg on tensors is ready, fusing the fill at the register level
    // will be easy.
    linalg.fill(%C, %f0) : !row_major_C, f32
    call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
  }
  %t_end_matmul = call @rtclock() : () -> f64
  %tmatmul = subf %t_end_matmul, %t_start_matmul: f64
  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()

  // With A and B filled with 1.0, every element of C equals K (64 here).
  %res = load %C[%c0, %c0]: !row_major_C
  // CHECK: 64
  vector.print %res: f32

  dealloc %A : !row_major_A
  dealloc %B : !row_major_B
  dealloc %C : !row_major_C

  return
}

func private @rtclock() -> f64

// TODO: init with random, run and check output.
// func private @fill_random_f32(memref<*xf32>)
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// Benchmark + smoke test for linalg.matmul_column_major driven through the
// test codegen strategy (register tiling + vectorization), lowered to LLVM
// and executed with mlir-cpu-runner. Sizes are substituted via sed.
// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \

// TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed.
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.copy register-tile-sizes=4,16 vectorize" | \

// RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm | \
// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
// Activate to dump assembly
// R_UN:   -dump-object-file -object-filename=/tmp/a.o \
// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext | \
// Use tee to both print to stderr and FileCheck
// RUN: tee -a /dev/stderr | FileCheck %s

!row_major_A = type memref<${M}x${K}xf32>
!row_major_B = type memref<${K}x${N}xf32>
!row_major_C = type memref<${M}x${N}xf32>
!column_major_A = type memref<${K}x${M}xf32>
!column_major_B = type memref<${N}x${K}xf32>
!column_major_C = type memref<${N}x${M}xf32>

// C += A * B on column-major operands, written in place into %c.
func @matmul_column_major(%a: !column_major_A, %b: !column_major_B, %c: !column_major_C)
// TODO: activate manually for now.
// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
{
  linalg.matmul_column_major ins(%a, %b : !column_major_A, !column_major_B)
    outs(%c: !column_major_C)
  return
}

// Prints achieved flops/s: 2*M*N*K flops per iteration, times %iters,
// divided by the measured wall time %total_time (seconds).
func @print_perf(%iters: index, %total_time: f64) {
  %c2 = constant 2 : index
  %cM = constant ${M} : index
  %cN = constant ${N} : index
  %cK = constant ${K} : index

  %mn = muli %cM, %cN : index
  %mnk = muli %mn, %cK : index

  // 2*M*N*K.
  %flops_per_iter = muli %c2, %mnk : index
  %flops = muli %iters, %flops_per_iter : index
  %flops_i64 = index_cast %flops : index to i64
  %flops_f = sitofp %flops_i64 : i64 to f64
  %flops_per_s = divf %flops_f, %total_time : f64
  vector.print %flops_per_s : f64

  return
}

func @main() {
  %f0 = constant 0.0 : f32
  %f1 = constant 1.0 : f32

  %cA = alloc() : !column_major_A
  %cB = alloc() : !column_major_B
  %cC = alloc() : !column_major_C

  linalg.fill(%cA, %f1) : !column_major_A, f32
  linalg.fill(%cB, %f1) : !column_major_B, f32
  linalg.fill(%cC, %f0) : !column_major_C, f32

  %c0 = constant 0: index
  %c1 = constant 1: index
  %iters = constant ${ITERS}: index

  /// Run and dump performance for matmul_column_major.
  %t_start_matmul_column_major = call @rtclock() : () -> f64
  scf.for %arg0 = %c0 to %iters step %c1 {
    // linalg.matmul writes %C in place, need to reset it to zero every time.
    // This accounts for about a 10-15% perf hit on small sizes.
    // Once linalg on tensors is ready, fusing the fill at the register level
    // will be easy.
    linalg.fill(%cC, %f0) : !column_major_C, f32
    call @matmul_column_major(%cA, %cB, %cC) : (!column_major_A, !column_major_B, !column_major_C) -> ()
  }
  %t_end_matmul_column_major = call @rtclock() : () -> f64
  %tmatmul_column_major = subf %t_end_matmul_column_major, %t_start_matmul_column_major: f64
  call @print_perf(%iters, %tmatmul_column_major) : (index, f64) -> ()

  // With A and B filled with 1.0, every element of C equals K (64 here).
  %res = load %cC[%c0, %c0]: !column_major_C
  // CHECK: 64
  vector.print %res: f32

  dealloc %cA : !column_major_A
  dealloc %cB : !column_major_B
  dealloc %cC : !column_major_C

  return
}

func private @rtclock() -> f64

// TODO: init with random, run and check output.
// func private @fill_random_f32(memref<*xf32>)
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// Benchmark + smoke test for computing a column-major matmul by transposing
// the operands into row-major buffers (linalg.copy with a permutation map),
// running row-major linalg.matmul, and transposing the result back.
// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \

// TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed.
// R_UN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.copy register-tile-sizes=4,16 vectorize" | \

// RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm | \
// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
// Activate to dump assembly
// R_UN:   -dump-object-file -object-filename=/tmp/a.o \
// RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext | \
// Use tee to both print to stderr and FileCheck
// RUN: tee -a /dev/stderr | FileCheck %s

!row_major_A = type memref<${M}x${K}xf32>
!row_major_B = type memref<${K}x${N}xf32>
!row_major_C = type memref<${M}x${N}xf32>
!column_major_A = type memref<${K}x${M}xf32>
!column_major_B = type memref<${N}x${K}xf32>
!column_major_C = type memref<${N}x${M}xf32>

// Transposes %ca/%cb into the row-major scratch buffers %a/%b, runs a
// row-major matmul into %c, then transposes %c back into %cc.
func @matmul_column_major_as_row_major(
  %ca: !column_major_A, %cb: !column_major_B, %cc: !column_major_C,
  %a: !row_major_A, %b: !row_major_B, %c: !row_major_C)
// TODO: activate manually for now.
// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
{
  linalg.copy(%ca, %a) {inputPermutation = affine_map<(i, j) -> (j, i)> } : !column_major_A, !row_major_A
  linalg.copy(%cb, %b) {inputPermutation = affine_map<(i, j) -> (j, i)> } : !column_major_B, !row_major_B
  linalg.matmul ins(%a, %b : !row_major_A, !row_major_B)
    outs(%c: !row_major_C)
  linalg.copy(%c, %cc) {inputPermutation = affine_map<(i, j) -> (j, i)> } : !row_major_C, !column_major_C
  return
}

// Prints achieved flops/s: 2*M*N*K flops per iteration, times %iters,
// divided by the measured wall time %total_time (seconds).
func @print_perf(%iters: index, %total_time: f64) {
  %c2 = constant 2 : index
  %cM = constant ${M} : index
  %cN = constant ${N} : index
  %cK = constant ${K} : index

  %mn = muli %cM, %cN : index
  %mnk = muli %mn, %cK : index

  // 2*M*N*K.
  %flops_per_iter = muli %c2, %mnk : index
  %flops = muli %iters, %flops_per_iter : index
  %flops_i64 = index_cast %flops : index to i64
  %flops_f = sitofp %flops_i64 : i64 to f64
  %flops_per_s = divf %flops_f, %total_time : f64
  vector.print %flops_per_s : f64

  return
}

func @main() {
  %f0 = constant 0.0 : f32
  %f1 = constant 1.0 : f32

  %cA = alloc() : !column_major_A
  %cB = alloc() : !column_major_B
  %cC = alloc() : !column_major_C

  linalg.fill(%cA, %f1) : !column_major_A, f32
  linalg.fill(%cB, %f1) : !column_major_B, f32
  linalg.fill(%cC, %f0) : !column_major_C, f32

  %c0 = constant 0: index
  %c1 = constant 1: index
  %iters = constant ${ITERS}: index

  /// Run and dump performance for matmul_column_major as a row-major
  %A = alloc() : !row_major_A
  %B = alloc() : !row_major_B
  %C = alloc() : !row_major_C
  %t_start_matmul_column_major_as_row_major = call @rtclock() : () -> f64
  scf.for %arg0 = %c0 to %iters step %c1 {
    // linalg.matmul writes %C in place, need to reset it to zero every time.
    // This accounts for about a 10-15% perf hit on small sizes.
    // Once linalg on tensors is ready, fusing the fill at the register level
    // will be easy.
    linalg.fill(%C, %f0) : !row_major_C, f32
    call @matmul_column_major_as_row_major(%cA, %cB, %cC, %A, %B, %C) :
      (!column_major_A, !column_major_B, !column_major_C,
       !row_major_A, !row_major_B, !row_major_C) -> ()
  }
  %t_end_matmul_column_major_as_row_major = call @rtclock() : () -> f64
  %tmatmul_column_major_as_row_major = subf %t_end_matmul_column_major_as_row_major, %t_start_matmul_column_major_as_row_major: f64
  call @print_perf(%iters, %tmatmul_column_major_as_row_major) : (index, f64) -> ()

  // With A and B filled with 1.0, every element of both C buffers equals
  // K (64 here).
  %res = load %cC[%c0, %c0]: !column_major_C
  // CHECK: 64
  vector.print %res: f32
  %res2 = load %C[%c0, %c0]: !row_major_C
  // CHECK: 64
  vector.print %res2: f32

  dealloc %A : !row_major_A
  dealloc %B : !row_major_B
  dealloc %C : !row_major_C

  dealloc %cA : !column_major_A
  dealloc %cB : !column_major_B
  dealloc %cC : !column_major_C

  return
}

func private @rtclock() -> f64

// TODO: init with random, run and check output.
// func private @fill_random_f32(memref<*xf32>)

mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,11 @@ void mlir::linalg::CodegenStrategy::transform(FuncOp func) const {
5151
// Some of these may be too aggressive as a stage 3 that is applied on each
5252
// stage 1 application and may have to be split out to post staged patterns
5353
// application (in which case they could just be passes, TBD).
54-
PassManager pm(op->getContext());
55-
pm.addPass(createLoopInvariantCodeMotionPass());
56-
if (failed(pm.run(op->getParentOfType<ModuleOp>())))
57-
llvm_unreachable("Unexpected failure in cleanup pass pipeline.");
54+
op->walk([&](LoopLikeOpInterface loopLike) {
55+
LLVM_DEBUG(loopLike.print(llvm::dbgs() << "\nOriginal loop:\n"));
56+
if (failed(moveLoopInvariantCode(loopLike)))
57+
llvm_unreachable("unexpected LICM failure");
58+
});
5859
promoteSingleIterationLoops(cast<FuncOp>(op));
5960
hoistViewAllocOps(cast<FuncOp>(op));
6061
hoistRedundantVectorTransfers(cast<FuncOp>(op));
@@ -67,31 +68,28 @@ void mlir::linalg::CodegenStrategy::transform(FuncOp func) const {
6768
// Post staged patterns transforms
6869
//===--------------------------------------------------------------------===//
6970

70-
ModuleOp module = func->getParentOfType<ModuleOp>();
71-
7271
// Programmatic splitting of slow/fast path vector transfers.
7372
OwningRewritePatternList patterns;
7473
patterns.insert<vector::VectorTransferFullPartialRewriter>(
7574
context, vectorTransformsOptions);
76-
applyPatternsAndFoldGreedily(module, std::move(patterns));
75+
applyPatternsAndFoldGreedily(func, std::move(patterns));
7776

7877
// Programmatic controlled lowering of vector.contract only.
7978
OwningRewritePatternList vectorContractLoweringPatterns;
8079
vectorContractLoweringPatterns
8180
.insert<ContractionOpToOuterProductOpLowering,
8281
ContractionOpToMatmulOpLowering, ContractionOpLowering>(
8382
vectorTransformsOptions, context);
84-
applyPatternsAndFoldGreedily(module,
85-
std::move(vectorContractLoweringPatterns));
83+
applyPatternsAndFoldGreedily(func, std::move(vectorContractLoweringPatterns));
8684

8785
// Programmatic controlled lowering of vector.transfer only.
8886
OwningRewritePatternList vectorToLoopsPatterns;
8987
populateVectorToSCFConversionPatterns(vectorToLoopsPatterns, context,
9088
vectorToSCFOptions);
91-
applyPatternsAndFoldGreedily(module, std::move(vectorToLoopsPatterns));
89+
applyPatternsAndFoldGreedily(func, std::move(vectorToLoopsPatterns));
9290

9391
// Ensure we drop the marker in the end.
94-
module.walk([](LinalgOp op) {
92+
func.walk([](LinalgOp op) {
9593
op.removeAttr(LinalgTransforms::kLinalgTransformMarker);
9694
});
9795
}

0 commit comments

Comments
 (0)