[mlir][linalg] unfold projected permutation. #114704
Changes from 3 commits
New file: UnfoldProjectedPermutation.cpp
@@ -0,0 +1,272 @@
//===- UnfoldProjectedPermutation.cpp - unfold projected permutations ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pattern to decompose an operand of a GenericOp that
// has a `transpose+broadcast` folded into its affine map into separate
// transpose and broadcast ops.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include <utility>

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include <map>
#include <optional>
#include <vector>

using namespace mlir;
using namespace mlir::linalg;

namespace {

/// A projected permutation is effectively a mixture of transpose and
/// broadcast folded into the affine map of an operand. While folding
/// transpose and broadcast into the affine map of a linalg.generic
/// operand is a very effective optimization, sometimes we may want to
/// unfold it, for instance when recognizing named ops.
///
/// Example
///
/// ```mlir
///
/// #projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>
/// #identity = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
/// ...
/// %res = linalg.generic
///    { indexing_maps = [#projection, #identity, #identity],
///      iterator_types = ["parallel", "parallel", "parallel",
///                        "parallel", "parallel"]}
///    ins(%x, %y : tensor<7x8x9xf32>, tensor<5x9x7x8x10xf32>)
///    outs(%z : tensor<5x9x7x8x10xf32>) {
///      ^bb0(%in: f32, %in_1: f32, %out: f32):
///        %div = arith.divf %in, %in_1 : f32
///        linalg.yield %div : f32
///    } -> tensor<5x9x7x8x10xf32>
/// ```
///
/// In the above IR, the map of operand `%x` is a projected permutation.
/// It can be unfolded as:
///
/// ```mlir
/// ...
/// %x_trans = linalg.transpose
///              ins(%x : tensor<7x8x9xf32>)
///              outs(%e1 : tensor<9x7x8xf32>) permutation = [2, 0, 1]
/// ...
/// %x_trans_bc = linalg.broadcast
///                 ins(%x_trans : tensor<9x7x8xf32>)
///                 outs(%e2 : tensor<5x9x7x8x10xf32>) dimensions = [0, 4]
/// %2 = linalg.div
///         ins(%x_trans_bc, %y :
///                  tensor<5x9x7x8x10xf32>, tensor<5x9x7x8x10xf32>)
///         outs(%arg2 : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32>
/// ```
///
/// Note that the linalg.generic has been 'specialized' to linalg.div.
///
/// When unfolding, it is more effective to transpose first and then
/// broadcast. However, if the transpose is done first, the permutation
/// map needs to be expressed in terms of the reduced dimensions (as the
/// broadcast hasn't happened yet). Also, the broadcast dimensions of a
/// linalg.generic operand are implied by the other operands (those not
/// broadcast along that particular dimension). We work this out by
/// computing the polytope shape of the linalg.generic from the shapes of
/// all the operands (inputs and outputs).

struct UnfoldProjectedPermutation : public OpRewritePattern<GenericOp> {
  using OpRewritePattern<GenericOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(GenericOp genericOp,
                                PatternRewriter &rewriter) const override;
};

/// Calculate the shape (dimensions) of the iteration-space polytope.
/// This is calculated by concatenating the indexing maps of all operands
/// of the generic, inverting the concatenation, concatenating the shapes
/// of all the operands, and then applying the inverted map to the
/// concatenated shapes.
SmallVector<int64_t> getPolytopeDims(GenericOp op) {
  assert(op.hasPureTensorSemantics() && "works only on tensors");

  /// Concatenate the indexing maps of all operands and invert the mapping.
  auto maps = op.getIndexingMapsArray();
  auto concat = concatAffineMaps(maps);
  auto inverse = inversePermutation(concat);

  /// Concatenate the sizes of all dimensions of all operands.
  SmallVector<int64_t> dims;
  for (auto &operand : op->getOpOperands()) {
    auto rankedType = cast<RankedTensorType>(operand.get().getType());
    for (auto size : rankedType.getShape())
      dims.push_back(size);
  }

  /// Match the inverse map with dims to get the polytope dimensions.
  /// Note that some may be 'kDynamic'.
  return applyPermutationMap<int64_t>(inverse, dims);
}

/// For the given `map`, determine which dimensions are transposed
/// and which dimensions are broadcast.
/// Returns:
///   `isTransposed, isBroadcast,
///    transpose-permutation, broadcast-dimensions`
///
std::tuple<bool, bool, SmallVector<int64_t>, SmallVector<int64_t>>
computeTransposeBroadcast(AffineMap &map) {
  assert(map.isProjectedPermutation(false) && "not a projection");

  // Dimensions that don't appear in the result are broadcast.
  int64_t minorSize = map.getNumResults();

  // Convert the affine dim exprs to int64_t positions.
  SmallVector<int64_t> minorResult;
  for (int64_t i = 0; i < minorSize; ++i) {
    auto expr = cast<AffineDimExpr>(map.getResults()[i]);
    minorResult.push_back(expr.getPosition());
  }

  // If the dims are not monotonically increasing then a transpose is present.
  SmallVector<int64_t> sorted(minorResult);
  std::sort(sorted.begin(), sorted.end());
  bool hasTranspose = !std::equal(minorResult.begin(), minorResult.end(),
                                  sorted.begin(), sorted.end());

  // Walk the sorted map result to determine which dimensions are broadcast.
  SmallVector<int64_t> broadcast;
  for (int64_t i = 0, j = 0; i < map.getNumInputs(); ++i) {
    if (j < minorSize && sorted[j] == i) {
      j++;
      continue;
    }
    broadcast.push_back(i);
  }
  bool hasBroadcast = broadcast.size();

Suggested change:
-  bool hasBroadcast = broadcast.size();
+  bool hasBroadcast = !broadcast.empty();

Review comment: Intuitively this makes sense, but ... why? 😅 Which part would break?
Review comment: Ping
Review comment: Could this work at all for dynamic shapes?
Reply: For a start, this will assert when trying to create tensor.empty with a dynamic shape: https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp#L874
Review comment: OK, rather than documenting what the code does, could you add a comment saying "why"? Or what's missing? From what you are saying, we'd need to add logic to compute the dynamic sizes of the input tensors for ops like EmptyOp? And probably sth else as well?
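
For intuition, here is a minimal standalone sketch of the polytope computation in `getPolytopeDims`, worked through on the doc-comment example. This is illustrative only: plain C++ with no MLIR dependencies, mimicking what `concatAffineMaps`, `inversePermutation`, and `applyPermutationMap` do on hard-coded vectors.

```cpp
// Sketch: recover the iteration-space ("polytope") sizes by inverting the
// concatenated map results and indexing into the concatenated operand shapes.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Map results as dim positions: %x:(d2,d3,d1); %y and %z: identity d0..d4.
  std::vector<int64_t> concatResults = {2, 3, 1,
                                        0, 1, 2, 3, 4,
                                        0, 1, 2, 3, 4};
  // Operand shapes concatenated in the same order.
  std::vector<int64_t> concatShapes = {7, 8, 9,
                                       5, 9, 7, 8, 10,
                                       5, 9, 7, 8, 10};
  // "Inverse permutation": for each iteration dim d, take the operand size
  // at any result position that maps to d.
  std::vector<int64_t> polytope(5, -1);
  for (size_t pos = 0; pos < concatResults.size(); ++pos)
    polytope[concatResults[pos]] = concatShapes[pos];
  for (int64_t s : polytope)
    std::cout << s << ' '; // prints: 5 9 7 8 10
  std::cout << '\n';
}
```

The result, 5x9x7x8x10, matches the iteration space of the generic, which is why the `outs` tensor of the unfolded `linalg.broadcast` is `tensor<5x9x7x8x10xf32>`.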
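Similarly, the transpose/broadcast detection in `computeTransposeBroadcast` can be sketched on plain vectors. Again illustrative only; `numInputs` and `results` are hard-coded for the `(d0, d1, d2, d3, d4) -> (d2, d3, d1)` example.

```cpp
// Sketch: a transpose exists iff the result dims are out of order; the
// broadcast dims are the iteration dims that never appear in the results.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  int64_t numInputs = 5;                    // iteration dims d0..d4
  std::vector<int64_t> results = {2, 3, 1}; // results of %x's indexing map

  // A transpose is present iff the dims are not monotonically increasing.
  std::vector<int64_t> sorted(results);
  std::sort(sorted.begin(), sorted.end());
  bool hasTranspose = results != sorted;

  // Walk the sorted results; iteration dims we skip over are broadcast.
  std::vector<int64_t> broadcast;
  for (int64_t i = 0, j = 0; i < numInputs; ++i) {
    if (j < static_cast<int64_t>(sorted.size()) && sorted[j] == i) {
      ++j;
      continue;
    }
    broadcast.push_back(i);
  }

  std::cout << "hasTranspose=" << hasTranspose << " broadcast=";
  for (int64_t d : broadcast)
    std::cout << d << ' '; // prints: 0 4
  std::cout << '\n';
}
```

This matches the unfolded IR above: `dimensions = [0, 4]` on the `linalg.broadcast`, plus a transpose because `(2, 3, 1)` is out of order.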

New test file:
@@ -0,0 +1,71 @@
// RUN: mlir-opt %s -split-input-file --linalg-specialize-generic-ops | FileCheck %s

#projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>
#identity = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>

func.func @transpose_and_broadcast(%x : tensor<7x8x9xf32>, %y: tensor<5x9x7x8x10xf32>, %z : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> {
  %res = linalg.generic
     { indexing_maps = [#projection, #identity, #identity], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
     ins(%x, %y : tensor<7x8x9xf32>, tensor<5x9x7x8x10xf32>) outs(%z : tensor<5x9x7x8x10xf32>) {
       ^bb0(%in: f32, %in_1: f32, %out: f32):
         %div = arith.divf %in, %in_1 : f32
         linalg.yield %div : f32
     } -> tensor<5x9x7x8x10xf32>
  return %res : tensor<5x9x7x8x10xf32>
}

// CHECK-LABEL: transpose_and_broadcast
// CHECK-SAME: %[[X:.+]]: tensor<7x8x9xf32>, %[[Y:.+]]: tensor<5x9x7x8x10xf32>, %[[Z:.+]]: tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> {
// CHECK: %[[E0:.+]] = tensor.empty() : tensor<9x7x8xf32>
// CHECK: %[[X_trans:.+]] = linalg.transpose ins(%[[X]] : tensor<7x8x9xf32>) outs(%[[E0]] : tensor<9x7x8xf32>) permutation = [2, 0, 1]

Review thread on the permutation above:

Contributor: I would have expected this to be ...

Contributor: Hm, I managed to convince myself that this is correct, but please double-check for yourself 😅 @MaheshRavishankar, you might be skewed by:
  #projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>
I think this is the trick (IIUC, this is the actual mapping here): ...
Whereas you assume that: ...
Does it make sense?

Author: Yes of course you are right! Two parts to this. ... Therefore, for input ...
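To double-check the thread above: the CHECK lines themselves show that `permutation = [2, 0, 1]` turns `tensor<7x8x9xf32>` into `tensor<9x7x8xf32>`, i.e. output dimension `i` takes input dimension `permutation[i]`. A tiny standalone sketch (illustrative only, not part of the patch):

```cpp
// Sketch: apply a linalg.transpose-style permutation to a static shape,
// using outShape[i] = inShape[permutation[i]].
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> inShape = {7, 8, 9};
  std::vector<int64_t> perm = {2, 0, 1};  // permutation from the CHECK line
  std::vector<int64_t> outShape;
  for (int64_t p : perm)
    outShape.push_back(inShape[p]);       // out dim i <- in dim perm[i]
  for (int64_t s : outShape)
    std::cout << s << ' ';                // prints: 9 7 8
  std::cout << '\n';
}
```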
// CHECK: %[[E1:.+]] = tensor.empty() : tensor<5x9x7x8x10xf32>
// CHECK: %[[X_trans_bc:.+]] = linalg.broadcast ins(%[[X_trans]] : tensor<9x7x8xf32>) outs(%[[E1]] : tensor<5x9x7x8x10xf32>) dimensions = [0, 4]
// CHECK: {{.*}} = linalg.div ins(%[[X_trans_bc]], %[[Y]] : tensor<5x9x7x8x10xf32>, tensor<5x9x7x8x10xf32>) outs(%[[Z]] : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32>
// CHECK-NOT: linalg.generic

// -----

#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#transposed = affine_map<(d0, d1, d2) -> (d2, d0, d1)>

func.func @transpose_only(%x : tensor<32x2x16xf32>, %y: tensor<2x16x32xf32>, %z : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
  %res = linalg.generic
     { indexing_maps = [#transposed, #identity, #identity], iterator_types = ["parallel", "parallel", "parallel"]}
     ins(%x, %y : tensor<32x2x16xf32>, tensor<2x16x32xf32>)
     outs(%z : tensor<2x16x32xf32>) {
       ^bb0(%in: f32, %in_1: f32, %out: f32):
         %div = arith.divf %in, %in_1 : f32
         linalg.yield %div : f32
     } -> tensor<2x16x32xf32>
  return %res : tensor<2x16x32xf32>
}

// CHECK-LABEL: transpose_only
// CHECK-SAME: %[[X:.+]]: tensor<32x2x16xf32>, %[[Y:.+]]: tensor<2x16x32xf32>, %[[Z:.+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
// CHECK: %[[E0:.+]] = tensor.empty() : tensor<2x16x32xf32>
// CHECK: %[[X_trans:.+]] = linalg.transpose ins(%[[X]] : tensor<32x2x16xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) permutation = [1, 2, 0]
// CHECK: {{.*}} = linalg.div ins(%[[X_trans]], %[[Y]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%[[Z]] : tensor<2x16x32xf32>) -> tensor<2x16x32xf32>
// CHECK-NOT: linalg.generic

// -----

#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#broadcast = affine_map<(d0, d1, d2) -> (d0, d2)>

func.func @broadcast_only(%x : tensor<2x16x32xf32>, %y: tensor<2x32xf32>, %z : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
  %res = linalg.generic
     { indexing_maps = [#identity, #broadcast, #identity], iterator_types = ["parallel", "parallel", "parallel"]}
     ins(%x, %y : tensor<2x16x32xf32>, tensor<2x32xf32>)
     outs(%z : tensor<2x16x32xf32>) {
       ^bb0(%in: f32, %in_1: f32, %out: f32):
         %div = arith.divf %in, %in_1 : f32
         linalg.yield %div : f32
     } -> tensor<2x16x32xf32>
  return %res : tensor<2x16x32xf32>
}

// CHECK-LABEL: broadcast_only
// CHECK-SAME: %[[X:.+]]: tensor<2x16x32xf32>, %[[Y:.+]]: tensor<2x32xf32>, %[[Z:.+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
// CHECK: %[[E0:.+]] = tensor.empty() : tensor<2x16x32xf32>
// CHECK: %[[X_bc:.+]] = linalg.broadcast ins(%[[Y]] : tensor<2x32xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) dimensions = [1]
// CHECK: {{.*}} = linalg.div ins(%[[X]], %[[X_bc]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%arg2 : tensor<2x16x32xf32>) -> tensor<2x16x32xf32>
// CHECK-NOT: linalg.generic