56 changes: 27 additions & 29 deletions lib/gc/Transforms/GPU/AllocsToSLM.cpp
@@ -8,6 +8,7 @@

#include "gc/Transforms/Passes.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/TransformOps/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -80,22 +81,17 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
return rewriter.notifyMatchFailure(
allocOp, "Only support constant block sizes for now");

int64_t xI = xSz.value();
int64_t yI = ySz.value();
int64_t zI = zSz.value();

if (zI != 1)
return rewriter.notifyMatchFailure(
allocOp, "Only support 2D shared memory for now");

SmallVector<int64_t, 3> blockSizes = {xSz.value(), ySz.value(),
zSz.value()};
MemRefType originalMemRefType = cast<MemRefType>(memref.getType());
auto originalShape = originalMemRefType.getShape();

// Scale the allocation size by the number of threads in the work-group
int64_t newX = originalShape[0] * xI;
int64_t newY = originalShape[1] * yI;

SmallVector<int64_t> newShape = {newX, newY};
// Scale the allocation size (X dimension) by the number of threads in the
// work-group
int64_t newX =
originalShape[0] * blockSizes[0] * blockSizes[1] * blockSizes[2];
SmallVector<int64_t> newShape({newX});
newShape.append(originalShape.begin() + 1, originalShape.end());
Contributor Author (comment on lines -94 to +93):

The logic of scaling the allocation size by the total number of threads was reworked. We now scale only the zeroth dimension of the memref:

// num threads in the work group: X=2, Y=3, Z=4
// alloc before the pass
%slm_buff = memref.alloc() : memref<8x16xf16>

// alloc after the pass (only scaled the zero dimension)
%slm_buff_root = memref.alloc() : memref<192x16xf16, 3>
%x_offset = slm_chunk_x_size * (X_thread_idx * Y_block_size * Z_block_size + Y_thread_idx * Z_block_size + Z_thread_idx) = 8 * (X_thread_idx * 12 + Y_thread_idx * 4 + Z_thread_idx)
%slm_buff = memref.subview [%x_offset, 0] : memref<192x16xf16, 3> -> memref<8x16xf16, 3>
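
A quick numeric check of the offset formula above (a standalone sketch, not part of the pass; the helper name slmXOffset is made up for illustration): with block sizes X=2, Y=3, Z=4 and an 8x16 per-thread chunk, thread (0,0,0) starts at row 0 and the last thread (1,2,3) starts at row (1*12 + 2*4 + 3) * 8 = 184, so its chunk ends at row 192, exactly the scaled zeroth dimension.

#include <cassert>
#include <cstdint>

// Sketch of the per-thread SLM row offset:
// X_off = (Xthr * Ybl_sz * Zbl_sz + Ythr * Zbl_sz + Zthr) * Xchunk_sz
int64_t slmXOffset(int64_t xThr, int64_t yThr, int64_t zThr,
                   int64_t yBlk, int64_t zBlk, int64_t xChunk) {
  return (xThr * yBlk * zBlk + yThr * zBlk + zThr) * xChunk;
}

int main() {
  // Block sizes Y=3, Z=4; per-thread chunk of 8 rows.
  assert(slmXOffset(0, 0, 0, /*yBlk=*/3, /*zBlk=*/4, /*xChunk=*/8) == 0);
  assert(slmXOffset(1, 2, 3, /*yBlk=*/3, /*zBlk=*/4, /*xChunk=*/8) == 184); // 184 + 8 == 192
  return 0;
}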


IntegerAttr sharedAddressSpace =
IntegerAttr::get(rewriter.getIntegerType(64),
@@ -111,27 +107,29 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
allocOp.getOperands())
.getResult();

// Compute the offsets in SLM chunk for the current thread
auto origXConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
originalShape[0]);
auto origYConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
originalShape[1]);
// Compute the offsets in SLM chunk for the current thread:
// X_off = (Xthr_i * Ybl_sz * Zbl_sz + Ythr_i * Zbl_sz + Zthr_i) * Xchunk_sz
// Offsets for other dimensions = 0
auto xI = getAffineDimExpr(0, rewriter.getContext());
auto yI = getAffineDimExpr(1, rewriter.getContext());
auto zI = getAffineDimExpr(2, rewriter.getContext());
auto idxExpr =
(xI * blockSizes[1] * blockSizes[2] + yI * blockSizes[2] + zI) *
originalShape[0];
auto idxMap = AffineMap::get(/*dimCount=*/3, /*symbolCount=*/0, idxExpr);

auto threadIds = launchOp.getThreadIds();
auto offX = rewriter.create<affine::AffineApplyOp>(
allocOp.getLoc(), idxMap,
/*exprOperands=*/ValueRange({threadIds.x, threadIds.y, threadIds.z}));

auto offX =
rewriter
.create<arith::MulIOp>(allocOp.getLoc(), threadIds.x, origXConst)
.getResult();
auto offY =
rewriter
.create<arith::MulIOp>(allocOp.getLoc(), threadIds.y, origYConst)
.getResult();
SmallVector<int64_t> staticOffsets({ShapedType::kDynamic});
staticOffsets.insert(staticOffsets.end(), originalShape.size() - 1, 0);

auto offsets = getMixedValues({ShapedType::kDynamic, ShapedType::kDynamic},
{offX, offY}, rewriter);
auto offsets = getMixedValues(staticOffsets, {offX}, rewriter);
auto sizes = getMixedValues(originalShape, {}, rewriter);
auto strides = getMixedValues({1, 1}, {}, rewriter);
auto strides = getMixedValues(SmallVector<int64_t>(originalShape.size(), 1),
{}, rewriter);

auto newSlice =
rewriter
29 changes: 27 additions & 2 deletions lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
@@ -62,6 +62,28 @@ static Value createFullMask(PatternRewriter &rewriter, Location loc,
return res.getResult();
}

// Extracts the offsets from a subview operation as values.
// The difference from mlir::getMixedOffsets is that this function
// returns the offsets as mlir::Value, so they can be used directly as
// operands of other mlir::Operations.
static SmallVector<Value> extractOffsetsAsValues(PatternRewriter &rewriter,
Location loc,
memref::SubViewOp subview) {
SmallVector<Value> offsetValues;
auto staticOffsets = subview.getStaticOffsets();
auto dynamicOffsets = subview.getOffsets();
size_t dynIdx = 0;
for (size_t i = 0; i < staticOffsets.size(); i++) {
if (staticOffsets[i] == ShapedType::kDynamic)
offsetValues.push_back(dynamicOffsets[dynIdx++]);
else
offsetValues.push_back(
rewriter.create<arith::ConstantIndexOp>(loc, staticOffsets[i]));
}

return offsetValues;
}

// Max number of elements to load/store from SLM
constexpr int64_t maxSLMTileSize = 32;

@@ -841,8 +863,11 @@ static SmallVector<Value> createSLMDescTiles(PatternRewriter &rewriter,
// GPU kernel. We have to merge the subview offsets into the descriptor
// offset.
if (auto subView = dyn_cast<memref::SubViewOp>(src.getDefiningOp())) {
auto xIntOffs = subView.getOffsets()[0];
auto yIntOffs = subView.getOffsets()[1];
auto offsets = extractOffsetsAsValues(rewriter, loc, subView);
assert(offsets.size() == 2 && "Expected 2D subview offsets");

auto xIntOffs = offsets[0];
auto yIntOffs = offsets[1];

// compute 'blockOffset' (beginning of the subview block in the original
// flat memref)
43 changes: 43 additions & 0 deletions test/mlir/test/gc/Transforms/GPU/allocs-to-slm-1d.mlir
@@ -0,0 +1,43 @@
// RUN: gc-opt %s --allocs-to-slm | FileCheck %s

// Compute thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 256)>

func.func @entry() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index

// Memory space wasn't assigned as it's allocated outside of gpu.launch block
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<256xf16>
%0 = memref.alloc() : memref<256xf16>
// Capture thread-id variables
// CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
// Memory space was changed as it's explicitly specified
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<256xf16, 1>
%1 = memref.alloc() : memref<256xf16, 1>
// Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 256 = 6144)
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<6144xf16, 3>
// CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]]] [256] [1]
// CHECK-SAME: memref<6144xf16, 3> to memref<256xf16, strided<[1], offset: ?>, 3>
%2 = memref.alloc() : memref<256xf16>

// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
// CHECK-SAME: memref<256xf16, 1>, memref<256xf16, strided<[1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<256xf16>)
linalg.add ins(%1, %2 :memref<256xf16, 1>, memref<256xf16>) outs(%0 : memref<256xf16>)
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<256xf16, 1>
// Verify that there are no deallocs for SLM
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
memref.dealloc %1 : memref<256xf16, 1>
memref.dealloc %2 : memref<256xf16>
gpu.terminator
}
return
}
@@ -1,32 +1,35 @@
// RUN: gc-opt %s --allocs-to-slm | FileCheck %s

// Compute thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 16)>

func.func @entry() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index

// Memory space wasn't assigned as it's allocated outside of gpu.launch block
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<16x32xf16>
%0 = memref.alloc() : memref<16x32xf16>
// Capture thread-id variables
// CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[ARG5:.+]]) in
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c4, %[[ARG11:.+]] = %c1) {
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c4, %sz_tz = %c1) {
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
// Memory space was changed as it's explicitly specified
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<16x32xf16, 1>
%1 = memref.alloc() : memref<16x32xf16, 1>
// Added 'shared' memory space
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<32x128xf16, 3>
// CHECK: %[[OFF_X:.*]] = arith.muli %[[THREAD_X]], %c16 : index
// CHECK: %[[OFF_Y:.*]] = arith.muli %[[THREAD_Y]], %c32 : index
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], %[[OFF_Y]]] [16, 32] [1, 1]
// CHECK-SAME: memref<32x128xf16, 3> to memref<16x32xf16, strided<[128, 1], offset: ?>, 3>
// Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 16 = 384)
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<384x32xf16, 3>
// CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], 0] [16, 32] [1, 1]
// CHECK-SAME: memref<384x32xf16, 3> to memref<16x32xf16, strided<[32, 1], offset: ?>, 3>
%2 = memref.alloc() : memref<16x32xf16>

// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
// CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[128, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
// CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[32, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
linalg.add ins(%1, %2 :memref<16x32xf16, 1>, memref<16x32xf16>) outs(%0 : memref<16x32xf16>)
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<16x32xf16, 1>
// Verify that there are no deallocs for SLM
43 changes: 43 additions & 0 deletions test/mlir/test/gc/Transforms/GPU/allocs-to-slm-4d.mlir
@@ -0,0 +1,43 @@
// RUN: gc-opt %s --allocs-to-slm | FileCheck %s

// Compute thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 2)>

func.func @entry() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index

// Memory space wasn't assigned as it's allocated outside of gpu.launch block
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<2x3x16x32xf16>
%0 = memref.alloc() : memref<2x3x16x32xf16>
// Capture thread-id variables
// CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
// Memory space was changed as it's explicitly specified
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<2x3x16x32xf16, 1>
%1 = memref.alloc() : memref<2x3x16x32xf16, 1>
// Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 2 = 48)
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<48x3x16x32xf16, 3>
// CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], 0, 0, 0] [2, 3, 16, 32] [1, 1, 1, 1]
// CHECK-SAME: memref<48x3x16x32xf16, 3> to memref<2x3x16x32xf16, strided<[1536, 512, 32, 1], offset: ?>, 3>
%2 = memref.alloc() : memref<2x3x16x32xf16>

// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
// CHECK-SAME: memref<2x3x16x32xf16, 1>, memref<2x3x16x32xf16, strided<[1536, 512, 32, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<2x3x16x32xf16>)
linalg.add ins(%1, %2 :memref<2x3x16x32xf16, 1>, memref<2x3x16x32xf16>) outs(%0 : memref<2x3x16x32xf16>)
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<2x3x16x32xf16, 1>
// Verify that there are no deallocs for SLM
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
memref.dealloc %1 : memref<2x3x16x32xf16, 1>
memref.dealloc %2 : memref<2x3x16x32xf16>
gpu.terminator
}
return
}